def construct_folders(self): """Create the folder structure""" """ new_dataset_id └── TRAIN ├── dataset_TRAIN │ ├── datasetDoc.json │ └── tables XXX │ └── learningData.csv └── problem_TRAIN XXX ├── dataSplits.csv └── problemDoc.json """ if self.has_error(): return False d3m_config = self.user_workspace.d3m_config #et_latest_d3m_config() if not d3m_config: user_msg = 'Latest D3M configuration not found. (construct_folders)' self.send_websocket_err_msg(user_msg) return False if (not d3m_config.additional_inputs) or \ (not isdir(d3m_config.additional_inputs)): user_msg = ('Additional inputs folder does not exist! %s') % \ (d3m_config.additional_inputs,) self.send_websocket_err_msg(user_msg) return False # --------------------------------------- # Create the problem_TRAIN # --------------------------------------- self.dataset_root_dir = join(d3m_config.additional_inputs, self.dataset_id) self.problem_dir = join(self.dataset_root_dir, 'TRAIN', 'problem_TRAIN') LOGGER.info(' - dataset_root_dir: %s', self.dataset_root_dir) LOGGER.info(' - problem_dir: %s', self.problem_dir) dir_info = create_directory(self.problem_dir) if not dir_info.success: self.send_websocket_err_msg(dir_info.err_msg) return False # --------------------------------------- # Create the tables dir # --------------------------------------- self.tables_dir = join(self.dataset_root_dir, 'TRAIN', 'dataset_TRAIN', 'tables') LOGGER.info(' - tables_dir: %s', self.tables_dir) dir_info = create_directory(self.tables_dir) if not dir_info.success: self.send_websocket_err_msg(dir_info.err_msg) return False self.dataset_dir = dirname(self.tables_dir) return True
def construct_folders(self): """Create the folder structure + D3MConfig object""" """ new_dataset_id └── TRAIN ├── dataset_TRAIN │ ├── datasetDoc.json │ └── tables XXX │ └── learningData.csv └── problem_TRAIN XXX ├── dataSplits.csv └── problemDoc.json """ if self.has_error(): return False if not isdir(self.writable_output_dir): user_msg = f'UserDatasetUtil. This directory does not exist: {self.writable_output_dir}' self.send_websocket_err_msg(user_msg) return False self.dataset_id = slugify(self.dataset_name[:15] + '-' + get_alpha_string(4)) self.dataset_root_dir = join(self.writable_output_dir, self.dataset_id) # --------------------------------------- # Create the problem_TRAIN directory # --------------------------------------- self.problem_dir = join(self.dataset_root_dir, 'TRAIN', 'problem_TRAIN') LOGGER.info(' - dataset_root_dir: %s', self.dataset_root_dir) LOGGER.info(' - problem_dir: %s', self.problem_dir) dir_info = create_directory(self.problem_dir) if not dir_info.success: self.send_websocket_err_msg(dir_info.err_msg) return False # --------------------------------------- # Create the tables dir # --------------------------------------- self.dataset_tables_dir = join(self.dataset_root_dir, 'TRAIN', 'dataset_TRAIN', 'tables') dir_info = create_directory(self.dataset_tables_dir) if not dir_info.success: self.send_websocket_err_msg(dir_info.err_msg) return False self.dataset_dir = dirname(self.dataset_tables_dir) return True
def check_build_output_directories(d3m_config):
    """Used when setting a new d3m_config:
    - check if the output directories exist
    - build them if they don't"""
    if not isinstance(d3m_config, D3MConfiguration):
        return err_resp('d3m_config must be a D3MConfiguration object')

    temp_path = None
    output_path = d3m_config.env_values.get(d3m_static.KEY_D3MOUTPUTDIR)
    if output_path:
        temp_path = join(output_path, 'temp')

    paths_to_check = [output_path,
                      temp_path,
                      d3m_config.env_values.get(d3m_static.KEY_D3MLOCALDIR),
                      d3m_config.env_values.get(d3m_static.KEY_D3MSTATICDIR)]

    paths_to_build = [x for x in paths_to_check
                      if x and not isdir(x)]

    fail_info = []
    for build_path in paths_to_build:
        path_info = create_directory(build_path)
        if path_info.success:
            print('directory created: ', build_path)
        else:
            err_msg = 'Failed to build directory: %s' % (path_info.err_msg,)
            fail_info.append(err_msg)

    if fail_info:
        return err_resp('\n'.join(fail_info))

    return ok_resp('looks good')
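# Hypothetical usage sketch for the check above, e.g. run when a new
# configuration is activated.  `get_latest_d3m_config` and the LOGGER name are
# assumptions; `check_build_output_directories` returns the usual
# ok_resp/err_resp object with `.success` and `.err_msg`.
def _sketch_ensure_output_dirs():
    d3m_config = get_latest_d3m_config()
    if not d3m_config:
        LOGGER.error('No D3M configuration found; nothing to check.')
        return False

    check_info = check_build_output_directories(d3m_config)
    if not check_info.success:
        LOGGER.error('Output directory check failed: %s', check_info.err_msg)
        return False

    return True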
def save_datamart_file(data_foldername, file_data, **kwargs):
    """Save the materialize response as a file.  This should be a .zip
    containing both a datafile and a datasetDoc.json"""
    if not file_data:
        return err_resp('"file_data" must be specified')

    # create directory if it doesn't exist
    # (OK if the directory already exists)
    #
    dir_info = create_directory(data_foldername)
    if not dir_info.success:
        return err_resp(dir_info.err_msg)

    try:
        with zipfile.ZipFile(BytesIO(file_data.content), 'r') as data_zip:
            data_zip.extractall(data_foldername)
    except (RuntimeError, zipfile.BadZipFile) as err_obj:
        user_msg = (f'Failed to extract zip to "{data_foldername}".'
                    f' Error: {err_obj}')
        return err_resp(user_msg)

    # Make sure that learningData.csv exists
    #
    data_filepath = join(data_foldername, 'tables', 'learningData.csv')
    if not isfile(data_filepath):
        user_msg = ('File "learningData.csv" not found in expected'
                    ' place: %s') % data_filepath
        return err_resp(user_msg)

    # Make sure that the datasetDoc.json exists
    #
    datasetdoc_path = join(data_foldername, 'datasetDoc.json')
    if not isfile(datasetdoc_path):
        user_msg = ('File datasetDoc.json not found in'
                    ' expected place: %s') % datasetdoc_path
        return err_resp(user_msg)

    expected_filepath = kwargs.get('expected_filepath', None)
    if expected_filepath:
        if expected_filepath != data_filepath:
            user_msg = 'File not found on expected path: %s' % expected_filepath
            return err_resp(user_msg)

    return ok_resp({
        dm_static.KEY_DATA_PATH: data_filepath,
        dm_static.KEY_DATASET_DOC_PATH: datasetdoc_path})
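# Hypothetical usage sketch: `file_data` is treated above as a
# requests.Response (note the `.content` attribute), so a caller might look
# roughly like this.  The URL handling and folder name are placeholders.
def _sketch_materialize_and_save(datamart_url, data_foldername):
    import requests

    response = requests.get(datamart_url)
    if response.status_code != 200:
        return err_resp(f'Materialize request failed: HTTP {response.status_code}')

    # On success, the returned dict holds the learningData.csv and
    # datasetDoc.json paths under the dm_static keys.
    return save_datamart_file(data_foldername, response)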
def create_destination_directory(user_workspace, name):
    """Used to add a write directory for the partials app"""
    if not isinstance(user_workspace, UserWorkspace):
        return err_resp(
            'Error "user_workspace" must be a UserWorkspace object.')

    # build destination path for partials app
    dest_dir_path = os.path.join(user_workspace.d3m_config.additional_inputs,
                                 name,
                                 f'ws_{user_workspace.id}',
                                 get_timestamp_string())

    new_dir_info = create_directory(dest_dir_path)
    if not new_dir_info.success:
        return {KEY_SUCCESS: False,
                KEY_DATA: f'{new_dir_info.err_msg} ({dest_dir_path})'}

    return {KEY_SUCCESS: True,
            KEY_DATA: dest_dir_path}
def create_image_output_dir(user_workspace=None):
    """Create an image output dir based on a user workspace

    For DEMO: TEMP write this to staticfiles
    """
    if not isinstance(user_workspace, UserWorkspace):
        user_workspace_id = random_info.get_digits_string(3)
        # return {KEY_SUCCESS: False,
        #         KEY_MESSAGE: 'user_workspace is not a "UserWorkspace" object'}
    else:
        user_workspace_id = user_workspace.id

    output_path = join(
        get_output_path_base(),
        im_static.IMAGE_MARKUP_DIR_NAME,
        f'{user_workspace_id}-{random_info.get_alphanumeric_lowercase(4)}',
        random_info.get_timestamp_string())

    dir_info = create_directory(output_path)
    if not dir_info.success:
        return {KEY_SUCCESS: False,
                KEY_MESSAGE: dir_info.err_msg}

    return {KEY_SUCCESS: True,
            KEY_DATA: dir_info.result_obj}
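# Hypothetical usage sketch: callers check KEY_SUCCESS in the returned dict
# before using the new output path stored under KEY_DATA.
def _sketch_get_image_output_dir(user_workspace):
    dir_info = create_image_output_dir(user_workspace)
    if not dir_info[KEY_SUCCESS]:
        LOGGER.error('Image output dir not created: %s',
                     dir_info.get(KEY_MESSAGE))
        return None

    return dir_info[KEY_DATA]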
def get_write_directory(self, kwargs_write_dir):
    """Determine the write directory"""
    if self.has_error():
        return None

    # Was it sent as a kwarg?
    if kwargs_write_dir and isdir(kwargs_write_dir):
        return kwargs_write_dir

    if self.user_workspace:
        # Use the d3m_config connected to the user workspace
        #
        output_dir = self.user_workspace.d3m_config.root_output_directory
        output_dir = join(output_dir, 'problems')
    else:
        # Use the default/hard-coded directory
        #
        output_dir = OUTPUT_PROBLEMS_DIR

    dir_info = create_directory(output_dir)
    if dir_info.success:
        return dir_info.result_obj

    self.add_err_msg(dir_info.err_msg)
    return None
def view_upload_dataset(request):
    """Upload dataset and metadata"""
    print('FILE_UPLOAD_MAX_MEMORY_SIZE:', settings.FILE_UPLOAD_MAX_MEMORY_SIZE)

    user_workspace_info = get_latest_user_workspace(request)
    if not user_workspace_info.success:
        return JsonResponse(get_json_error(user_workspace_info.err_msg))
    user_workspace = user_workspace_info.result_obj

    # Destination directory for learningData.csv, learningData#.csv, etc.
    # and about.json
    #
    dest_dir_info = create_directory_add_timestamp(
        join(settings.TWORAVENS_USER_DATASETS_DIR,
             f'uploads_{user_workspace.user.id}',
             get_alpha_string(6)))
    if not dest_dir_info.success:
        return JsonResponse(get_json_error(dest_dir_info.err_msg))
    dest_directory = dest_dir_info.result_obj

    print('view_upload_dataset. dest_directory', dest_directory)

    # Read the metadata (about.json content)
    #
    json_info = json_loads(request.POST.get('metadata'))
    if not json_info.success:
        return JsonResponse(get_json_error(json_info.err_msg))

    # save json data
    dataset_name = None
    if dp_static.DATASET_NAME_FROM_UI in json_info.result_obj:
        dataset_name = json_info.result_obj[dp_static.DATASET_NAME_FROM_UI]

    # with open(os.path.join(dest_directory, 'about.json'), 'w') as metadata_file:
    #     json.dump(json_info.result_obj, metadata_file)

    # Save the data files.  They don't have to be .csv; that's handled later,
    # e.g. convert from .tab, .tsv, .xls, etc.
    #
    for idx, file in enumerate(request.FILES.getlist('files')):
        print(file.name)
        _fname, fext = splitext(file.name)

        if fext.lower() not in dp_static.VALID_EXTENSIONS:
            # no usable extension found, won't be able to open it
            user_msg = (
                f'The extension for this file was not recognized: "{file.name}".'
                f' Valid extensions: {", ".join(dp_static.VALID_EXTENSIONS)}.')
            return JsonResponse(get_json_error(user_msg))

        new_filename = join(
            dest_directory,
            f'learningData{idx + 1 if idx else ""}{fext.lower()}')

        with open(new_filename, 'wb+') as outfile:
            for chunk in file.chunks():
                outfile.write(chunk)

    print('dest_directory', dest_directory)

    # Create new dataset folders/etc
    #
    additional_inputs_dir = user_workspace.d3m_config.additional_inputs
    created = create_directory(additional_inputs_dir)
    if not created.success:
        return JsonResponse(get_json_error(created.err_msg))

    new_dataset_info = UserDatasetUtil.make_new_dataset(
        user_workspace.user.id,
        dest_directory,
        settings.TWORAVENS_USER_DATASETS_DIR,
        **{dp_static.DATASET_NAME: dataset_name})
    if not new_dataset_info.success:
        return JsonResponse(get_json_error(new_dataset_info.err_msg))

    # udu = UserDatasetUtil(1, input_files, output_dir)

    return JsonResponse(get_json_success('file upload completed successfully'))
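# Hypothetical client-side sketch of the request this view expects: one or more
# data files under the "files" key plus a JSON string in the "metadata" field.
# The endpoint URL and the metadata key for the dataset name are assumptions;
# authentication (which supplies the user workspace) is not shown.
def _sketch_upload_dataset(upload_url, csv_path, dataset_name):
    import json
    import requests

    metadata = {'datasetName': dataset_name}  # key name is an assumption
    with open(csv_path, 'rb') as infile:
        resp = requests.post(upload_url,
                             data={'metadata': json.dumps(metadata)},
                             files=[('files', infile)])
    return resp.json()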
def make_doc(self):
    """Create the docs"""
    if self.has_error():
        return

    dataset_id = self.about['datasetName'].replace(' ', '_')

    # construct a mapping of input paths to output paths
    inout_data_paths = OrderedDict()

    print('-- Iterate through input files --')
    for src_data_path in self.input_data_paths:
        offset = 1
        print('src_data_path', src_data_path)
        file_ext = splitext(src_data_path)[1].lower()
        if file_ext not in dp_static.VALID_EXTENSIONS:
            print(' -> Invalid extension, skipping: ', file_ext)
            continue

        # Set the output file name: learningData.csv, learningData_01.csv, etc.
        filename = 'learningData'
        candidate_name = join('tables', filename + '.csv')
        while candidate_name in inout_data_paths.values():
            offset += 1
            offset_str = f'_{str(offset).zfill(2)}'
            # _name, extension = os.path.splitext(os.path.basename(src_data_path))
            candidate_name = join('tables', f'{filename}{offset_str}.csv')

        inout_data_paths[src_data_path] = candidate_name
        print(' -> post-conversion name:', candidate_name)

    print('inout_data_paths', inout_data_paths)

    def infer_roles(column_name):
        """Infer column role"""
        roles = []
        if column_name == 'd3mIndex':
            roles.append('index')
        elif column_name in self.targets:
            roles.append('suggestedTarget')
        else:
            roles.append('attribute')

        if column_name in self.problem.get('time', []):
            roles.append('timeIndicator')
        return roles

    target_configs = []

    # individually load, index, analyze, and save each dataset
    resource_configs = []

    # Iterate through input files / proposed output files
    #  - Open the input file and write it as a .csv
    #  - From each input file, gather information for the dataset doc
    #
    for input_path, output_data_path in inout_data_paths.items():
        # print('Doc Maker 3: Attempt to read:', input_path)
        data_info = self.d3m_load_resource(input_path)
        if not data_info.success:
            self.add_err_msg(data_info.err_msg)
            return

        data = data_info.result_obj
        if not isinstance(data, pd.DataFrame):
            user_msg = (f'Failed to load the file into a'
                        f' data frame: {input_path}')
            self.add_err_msg(user_msg)
            return

        resourceID = splitext(basename(input_path))[0]

        columnConfigs = []
        for colIndex, (colName, colType) in enumerate(
                zip(data.columns.values, data.dtypes)):
            columnConfig = {
                'colIndex': colIndex,
                'colName': colName,
                'colType': dp_static.DTYPES.get(str(colType), None) or 'unknown',
                'role': infer_roles(colName)
            }
            columnConfigs.append(columnConfig)

            if columnConfig['role'][0] == 'suggestedTarget':
                target_configs.append({
                    'resID': resourceID,
                    'colIndex': colIndex,
                    'colName': colName
                })

        # output_data_path = join('tables', 'learningData.csv')
        resource_configs.append({
            'resID': resourceID,
            'resPath': output_data_path,
            'resType': 'table',
            'resFormat': {"text/csv": ["csv"]},
            'isCollection': False,
            'columns': columnConfigs
        })

        final_data_file_path = join(self.dataset_output_dir, output_data_path)

        dir_info = create_directory(dirname(final_data_file_path))
        if not dir_info.success:
            self.add_err_msg(dir_info.err_msg)
            return

        data.to_csv(final_data_file_path, index=False)

    # write dataset config
    self.dataset_doc_path = join(self.dataset_output_dir, 'datasetDoc.json')
    with open(self.dataset_doc_path, 'w') as dataset_doc:
        dataset_doc.write(json.dumps(
            {
                'about': {
                    **{
                        'datasetID': dataset_id,
                        'datasetSchemaVersion': dp_static.DATASET_SCHEMA_VERSION,
                        'redacted': True,
                        'digest': hashlib.sha256(
                            self.about['datasetName'].encode()).hexdigest()
                    },
                    **self.about
                },
                'dataResources': resource_configs
            },
            indent=4))
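# For orientation, an abbreviated example of the datasetDoc.json written above
# for a single learningData.csv with a d3mIndex column and one target.  Values
# are illustrative; the schema version and the colType mapping come from
# dp_static, and `**self.about` merges in whatever the UI supplied.
#
# {
#     "about": {
#         "datasetID": "My_Dataset",
#         "datasetSchemaVersion": "<dp_static.DATASET_SCHEMA_VERSION>",
#         "redacted": true,
#         "digest": "<sha256 of the dataset name>",
#         "datasetName": "My Dataset"
#     },
#     "dataResources": [
#         {
#             "resID": "learningData",
#             "resPath": "tables/learningData.csv",
#             "resType": "table",
#             "resFormat": {"text/csv": ["csv"]},
#             "isCollection": false,
#             "columns": [
#                 {"colIndex": 0, "colName": "d3mIndex",
#                  "colType": "integer", "role": ["index"]},
#                 {"colIndex": 1, "colName": "myTarget",
#                  "colType": "real", "role": ["suggestedTarget"]}
#             ]
#         }
#     ]
# }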