def save_container(container: typing.Any, output_dir: str) -> None:
    # Saving data.
    if isinstance(container, container_module.Dataset):
        dataset_root_metadata = container.metadata.query(())

        missing_metadata: typing.Dict = {}
        for d3m_path, (dataset_path, required) in dataset_module.D3M_TO_DATASET_FIELDS.items():
            if not required:
                continue

            if utils.get_dict_path(dataset_root_metadata, dataset_path) is None:
                # TODO: Use some better value instead of this random value?
                utils.set_dict_path(missing_metadata, dataset_path, str(uuid.uuid4()))

        if missing_metadata:
            container = container.copy()
            container.metadata = container.metadata.update((), missing_metadata)

        # Dataset saver creates any missing directories.
        dataset_uri = utils.path_to_uri(os.path.abspath(os.path.join(output_dir, 'datasetDoc.json')))
        container.save(dataset_uri)
    else:
        # We do not want to overwrite anything.
        os.makedirs(output_dir, exist_ok=False)
        dataframe_path = os.path.join(output_dir, 'data.csv')

        if isinstance(container, container_module.DataFrame):
            container.to_csv(dataframe_path)
        elif isinstance(container, (container_module.List, container_module.ndarray)):
            container = container_module.DataFrame(container)
            container.to_csv(dataframe_path)
        else:
            raise exceptions.NotSupportedError(
                "Value with type '{value_type}' cannot be saved as a container type.".format(value_type=type(container)),
            )

    # Saving metadata. This is just for debugging purposes, so we are
    # using "to_json_structure" and not "to_internal_json_structure".
    input_metadata = container.metadata.to_json_structure()
    metadata_path = os.path.join(output_dir, 'metadata.json')
    with open(metadata_path, 'w') as outfile:
        json.dump(input_metadata, outfile, indent=2, sort_keys=True, allow_nan=False)
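# A minimal usage sketch for "save_container" (illustrative, not part of the
# original module; the helper name "_example_save_container" and the output
# path are hypothetical). Saving a plain DataFrame container takes the
# non-Dataset branch above: "data.csv" plus a debugging "metadata.json" are
# written into a directory which must not already exist.
def _example_save_container(output_dir: str) -> None:
    # "generate_metadata=True" populates "container.metadata", so the
    # "metadata.json" dump is non-trivial.
    predictions = container_module.DataFrame(
        {'d3mIndex': [0, 1, 2], 'prediction': ['a', 'b', 'a']},
        generate_metadata=True,
    )
    save_container(predictions, os.path.join(output_dir, 'predictions'))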
def get_class(self) -> typing.Any:
    """
    Returns a class suitable for computing this metric.
    """

    # Importing here to prevent import cycle.
    from d3m import metrics

    if self in metrics.class_map:
        return metrics.class_map[self]  # type: ignore

    if self in self._additional_score_class_map:
        return self._additional_score_class_map[self]  # type: ignore

    raise exceptions.NotSupportedError("Computing metric {metric} is not supported.".format(metric=self))
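# A hedged usage sketch for "get_class" (assumes this method lives on d3m's
# "PerformanceMetric" enum and that "truth" and "predictions" are DataFrames
# in the layout "d3m.metrics" expects):
#
#     metric_class = PerformanceMetric.ACCURACY.get_class()
#     score = metric_class().score(truth, predictions)
#
# A metric registered in neither map raises "exceptions.NotSupportedError".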
def crawl_openml_task(
    datasets: typing.Dict[str, str], task_id: int, save_dir: str, *,
    data_pipeline: pipeline_module.Pipeline,
    data_params: typing.Dict[str, str] = None,
    context: metadata_base.Context,
    random_seed: int = 0,
    volumes_dir: str = None,
    scratch_dir: str = None,
    runtime_environment: pipeline_run_module.RuntimeEnvironment = None,
    dataset_resolver: typing.Callable = None,
    problem_resolver: typing.Callable = None,
    compute_digest: dataset_module.ComputeDigest = dataset_module.ComputeDigest.ONLY_IF_MISSING,
    strict_digest: bool = False,
) -> None:
    """
    A function that crawls an OpenML task and the corresponding dataset, does the split
    using a data preparation pipeline, and stores the splits as a D3M dataset and
    problem description.

    Parameters
    ----------
    datasets:
        A mapping between known dataset IDs and their paths. Is updated in-place.
    task_id:
        An integer representing an OpenML task ID to crawl and convert.
    save_dir:
        A directory where to save datasets and problems.
    data_pipeline:
        A data preparation pipeline used for splitting.
    data_params:
        A dictionary that contains the hyper-parameters for the data preparation pipeline.
    context:
        In which context to run pipelines.
    random_seed:
        A random seed to use for every run. This controls all randomness during the run.
    volumes_dir:
        Path to a directory with static files required by primitives.
    scratch_dir:
        Path to a directory to store any temporary files needed during execution.
    runtime_environment:
        A description of the runtime environment.
    dataset_resolver:
        A dataset resolver to use.
    problem_resolver:
        A problem description resolver to use.
    compute_digest:
        Compute a digest over the data?
    strict_digest:
        If computed digest does not match the one provided in metadata, raise an exception?
    """

    if dataset_resolver is None:
        dataset_resolver = dataset_module.get_dataset
    if problem_resolver is None:
        problem_resolver = problem_module.get_problem

    number_of_folds = runtime._get_number_of_folds(data_params)
    assert number_of_folds != 0

    problem_uri = f'https://www.openml.org/t/{task_id}'
    problem_description = problem_resolver(problem_uri, strict_digest=strict_digest)

    if len(problem_description['inputs']) != 1:
        raise exceptions.NotSupportedError("OpenML problem descriptions with multiple inputs are not supported.")

    problem_description_input = problem_description['inputs'][0]
    input_dataset_id = problem_description_input['dataset_id']

    known_datasets_set = set(datasets.keys())
    needed_splits_set = set()

    # We make sure when splitting that the output dataset has the same ID as the input dataset
    # with an additional suffix for split type, and we are taking advantage of this here.
    # The naming scheme matches "runtime._get_split_dataset_id".
    if number_of_folds == 1:
        needed_splits_set.add(f'{input_dataset_id}_TRAIN')
        needed_splits_set.add(f'{input_dataset_id}_TEST')
        needed_splits_set.add(f'{input_dataset_id}_SCORE')

        dataset_view_maps = [{
            'train': [
                {
                    'from': input_dataset_id,
                    'to': f'{input_dataset_id}_TRAIN',
                },
            ],
            'test': [
                {
                    'from': input_dataset_id,
                    'to': f'{input_dataset_id}_TEST',
                },
            ],
            'score': [
                {
                    'from': input_dataset_id,
                    'to': f'{input_dataset_id}_SCORE',
                },
            ],
        }]
    else:
        dataset_view_maps = []
        for fold_index in range(number_of_folds):
            needed_splits_set.add(f'{input_dataset_id}_FOLD_{fold_index}_TRAIN')
            needed_splits_set.add(f'{input_dataset_id}_FOLD_{fold_index}_TEST')
            needed_splits_set.add(f'{input_dataset_id}_FOLD_{fold_index}_SCORE')

            dataset_view_maps.append({
                'train': [
                    {
                        'from': input_dataset_id,
                        'to': f'{input_dataset_id}_FOLD_{fold_index}_TRAIN',
                    },
                ],
                'test': [
                    {
                        'from': input_dataset_id,
                        'to': f'{input_dataset_id}_FOLD_{fold_index}_TEST',
                    },
                ],
                'score': [
                    {
                        'from': input_dataset_id,
                        'to': f'{input_dataset_id}_FOLD_{fold_index}_SCORE',
                    },
                ],
            })

    # We already have these splits, so we can just reuse them.
    if problem_description_input['dataset_id'] in known_datasets_set and needed_splits_set <= known_datasets_set:
        logger.debug("Copying existing splits.")

        # Copy splits.
        if number_of_folds == 1:
            view_maps = dataset_view_maps[0]
            for split_type in ['train', 'test', 'score']:
                shutil.copytree(
                    os.path.dirname(datasets[runtime._get_dataset_id_from_view_maps(view_maps, split_type, input_dataset_id)]),
                    os.path.join(save_dir, split_type.upper(), f'dataset_{split_type.upper()}'),
                )

                # Save problem description for the split. We do not copy because we copy only datasets.
                problem_path = os.path.abspath(os.path.join(save_dir, split_type.upper(), f'problem_{split_type.upper()}', 'problemDoc.json'))
                runtime._save_problem_description(problem_description, problem_path, dataset_view_maps=view_maps)
        else:
            for fold_index, view_maps in enumerate(dataset_view_maps):
                for split_type in ['train', 'test', 'score']:
                    shutil.copytree(
                        os.path.dirname(datasets[runtime._get_dataset_id_from_view_maps(view_maps, split_type, input_dataset_id)]),
                        os.path.join(save_dir, 'folds', str(fold_index), split_type.upper(), f'dataset_{split_type.upper()}'),
                    )

                    # Save problem description for the split. We do not copy because we copy only datasets.
                    problem_path = os.path.abspath(os.path.join(save_dir, 'folds', str(fold_index), split_type.upper(), f'problem_{split_type.upper()}', 'problemDoc.json'))
                    runtime._save_problem_description(problem_description, problem_path, dataset_view_maps=view_maps)

        # Copy data preparation pipeline run pickle.
        shutil.copy2(
            os.path.join(os.path.dirname(datasets[input_dataset_id]), '..', runtime.DATA_PIPELINE_RUN_FILENAME),
            os.path.join(save_dir, runtime.DATA_PIPELINE_RUN_FILENAME),
        )

        # Copy full dataset.
        shutil.copytree(
            os.path.dirname(datasets[input_dataset_id]),
            os.path.join(save_dir, input_dataset_id),
        )
    else:
        logger.debug("Running a data preparation pipeline.")

        openml_dataset_id = int(input_dataset_id.split('_')[-1])
        dataset_uri = f'https://www.openml.org/d/{openml_dataset_id}'
        dataset = dataset_resolver(
            dataset_uri,
            compute_digest=compute_digest,
            strict_digest=strict_digest,
        )

        dataset_id = dataset.metadata.query_field((), 'id')
        if input_dataset_id != dataset_id:
            raise exceptions.InvalidDatasetError(f"Loaded dataset (\"{dataset_id}\") does not have the expected dataset ID (\"{input_dataset_id}\").")

        # Make splits and save them. This saves the pipeline run made by the data preparation pipeline, too.
        runtime.prepare_data_and_save(
            save_dir=save_dir,
            inputs=[dataset],
            data_pipeline=data_pipeline,
            problem_description=problem_description,
            data_params=data_params,
            context=context,
            random_seed=random_seed,
            volumes_dir=volumes_dir,
            scratch_dir=scratch_dir,
            runtime_environment=runtime_environment,
            # We provide "dataset_view_maps" to force split dataset IDs.
            dataset_view_maps=dataset_view_maps,
        )

        # Save full dataset.
        dataset_path = os.path.abspath(os.path.join(save_dir, dataset_id, 'datasetDoc.json'))
        dataset_uri = utils.path_to_uri(dataset_path)
        dataset.save(dataset_uri)

        # Updating known datasets.
        datasets[dataset_id] = dataset_path

        # We make sure when splitting that the output dataset has the same ID as the input dataset
        # with an additional suffix for split type, and we are taking advantage of this here.
        # The naming scheme matches "runtime._get_split_dataset_id".
        if number_of_folds == 1:
            for split_type in ['TRAIN', 'TEST', 'SCORE']:
                datasets[f'{dataset_id}_{split_type}'] = os.path.join(save_dir, split_type, f'dataset_{split_type}', 'datasetDoc.json')
        else:
            for fold_index in range(number_of_folds):
                for split_type in ['TRAIN', 'TEST', 'SCORE']:
                    datasets[f'{dataset_id}_FOLD_{fold_index}_{split_type}'] = os.path.join(save_dir, 'folds', str(fold_index), split_type, f'dataset_{split_type}', 'datasetDoc.json')

    # Save problem description. For splits, the problem description is saved by "runtime.prepare_data_and_save".
    problem_path = os.path.abspath(os.path.join(save_dir, problem_description['id'], 'problemDoc.json'))
    # We do not save "dataset_view_maps" for this problem description.
    runtime._save_problem_description(problem_description, problem_path)
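# A hedged sketch of driving "crawl_openml_task" (the task ID, save directory,
# and helper name "_example_crawl_openml_task" are illustrative; "data_pipeline"
# must be an already resolved data preparation pipeline):
def _example_crawl_openml_task(data_pipeline: pipeline_module.Pipeline) -> None:
    known_datasets: typing.Dict[str, str] = {}
    crawl_openml_task(
        known_datasets,
        task_id=31,  # An arbitrary OpenML task, for illustration.
        save_dir='/tmp/openml_crawl/31',
        data_pipeline=data_pipeline,
        context=metadata_base.Context.TESTING,
    )
    # "known_datasets" now maps the full dataset ID and every split ID
    # (e.g. "<dataset_id>_TRAIN") to its "datasetDoc.json" path, so a later
    # call for the same dataset takes the copy branch instead of re-splitting.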
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
    # Obtain the path to the dataset.
    temp_json = inputs.to_json_structure()
    datasetDoc_uri = temp_json['location_uris'][0][7:]
    location_base_uri = '/'.join(datasetDoc_uri.split('/')[:-1])

    with open(datasetDoc_uri) as json_file:
        datasetDoc_json = json.load(json_file)
    dataResources = datasetDoc_json['dataResources']

    # Get the task type from the problem doc.
    temp_path = datasetDoc_uri.split('/')
    problemDoc_uri = '/'.join(temp_path[:-2]) + '/' + '/'.join(temp_path[-2:]).replace('dataset', 'problem')
    with open(problemDoc_uri) as json_file:
        task_types = json.load(json_file)['about']['taskKeywords']

    # TODO: Consider avoiding explicit use of problem type throughout the pipeline.
    TASK = ""
    for task in task_types:
        if task in ["communityDetection", "linkPrediction", "vertexClassification", "graphMatching"]:
            TASK = task
    if TASK == "":
        raise exceptions.NotSupportedError("only graph tasks are supported")

    # Load the graphs and convert them to networkx objects.
    graphs = []
    nodeIDs = []
    for i in dataResources:
        if i['resType'] == "table":
            if i['resID'] == 'learningData':
                df = inputs['learningData']
            else:
                node_list = pd.read_csv(location_base_uri + "/" + i['resPath'])

                # Assume it is a nodeList otherwise. Currently, there
                # aren't any D3M nodeList datasets that have more than one
                # graph. Furthermore, even if there were, there isn't
                # a way to match an edgeList to a nodeList. Hence, we have
                # to assume that the nodeList corresponds to the first graph.
                graph = graphs[0]

                # The following block essentially catches the VXTC synthetic
                # dataset and overwrites nodeList indices with the edgeList's.
                # Without a doubt not an AutoML way, but it is necessary.
                first_idx_edge = str(sorted(list(graph.nodes(data=False)))[0])
                first_idx_node = str(sorted(list(node_list['nodeID']))[0])
                if (first_idx_edge.isdigit() and first_idx_node.isdigit() and int(first_idx_edge) != int(first_idx_node)):
                    node_list = node_list.sort_values('nodeID').reset_index(drop=True)
                    d3m_indices = np.sort(np.array(list(graph.nodes(data=False))).astype(int))
                    node_list['nodeID'] = d3m_indices

                # Make nodeID an index (so it is not used as an attribute).
                node_list = node_list.set_index('nodeID')
                node_list.index = node_list.index.astype(str)

                # Iterate over attributes and assign them to nodes.
                for attribute in node_list.columns.tolist():
                    series = pd.Series(node_list[attribute], index=node_list.index)
                    nx.set_node_attributes(graph, series.to_dict(), attribute)
        elif i['resType'] == 'graph':
            graph_temp = nx.read_gml(location_base_uri + "/" + i['resPath'])
            graphs.append(graph_temp)
            if TASK in ["communityDetection", "vertexClassification"]:
                nodeIDs_temp = list(nx.get_node_attributes(graphs[0], 'nodeID').values())
                nodeIDs_temp = np.array([str(i) for i in nodeIDs_temp])
                nodeIDs_temp = container.ndarray(nodeIDs_temp)
                nodeIDs.append(nodeIDs_temp)
        elif i['resType'] == "edgeList":
            temp_graph = self._read_edgelist(
                location_base_uri + "/" + i['resPath'],
                i["columns"],
            )
            graphs.append(temp_graph)
            if TASK in ["communityDetection", "vertexClassification"]:
                nodeIDs_temp = list(temp_graph.nodes)
                nodeIDs_temp = np.array([str(i) for i in nodeIDs_temp])
                nodeIDs_temp = container.ndarray(nodeIDs_temp)
                nodeIDs.append(nodeIDs_temp)

    return base.CallResult(container.List([df, graphs, nodeIDs, TASK]))
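# A hedged sketch of consuming this primitive's output (assumes "result" is
# the CallResult returned by "produce" above):
#
#     learning_df, graphs, node_ids, task = result.value
#
# "graphs" is a list of networkx graphs, "node_ids" holds per-graph node ID
# arrays for community detection and vertex classification tasks, and "task"
# is one of the four supported graph task keywords.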
def main(argv: typing.Sequence) -> None:
    raise exceptions.NotSupportedError("This CLI has been removed. Use \"python3 -m d3m problem describe\" instead.")