Example 1
def save_container(container: typing.Any, output_dir: str) -> None:
    # Saving data.
    if isinstance(container, container_module.Dataset):
        dataset_root_metadata = container.metadata.query(())

        missing_metadata: typing.Dict = {}
        for d3m_path, (
                dataset_path,
                required) in dataset_module.D3M_TO_DATASET_FIELDS.items():
            if not required:
                continue

            if utils.get_dict_path(dataset_root_metadata,
                                   dataset_path) is None:
                # TODO: Use some better value instead of this random value?
                utils.set_dict_path(missing_metadata, dataset_path,
                                    str(uuid.uuid4()))

        if missing_metadata:
            container = container.copy()
            container.metadata = container.metadata.update((),
                                                           missing_metadata)

        # Dataset saver creates any missing directories.
        dataset_uri = utils.path_to_uri(
            os.path.abspath(os.path.join(output_dir, 'datasetDoc.json')))
        container.save(dataset_uri)
    else:
        # We do not want to overwrite anything.
        os.makedirs(output_dir, exist_ok=False)
        dataframe_path = os.path.join(output_dir, 'data.csv')

        if isinstance(container, container_module.DataFrame):
            container.to_csv(dataframe_path)
        elif isinstance(container,
                        (container_module.List, container_module.ndarray)):
            container = container_module.DataFrame(container)
            container.to_csv(dataframe_path)
        else:
            raise exceptions.NotSupportedError(
                "Value with type '{value_type}' cannot be saved as a container type."
                .format(value_type=type(container)))

    # Saving metadata. This is just for debugging purposes, so we are
    # using "to_json_structure" and not "to_internal_json_structure".
    input_metadata = container.metadata.to_json_structure()
    metadata_path = os.path.join(output_dir, 'metadata.json')

    with open(metadata_path, 'w') as outfile:
        json.dump(input_metadata,
                  outfile,
                  indent=2,
                  sort_keys=True,
                  allow_nan=False)
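
A minimal usage sketch, assuming "container_module" aliases "d3m.container" as in the function above; the output directory below is illustrative and must not already exist, since non-Dataset containers are written into a freshly created directory:

from d3m import container as container_module

df = container_module.DataFrame({'a': [1, 2, 3]}, generate_metadata=True)
save_container(df, '/tmp/save_container_example')  # writes data.csv and metadata.json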
Example 2
    def get_class(self) -> typing.Any:
        """
        Returns a class suitable for computing this metric.
        """

        # Importing here to prevent import cycle.
        from d3m import metrics

        if self in metrics.class_map:
            return metrics.class_map[self]  # type: ignore

        if self in self._additional_score_class_map:
            return self._additional_score_class_map[self]  # type: ignore

        raise exceptions.NotSupportedError(
            "Computing metric {metric} is not supported.".format(metric=self))
Example 3
def crawl_openml_task(
    datasets: typing.Dict[str, str],
    task_id: int,
    save_dir: str,
    *,
    data_pipeline: pipeline_module.Pipeline,
    data_params: typing.Dict[str, str] = None,
    context: metadata_base.Context,
    random_seed: int = 0,
    volumes_dir: str = None,
    scratch_dir: str = None,
    runtime_environment: pipeline_run_module.RuntimeEnvironment = None,
    dataset_resolver: typing.Callable = None,
    problem_resolver: typing.Callable = None,
    compute_digest: dataset_module.ComputeDigest = dataset_module.
    ComputeDigest.ONLY_IF_MISSING,
    strict_digest: bool = False,
) -> None:
    """
    A function that crawls an OpenML task and the corresponding dataset, does the split using a data
    preparation pipeline, and stores the splits as a D3M dataset and problem description.

    Parameters
    ----------
    datasets:
        A mapping between known dataset IDs and their paths. Is updated in-place.
    task_id:
        An integer representing an OpenML task id to crawl and convert.
    save_dir:
        A directory where to save datasets and problems.
    data_pipeline:
        A data preparation pipeline used for splitting.
    data_params:
        A dictionary that contains the hyper-parameters for the data preparation pipeline.
    context:
        In which context to run pipelines.
    random_seed:
        A random seed to use for every run. This controls all randomness during the run.
    volumes_dir:
        Path to a directory with static files required by primitives.
    scratch_dir:
        Path to a directory to store any temporary files needed during execution.
    runtime_environment:
        A description of the runtime environment.
    dataset_resolver:
        A dataset resolver to use.
    problem_resolver:
        A problem description resolver to use.
    compute_digest:
        Compute a digest over the data?
    strict_digest:
        If computed digest does not match the one provided in metadata, raise an exception?
    """

    if dataset_resolver is None:
        dataset_resolver = dataset_module.get_dataset
    if problem_resolver is None:
        problem_resolver = problem_module.get_problem

    number_of_folds = runtime._get_number_of_folds(data_params)
    assert number_of_folds != 0

    problem_uri = f'https://www.openml.org/t/{task_id}'
    problem_description = problem_resolver(problem_uri,
                                           strict_digest=strict_digest)

    if len(problem_description['inputs']) != 1:
        raise exceptions.NotSupportedError(
            "OpenML problem descriptions with multiple inputs are not supported."
        )

    problem_description_input = problem_description['inputs'][0]
    input_dataset_id = problem_description_input['dataset_id']

    known_datasets_set = set(datasets.keys())
    needed_splits_set = set()
    # We make sure when splitting that the output dataset has the same ID as the input dataset
    # with additional suffix for split type, and we are taking advantage of this here.
    # The naming scheme matches "runtime._get_split_dataset_id".
    if number_of_folds == 1:
        needed_splits_set.add(f'{input_dataset_id}_TRAIN')
        needed_splits_set.add(f'{input_dataset_id}_TEST')
        needed_splits_set.add(f'{input_dataset_id}_SCORE')
        dataset_view_maps = [{
            'train': [
                {
                    'from': input_dataset_id,
                    'to': f'{input_dataset_id}_TRAIN',
                },
            ],
            'test': [
                {
                    'from': input_dataset_id,
                    'to': f'{input_dataset_id}_TEST',
                },
            ],
            'score': [
                {
                    'from': input_dataset_id,
                    'to': f'{input_dataset_id}_SCORE',
                },
            ],
        }]
    else:
        dataset_view_maps = []
        for fold_index in range(number_of_folds):
            needed_splits_set.add(
                f'{input_dataset_id}_FOLD_{fold_index}_TRAIN')
            needed_splits_set.add(f'{input_dataset_id}_FOLD_{fold_index}_TEST')
            needed_splits_set.add(
                f'{input_dataset_id}_FOLD_{fold_index}_SCORE')
            dataset_view_maps.append({
                'train': [
                    {
                        'from': input_dataset_id,
                        'to': f'{input_dataset_id}_FOLD_{fold_index}_TRAIN',
                    },
                ],
                'test': [
                    {
                        'from': input_dataset_id,
                        'to': f'{input_dataset_id}_FOLD_{fold_index}_TEST',
                    },
                ],
                'score': [
                    {
                        'from': input_dataset_id,
                        'to': f'{input_dataset_id}_FOLD_{fold_index}_SCORE',
                    },
                ],
            })

    # We already have this split, we can just reuse it.
    if (problem_description_input['dataset_id'] in known_datasets_set
            and needed_splits_set <= known_datasets_set):
        logger.debug("Copying existing splits.")

        # Copy splits.
        if number_of_folds == 1:
            view_maps = dataset_view_maps[0]
            for split_type in ['train', 'test', 'score']:
                shutil.copytree(
                    os.path.dirname(
                        datasets[runtime._get_dataset_id_from_view_maps(
                            view_maps, split_type, input_dataset_id)]),
                    os.path.join(save_dir, split_type.upper(),
                                 f'dataset_{split_type.upper()}'),
                )

                # Save problem description for the split. We do not copy it because only datasets are copied.
                problem_path = os.path.abspath(
                    os.path.join(save_dir, split_type.upper(),
                                 f'problem_{split_type.upper()}',
                                 'problemDoc.json'))
                runtime._save_problem_description(problem_description,
                                                  problem_path,
                                                  dataset_view_maps=view_maps)
        else:
            for fold_index, view_maps in enumerate(dataset_view_maps):
                for split_type in ['train', 'test', 'score']:
                    shutil.copytree(
                        os.path.dirname(
                            datasets[runtime._get_dataset_id_from_view_maps(
                                view_maps, split_type, input_dataset_id)]),
                        os.path.join(save_dir, 'folds', str(fold_index),
                                     split_type.upper(),
                                     f'dataset_{split_type.upper()}'),
                    )

                    # Save problem description for the split. We do not copy it because only datasets are copied.
                    problem_path = os.path.abspath(
                        os.path.join(save_dir, 'folds', str(fold_index),
                                     split_type.upper(),
                                     f'problem_{split_type.upper()}',
                                     'problemDoc.json'))
                    runtime._save_problem_description(
                        problem_description,
                        problem_path,
                        dataset_view_maps=view_maps)

        # Copy data preparation pipeline run pickle.
        shutil.copy2(
            os.path.join(os.path.dirname(datasets[input_dataset_id]), '..',
                         runtime.DATA_PIPELINE_RUN_FILENAME),
            os.path.join(save_dir, runtime.DATA_PIPELINE_RUN_FILENAME),
        )

        # Copy full dataset.
        shutil.copytree(
            os.path.dirname(datasets[input_dataset_id]),
            os.path.join(save_dir, input_dataset_id),
        )

    else:
        logger.debug("Running a data preparation pipeline.")

        openml_dataset_id = int(input_dataset_id.split('_')[-1])
        dataset_uri = f'https://www.openml.org/d/{openml_dataset_id}'
        dataset = dataset_resolver(
            dataset_uri,
            compute_digest=compute_digest,
            strict_digest=strict_digest,
        )
        dataset_id = dataset.metadata.query_field((), 'id')

        if input_dataset_id != dataset_id:
            raise exceptions.InvalidDatasetError(
                f"Loaded dataset (\"{dataset_id}\") does not have the expected dataset ID (\"{input_dataset_id}\")."
            )

        # Make splits and save them. This saves the pipeline run made by the data preparation pipeline, too.
        runtime.prepare_data_and_save(
            save_dir=save_dir,
            inputs=[dataset],
            data_pipeline=data_pipeline,
            problem_description=problem_description,
            data_params=data_params,
            context=context,
            random_seed=random_seed,
            volumes_dir=volumes_dir,
            scratch_dir=scratch_dir,
            runtime_environment=runtime_environment,
            # We provide "dataset_view_maps" to force split dataset IDs.
            dataset_view_maps=dataset_view_maps,
        )

        # Save full dataset.
        dataset_path = os.path.abspath(
            os.path.join(save_dir, dataset_id, 'datasetDoc.json'))
        dataset_uri = utils.path_to_uri(dataset_path)
        dataset.save(dataset_uri)

        # Updating known datasets.
        datasets[dataset_id] = dataset_path
        # We make sure when splitting that the output dataset has the same ID as the input dataset
        # with additional suffix for split type, and we are taking advantage of this here.
        # The naming scheme matches "runtime._get_split_dataset_id".
        if number_of_folds == 1:
            for split_type in ['TRAIN', 'TEST', 'SCORE']:
                datasets[f'{dataset_id}_{split_type}'] = os.path.join(
                    save_dir, split_type, f'dataset_{split_type}',
                    'datasetDoc.json')
        else:
            for fold_index in range(number_of_folds):
                for split_type in ['TRAIN', 'TEST', 'SCORE']:
                    datasets[
                        f'{dataset_id}_FOLD_{fold_index}_{split_type}'] = os.path.join(
                            save_dir, 'folds', str(fold_index), split_type,
                            f'dataset_{split_type}', 'datasetDoc.json')

    # Save problem description. For splits, problem description is saved by "runtime.prepare_data_and_save".
    problem_path = os.path.abspath(
        os.path.join(save_dir, problem_description['id'], 'problemDoc.json'))
    # We do not save "dataset_view_maps" for this problem description.
    runtime._save_problem_description(problem_description, problem_path)
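
A minimal invocation sketch, assuming the same module aliases as above and that a data preparation (splitting) pipeline description is available on disk; the pipeline path and task id are illustrative:

with open('train-test-split.json') as pipeline_file:  # illustrative path
    data_pipeline = pipeline_module.Pipeline.from_json(pipeline_file)

datasets: typing.Dict[str, str] = {}
crawl_openml_task(
    datasets,
    task_id=59,  # illustrative OpenML task id
    save_dir='openml_crawl_output',
    data_pipeline=data_pipeline,
    context=metadata_base.Context.TESTING,
)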
Example 4
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        # obtain the path to the dataset description document
        temp_json = inputs.to_json_structure()
        # strip the "file://" scheme prefix (7 characters) to get a local path
        datasetDoc_uri = temp_json['location_uris'][0][7:]
        location_base_uri = '/'.join(datasetDoc_uri.split('/')[:-1])

        with open(datasetDoc_uri) as json_file:
            datasetDoc_json = json.load(json_file)
            dataResources = datasetDoc_json['dataResources']

        # get the task keywords from the problem doc
        temp_path = datasetDoc_uri.split('/')
        problemDoc_uri = '/'.join(temp_path[:-2]) + '/' + '/'.join(
            temp_path[-2:]).replace('dataset', 'problem')

        with open(problemDoc_uri) as json_file:
            task_types = json.load(json_file)['about']['taskKeywords']

        # TODO consider avoiding explicit use of problem type throughout pipeline
        TASK = ""
        for task in task_types:
            if task in [
                    "communityDetection", "linkPrediction",
                    "vertexClassification", "graphMatching"
            ]:
                TASK = task
        if TASK == "":
            raise exceptions.NotSupportedError(
                "Only graph tasks are supported.")

        # load the graphs and convert to a networkx object
        graphs = []
        nodeIDs = []
        for i in dataResources:
            if i['resType'] == "table":
                if i['resID'] == 'learningData':
                    df = inputs['learningData']
                else:
                    node_list = pd.read_csv(location_base_uri + "/" +
                                            i['resPath'])

                    # assume it is a nodeList otherwise. currently, there
                    # aren't any D3M nodeList datasets that have more than one
                    # graph. furthermore, even if there were such a dataset,
                    # there is no way to match an edgeList to a nodeList.
                    # hence, we assume the nodeList corresponds to the first
                    # graph
                    graph = graphs[0]

                    # the following block essentially catches the VXTC
                    # synthetic dataset and overwrites nodeList indices with
                    # edgeList indices. without a doubt not an AutoML way, but
                    # it is necessary
                    first_idx_edge = str(
                        sorted(list(graph.nodes(data=False)))[0])
                    first_idx_node = str(sorted(list(node_list['nodeID']))[0])
                    if (first_idx_edge.isdigit() and first_idx_node.isdigit()
                            and int(first_idx_edge) != int(first_idx_node)):
                        node_list = node_list.sort_values(
                            'nodeID').reset_index(drop=True)
                        d3m_indices = np.sort(
                            np.array(list(
                                graph.nodes(data=False))).astype(int))
                        node_list['nodeID'] = d3m_indices

                    # make nodeID an index (so it is not used as an attribute)
                    node_list = node_list.set_index('nodeID')
                    node_list.index = node_list.index.astype(str)

                    # iterate over attributes and assign them to nodes
                    for attribute in node_list.columns.tolist():
                        series = pd.Series(node_list[attribute],
                                           index=node_list.index)
                        nx.set_node_attributes(graph, series.to_dict(),
                                               attribute)

            elif i['resType'] == 'graph':
                graph_temp = nx.read_gml(location_base_uri + "/" +
                                         i['resPath'])
                graphs.append(graph_temp)
                if TASK in ["communityDetection", "vertexClassification"]:
                    nodeIDs_temp = list(
                        nx.get_node_attributes(graphs[0], 'nodeID').values())
                    nodeIDs_temp = np.array([str(i) for i in nodeIDs_temp])
                    nodeIDs_temp = container.ndarray(nodeIDs_temp)
                    nodeIDs.append(nodeIDs_temp)
            elif i['resType'] == "edgeList":
                temp_graph = self._read_edgelist(
                    location_base_uri + "/" + i['resPath'],
                    i["columns"],
                )
                graphs.append(temp_graph)
                if TASK in ["communityDetection", "vertexClassification"]:
                    nodeIDs_temp = list(temp_graph.nodes)
                    nodeIDs_temp = np.array([str(i) for i in nodeIDs_temp])
                    nodeIDs_temp = container.ndarray(nodeIDs_temp)
                    nodeIDs.append(nodeIDs_temp)

        return base.CallResult(container.List([df, graphs, nodeIDs, TASK]))
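
A minimal, hypothetical invocation sketch, assuming this produce method belongs to a graph-loading primitive (called GraphLoaderPrimitive here purely for illustration) and that the input Dataset was loaded from a local datasetDoc.json so that its "location_uris" metadata is populated:

from d3m.container import dataset as dataset_module

dataset = dataset_module.get_dataset('file:///path/to/datasetDoc.json')  # illustrative path
hyperparams_class = GraphLoaderPrimitive.metadata.get_hyperparams()
primitive = GraphLoaderPrimitive(hyperparams=hyperparams_class.defaults())
df, graphs, nodeIDs, task = primitive.produce(inputs=dataset).value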
Example 5
def main(argv: typing.Sequence) -> None:
    raise exceptions.NotSupportedError(
        "This CLI has been removed. Use \"python3 -m d3m problem describe\" instead."
    )