    def test_basic(self):
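        # Load an image dataset, convert its main resource to a DataFrame, and read
        # the image file names with a dummy image reader primitive.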
        dataset_doc_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), 'data', 'datasets',
                         'image_dataset_1', 'datasetDoc.json'))

        dataset = container.Dataset.load(utils.path_to_uri(dataset_doc_path))

        dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
        dataframe_primitive = DatasetToDataFramePrimitive(
            hyperparams=dataframe_hyperparams_class.defaults().replace(
                {'dataframe_resource': '0'}))
        dataframe = dataframe_primitive.produce(inputs=dataset).value

        image_hyperparams_class = DummyImageReaderPrimitive.metadata.get_hyperparams()
        image_primitive = DummyImageReaderPrimitive(
            hyperparams=image_hyperparams_class.defaults().replace(
                {'return_result': 'replace'}))
        images_names = image_primitive.produce(inputs=dataframe).value

        self.assertEqual(images_names.iloc[0]['filename'][0],
                         '001_HandPhoto_left_01.jpg')
        self.assertEqual(images_names.iloc[1]['filename'][0],
                         'cifar10_bird_1.png')
        self.assertEqual(images_names.iloc[2]['filename'][0],
                         'cifar10_bird_2.png')
        self.assertEqual(images_names.iloc[3]['filename'][0], 'mnist_0_2.png')
        self.assertEqual(images_names.iloc[4]['filename'][0], 'mnist_1_1.png')

        self._test_metadata(images_names.metadata)
Example #2
    def test_save_d3m_problem(self):
        self.maxDiff = None

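        # Load a problem description, save it to a new location, reload it, and check
        # that the two structures match (ignoring location URIs).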
        problem_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'problems', 'iris_problem_1', 'problemDoc.json'))

        problem_uri = utils.path_to_uri(problem_doc_path)
        problem_description = problem.Problem.load(problem_uri)

        problem_path = os.path.join(os.path.abspath(self.test_dir), 'problem', 'problemDoc.json')
        saved_problem_uri = utils.path_to_uri(problem_path)
        problem_description.save(saved_problem_uri)
        saved_problem_description = problem.Problem.load(saved_problem_uri)

        original = problem_description.to_simple_structure()
        saved = saved_problem_description.to_simple_structure()
        del original['location_uris']
        del saved['location_uris']

        self.assertEqual(original, saved)
Example #3
def save_container(container: typing.Any, output_dir: str) -> None:
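    """
    Save a container value into "output_dir": a Dataset is saved with the D3M dataset
    saver, a DataFrame/List/ndarray is saved as "data.csv", and in all cases the
    container's metadata is dumped to "metadata.json" for debugging purposes.
    """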
    # Saving data.
    if isinstance(container, container_module.Dataset):
        dataset_root_metadata = container.metadata.query(())

        missing_metadata: typing.Dict = {}
        for d3m_path, (
                dataset_path,
                required) in dataset_module.D3M_TO_DATASET_FIELDS.items():
            if not required:
                continue

            if utils.get_dict_path(dataset_root_metadata,
                                   dataset_path) is None:
                # TODO: Use some better value instead of this random value?
                utils.set_dict_path(missing_metadata, dataset_path,
                                    str(uuid.uuid4()))

        if missing_metadata:
            container = container.copy()
            container.metadata = container.metadata.update((),
                                                           missing_metadata)

        # Dataset saver creates any missing directories.
        dataset_uri = utils.path_to_uri(
            os.path.abspath(os.path.join(output_dir, 'datasetDoc.json')))
        container.save(dataset_uri)
    else:
        # We do not want to overwrite anything.
        os.makedirs(output_dir, exist_ok=False)
        dataframe_path = os.path.join(output_dir, 'data.csv')

        if isinstance(container, container_module.DataFrame):
            container.to_csv(dataframe_path)
        elif isinstance(container,
                        (container_module.List, container_module.ndarray)):
            container = container_module.DataFrame(container)
            container.to_csv(dataframe_path)
        else:
            raise exceptions.NotSupportedError(
                "Value with type '{value_type}' cannot be saved as a container type."
                .format(value_type=type(container)))

    # Saving metadata. This is just for debugging purposes, so we are
    # using "to_json_structure" and not "to_internal_json_structure".
    input_metadata = container.metadata.to_json_structure()
    metadata_path = os.path.join(output_dir, 'metadata.json')

    with open(metadata_path, 'w') as outfile:
        json.dump(input_metadata,
                  outfile,
                  indent=2,
                  sort_keys=True,
                  allow_nan=False)
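
# A minimal usage sketch (hypothetical output directories and values):
#
#   save_container(dataset, '/tmp/exported_dataset')           # saved as a D3M dataset
#   save_container(predictions_dataframe, '/tmp/predictions')  # saved as data.csv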
Example #4
    def test_conversion(self):
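        # Round-trip a problem description through its JSON structure (current and
        # legacy forms) and check that nothing is lost.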
        problem_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'problems', 'iris_problem_1', 'problemDoc.json'))

        problem_uri = utils.path_to_uri(problem_doc_path)

        problem_description = problem.Problem.load(problem_uri)

        self.assertEqual(problem_description.to_simple_structure(), problem.Problem.from_json_structure(problem_description.to_json_structure(), strict_digest=True).to_simple_structure())

        # Legacy.
        self.assertEqual(utils.to_json_structure(problem_description.to_simple_structure()), problem.Problem.from_json_structure(utils.to_json_structure(problem_description.to_simple_structure()), strict_digest=True).to_simple_structure())

        self.assertIs(problem.Problem.from_json_structure(problem_description.to_json_structure(), strict_digest=True)['problem']['task_keywords'][0], problem.TaskKeyword.CLASSIFICATION)
Example #5
    def test_regression(self):
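        # Score random regression predictions against the dataset's true target with
        # several regression metrics, for both ordered and shuffled predictions.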
        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json'))

        dataset = container.Dataset.load(utils.path_to_uri(dataset_doc_path))

        # We set semantic types like runtime would.
        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')

        random = numpy.random.RandomState(42)

        # Create a synthetic prediction DataFrame.
        d3mIndex = dataset['learningData'].iloc[:, 0].astype(int)
        value = random.randn(len(d3mIndex))
        predictions = container.DataFrame({'d3mIndex': d3mIndex, 'value': value}, generate_metadata=True)
        shuffled_predictions = predictions.reindex(random.permutation(predictions.index)).reset_index(drop=True)

        hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams()
        metrics_class = hyperparams_class.configuration['metrics'].elements
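        # Each entry of the 'metrics' hyper-parameter is constructed from the elements
        # hyperparams class of the 'metrics' configuration.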
        primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
            'metrics': [metrics_class({
                'metric': 'MEAN_SQUARED_ERROR',
                'pos_label': None,
                'k': None,
            }), metrics_class({
                'metric': 'ROOT_MEAN_SQUARED_ERROR',
                'pos_label': None,
                'k': None,
            }), metrics_class({
                'metric': 'MEAN_ABSOLUTE_ERROR',
                'pos_label': None,
                'k': None,
            }), metrics_class({
                'metric': 'R_SQUARED',
                'pos_label': None,
                'k': None,
            })],
        }))

        for name, pred in zip(['predictions', 'shuffled_predictions'], [predictions, shuffled_predictions]):
            scores = primitive.produce(inputs=pred, score_dataset=dataset).value
            self.assertEqual(scores.values.tolist(), [
                ['MEAN_SQUARED_ERROR', 3112.184932446708, 0.08521485450672399],
                ['ROOT_MEAN_SQUARED_ERROR', 55.786960236660214, 0.9721137517700256],
                ['MEAN_ABSOLUTE_ERROR', 54.579668078204385, 0.9727169385086356],
                ['R_SQUARED', -22.62418041588221, 0.9881884591239001],
            ], name)

            self.assertEqual(scores.metadata.query_column(0)['name'], 'metric', name)
            self.assertEqual(scores.metadata.query_column(1)['name'], 'value', name)
Example #6
    def test_multivariate(self):
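        # Like the regression test, but with two target columns (amplitude and
        # lengthscale) scored jointly.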
        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'multivariate_dataset_1', 'datasetDoc.json'))

        dataset = container.Dataset.load(utils.path_to_uri(dataset_doc_path))

        # We set semantic types like runtime would.
        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/Target')
        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/Attribute')
        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Target')
        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Attribute')

        random = numpy.random.RandomState(42)

        # Create a synthetic prediction DataFrame.
        d3mIndex = dataset['learningData'].iloc[:, 0].astype(int)
        amplitude = random.randn(len(d3mIndex))
        lengthscale = random.randn(len(d3mIndex))
        predictions = container.DataFrame({'d3mIndex': d3mIndex, 'amplitude': amplitude, 'lengthscale': lengthscale}, generate_metadata=True)
        shuffled_predictions = predictions.reindex(random.permutation(predictions.index)).reset_index(drop=True)

        hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams()
        metrics_class = hyperparams_class.configuration['metrics'].elements
        primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
            'metrics': [metrics_class({
                'metric': 'MEAN_SQUARED_ERROR',
                'pos_label': None,
                'k': None,
            }), metrics_class({
                'metric': 'ROOT_MEAN_SQUARED_ERROR',
                'pos_label': None,
                'k': None,
            }), metrics_class({
                'metric': 'MEAN_ABSOLUTE_ERROR',
                'pos_label': None,
                'k': None,
            })],
        }))

        for name, pred in zip(['predictions', 'shuffled_predictions'], [predictions, shuffled_predictions]):
            scores = primitive.produce(inputs=pred, score_dataset=dataset).value
            self.assertEqual(scores.values.tolist(), [
                ['MEAN_SQUARED_ERROR', 1.7627871219522482, 0.9991186066672619],
                ['ROOT_MEAN_SQUARED_ERROR', 1.3243591896125282, 0.9993378205019783],
                ['MEAN_ABSOLUTE_ERROR', 1.043095768817859, 0.9994784521628801],
            ], name)

            self.assertEqual(scores.metadata.query_column(0)['name'], 'metric', name)
            self.assertEqual(scores.metadata.query_column(1)['name'], 'value', name)
Example #7
    def test_object_detection_just_bounding_polygon(self):
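        # Score synthetic object detection predictions that provide only bounding
        # polygons and confidences, using average precision.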
        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'object_dataset_1', 'datasetDoc.json'))

        dataset = container.Dataset.load(utils.path_to_uri(dataset_doc_path))

        # We set semantic types like runtime would.
        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Target')
        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Attribute')

        random = numpy.random.RandomState(42)

        # Create a synthetic prediction DataFrame.
        predictions = container.DataFrame([
            [0, '330,463,330,505,387,505,387,463', 0.0739],
            [0, '420,433,420,498,451,498,451,433', 0.091],
            [0, '328,465,328,540,403,540,403,465', 0.1008],
            [0, '480,477,480,522,508,522,508,477', 0.1012],
            [0, '357,460,357,537,417,537,417,460', 0.1058],
            [0, '356,456,356,521,391,521,391,456', 0.0843],
            [1, '345,460,345,547,415,547,415,460', 0.0539],
            [1, '381,362,381,513,455,513,455,362', 0.0542],
            [1, '382,366,382,422,416,422,416,366', 0.0559],
            [1, '730,463,730,583,763,583,763,463', 0.0588],
        ], columns=['d3mIndex', 'bounding_polygon_area', 'confidence'], generate_metadata=True)
        shuffled_predictions = predictions.reindex(random.permutation(predictions.index)).reset_index(drop=True)

        hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams()
        metrics_class = hyperparams_class.configuration['metrics'].elements
        primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
            'metrics': [metrics_class({
                'metric': 'OBJECT_DETECTION_AVERAGE_PRECISION',
                'pos_label': None,
                'k': None,
            })],
        }))

        for name, pred in zip(['predictions', 'shuffled_predictions'], [predictions, shuffled_predictions]):
            scores = primitive.produce(inputs=pred, score_dataset=dataset).value
            self.assertEqual(scores.values.tolist(), [
                ['OBJECT_DETECTION_AVERAGE_PRECISION', 0.125, 0.125],
            ], name)

            self.assertEqual(scores.metadata.query_column(0)['name'], 'metric')
            self.assertEqual(scores.metadata.query_column(1)['name'], 'value')
            self.assertEqual(scores.metadata.query_column(2)['name'], 'normalized')
Example #8
    def test_classification(self):
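        # Score random Iris class predictions with accuracy and micro/macro F1, for
        # both ordered and shuffled predictions.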
        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))

        dataset = container.Dataset.load(utils.path_to_uri(dataset_doc_path))

        # We set semantic types like runtime would.
        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')

        random = numpy.random.RandomState(42)

        # Create a synthetic prediction DataFrame.
        d3mIndex = dataset['learningData'].iloc[:, 0].astype(int)
        species = random.choice(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], len(d3mIndex))
        predictions = container.DataFrame({'d3mIndex': d3mIndex, 'species': species}, generate_metadata=True)
        shuffled_predictions = predictions.reindex(random.permutation(predictions.index)).reset_index(drop=True)

        hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams()
        metrics_class = hyperparams_class.configuration['metrics'].elements
        primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
            'metrics': [metrics_class({
                'metric': 'ACCURACY',
                'pos_label': None,
                'k': None,
            }), metrics_class({
                'metric': 'F1_MICRO',
                'pos_label': None,
                'k': None,
            }), metrics_class({
                'metric': 'F1_MACRO',
                'pos_label': None,
                'k': None,
            })],
        }))

        for name, pred in zip(['predictions', 'shuffled_predictions'], [predictions, shuffled_predictions]):
            scores = primitive.produce(inputs=pred, score_dataset=dataset).value
            self.assertEqual(scores.values.tolist(), [
                ['ACCURACY', 0.4066666666666667, 0.4066666666666667],
                ['F1_MICRO', 0.4066666666666667, 0.4066666666666667],
                ['F1_MACRO', 0.4051068540623797, 0.4051068540623797],
            ], name)

            self.assertEqual(scores.metadata.query_column(0)['name'], 'metric', name)
            self.assertEqual(scores.metadata.query_column(1)['name'], 'value', name)
Example #9
def crawl_openml_task(
    datasets: typing.Dict[str, str],
    task_id: int,
    save_dir: str,
    *,
    data_pipeline: pipeline_module.Pipeline,
    data_params: typing.Dict[str, str] = None,
    context: metadata_base.Context,
    random_seed: int = 0,
    volumes_dir: str = None,
    scratch_dir: str = None,
    runtime_environment: pipeline_run_module.RuntimeEnvironment = None,
    dataset_resolver: typing.Callable = None,
    problem_resolver: typing.Callable = None,
    compute_digest: dataset_module.ComputeDigest = dataset_module.ComputeDigest.ONLY_IF_MISSING,
    strict_digest: bool = False,
) -> None:
    """
    Crawls an OpenML task and its corresponding dataset, splits the data using a data
    preparation pipeline, and stores the splits as D3M datasets and problem descriptions.

    Parameters
    ----------
    datasets:
        A mapping between known dataset IDs and their paths. Is updated in-place.
    task_id:
        An integer representing an OpenML task ID to crawl and convert.
    save_dir:
        A directory in which to save datasets and problems.
    data_pipeline:
        A data preparation pipeline used for splitting.
    data_params:
        A dictionary that contains the hyper-parameters for the data preparation pipeline.
    context:
        In which context to run pipelines.
    random_seed:
        A random seed to use for every run. This controls all randomness during the run.
    volumes_dir:
        Path to a directory with static files required by primitives.
    scratch_dir:
        Path to a directory to store any temporary files needed during execution.
    runtime_environment:
        A description of the runtime environment.
    dataset_resolver:
        A dataset resolver to use.
    problem_resolver:
        A problem description resolver to use.
    compute_digest:
        Compute a digest over the data?
    strict_digest:
        If computed digest does not match the one provided in metadata, raise an exception?
    """

    if dataset_resolver is None:
        dataset_resolver = dataset_module.get_dataset
    if problem_resolver is None:
        problem_resolver = problem_module.get_problem

    number_of_folds = runtime._get_number_of_folds(data_params)
    assert number_of_folds != 0

    problem_uri = f'https://www.openml.org/t/{task_id}'
    problem_description = problem_resolver(problem_uri,
                                           strict_digest=strict_digest)

    if len(problem_description['inputs']) != 1:
        raise exceptions.NotSupportedError(
            "OpenML problem descriptions with multiple inputs are not supported."
        )

    problem_description_input = problem_description['inputs'][0]
    input_dataset_id = problem_description_input['dataset_id']

    known_datasets_set = set(datasets.keys())
    needed_splits_set = set()
    # When splitting, we make sure that the output dataset has the same ID as the input dataset
    # with an additional suffix for the split type, and we take advantage of that here.
    # The naming scheme matches "runtime._get_split_dataset_id".
    if number_of_folds == 1:
        needed_splits_set.add(f'{input_dataset_id}_TRAIN')
        needed_splits_set.add(f'{input_dataset_id}_TEST')
        needed_splits_set.add(f'{input_dataset_id}_SCORE')
        dataset_view_maps = [{
            'train': [
                {
                    'from': input_dataset_id,
                    'to': f'{input_dataset_id}_TRAIN',
                },
            ],
            'test': [
                {
                    'from': input_dataset_id,
                    'to': f'{input_dataset_id}_TEST',
                },
            ],
            'score': [
                {
                    'from': input_dataset_id,
                    'to': f'{input_dataset_id}_SCORE',
                },
            ],
        }]
    else:
        dataset_view_maps = []
        for fold_index in range(number_of_folds):
            needed_splits_set.add(
                f'{input_dataset_id}_FOLD_{fold_index}_TRAIN')
            needed_splits_set.add(f'{input_dataset_id}_FOLD_{fold_index}_TEST')
            needed_splits_set.add(
                f'{input_dataset_id}_FOLD_{fold_index}_SCORE')
            dataset_view_maps.append({
                'train': [
                    {
                        'from': input_dataset_id,
                        'to': f'{input_dataset_id}_FOLD_{fold_index}_TRAIN',
                    },
                ],
                'test': [
                    {
                        'from': input_dataset_id,
                        'to': f'{input_dataset_id}_FOLD_{fold_index}_TEST',
                    },
                ],
                'score': [
                    {
                        'from': input_dataset_id,
                        'to': f'{input_dataset_id}_FOLD_{fold_index}_SCORE',
                    },
                ],
            })

    # We already have this split, we can just reuse it.
    if input_dataset_id in known_datasets_set and needed_splits_set <= known_datasets_set:
        logger.debug("Copying existing splits.")

        # Copy splits.
        if number_of_folds == 1:
            view_maps = dataset_view_maps[0]
            for split_type in ['train', 'test', 'score']:
                shutil.copytree(
                    os.path.dirname(
                        datasets[runtime._get_dataset_id_from_view_maps(
                            view_maps, split_type, input_dataset_id)]),
                    os.path.join(save_dir, split_type.upper(),
                                 f'dataset_{split_type.upper()}'),
                )

                # Save the problem description for the split. It is saved anew because only datasets are copied.
                problem_path = os.path.abspath(
                    os.path.join(save_dir, split_type.upper(),
                                 f'problem_{split_type.upper()}',
                                 'problemDoc.json'))
                runtime._save_problem_description(problem_description,
                                                  problem_path,
                                                  dataset_view_maps=view_maps)
        else:
            for fold_index, view_maps in enumerate(dataset_view_maps):
                for split_type in ['train', 'test', 'score']:
                    shutil.copytree(
                        os.path.dirname(
                            datasets[runtime._get_dataset_id_from_view_maps(
                                view_maps, split_type, input_dataset_id)]),
                        os.path.join(save_dir, 'folds', str(fold_index),
                                     split_type.upper(),
                                     f'dataset_{split_type.upper()}'),
                    )

                    # Save the problem description for the split. It is saved anew because only datasets are copied.
                    problem_path = os.path.abspath(
                        os.path.join(save_dir, 'folds', str(fold_index),
                                     split_type.upper(),
                                     f'problem_{split_type.upper()}',
                                     'problemDoc.json'))
                    runtime._save_problem_description(
                        problem_description,
                        problem_path,
                        dataset_view_maps=view_maps)

        # Copy data preparation pipeline run pickle.
        shutil.copy2(
            os.path.join(os.path.dirname(datasets[input_dataset_id]), '..',
                         runtime.DATA_PIPELINE_RUN_FILENAME),
            os.path.join(save_dir, runtime.DATA_PIPELINE_RUN_FILENAME),
        )

        # Copy full dataset.
        shutil.copytree(
            os.path.dirname(datasets[input_dataset_id]),
            os.path.join(save_dir, input_dataset_id),
        )

    else:
        logger.debug("Running a data preparation pipeline.")

        openml_dataset_id = int(input_dataset_id.split('_')[-1])
        dataset_uri = f'https://www.openml.org/d/{openml_dataset_id}'
        dataset = dataset_resolver(
            dataset_uri,
            compute_digest=compute_digest,
            strict_digest=strict_digest,
        )
        dataset_id = dataset.metadata.query_field((), 'id')

        if input_dataset_id != dataset_id:
            raise exceptions.InvalidDatasetError(
                f"Loaded dataset (\"{dataset_id}\") does not have the expected dataset ID (\"{input_dataset_id}\")."
            )

        # Make splits and save them. This saves the pipeline run made by the data preparation pipeline, too.
        runtime.prepare_data_and_save(
            save_dir=save_dir,
            inputs=[dataset],
            data_pipeline=data_pipeline,
            problem_description=problem_description,
            data_params=data_params,
            context=context,
            random_seed=random_seed,
            volumes_dir=volumes_dir,
            scratch_dir=scratch_dir,
            runtime_environment=runtime_environment,
            # We provide "dataset_view_maps" to force split dataset IDs.
            dataset_view_maps=dataset_view_maps,
        )

        # Save full dataset.
        dataset_path = os.path.abspath(
            os.path.join(save_dir, dataset_id, 'datasetDoc.json'))
        dataset_uri = utils.path_to_uri(dataset_path)
        dataset.save(dataset_uri)

        # Updating known datasets.
        datasets[dataset_id] = dataset_path
        # When splitting, we make sure that the output dataset has the same ID as the input dataset
        # with an additional suffix for the split type, and we take advantage of that here.
        # The naming scheme matches "runtime._get_split_dataset_id".
        if number_of_folds == 1:
            for split_type in ['TRAIN', 'TEST', 'SCORE']:
                datasets[f'{dataset_id}_{split_type}'] = os.path.join(
                    save_dir, split_type, f'dataset_{split_type}',
                    'datasetDoc.json')
        else:
            for fold_index in range(number_of_folds):
                for split_type in ['TRAIN', 'TEST', 'SCORE']:
                    datasets[
                        f'{dataset_id}_FOLD_{fold_index}_{split_type}'] = os.path.join(
                            save_dir, 'folds', str(fold_index), split_type,
                            f'dataset_{split_type}', 'datasetDoc.json')

    # Save problem description. For splits, problem description is saved by "runtime.prepare_data_and_save".
    problem_path = os.path.abspath(
        os.path.join(save_dir, problem_description['id'], 'problemDoc.json'))
    # We do not save "dataset_view_maps" for this problem description.
    runtime._save_problem_description(problem_description, problem_path)
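
# A minimal usage sketch (hypothetical task ID, save directory, and pipeline file;
# the data preparation pipeline must be loaded separately):
#
#   datasets: typing.Dict[str, str] = {}
#   with open('data_preparation_pipeline.json') as pipeline_file:
#       data_pipeline = pipeline_module.Pipeline.from_json(pipeline_file)
#   crawl_openml_task(
#       datasets,
#       task_id=59,
#       save_dir='openml_task_59',
#       data_pipeline=data_pipeline,
#       context=metadata_base.Context.TESTING,
#   )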
Example #10
    def test_basic(self):
        self.maxDiff = None

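        # Load the Iris problem description and check its simple and JSON structures,
        # then validate the canonical JSON structure against the problem schema.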
        problem_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'problems', 'iris_problem_1', 'problemDoc.json'))

        problem_uri = utils.path_to_uri(problem_doc_path)

        problem_description = problem.Problem.load(problem_uri)

        self.assertEqual(problem_description.to_simple_structure(), {
            'id': 'iris_problem_1',
            'digest': '1a12135422967aa0de0c4629f4f58d08d39e97f9133f7b50da71420781aa18a5',
            'version': '4.0.0',
            'location_uris': [
                problem_uri,
            ],
            'name': 'Distinguish Iris flowers',
            'description': 'Distinguish Iris flowers of three related species.',
            'schema': problem.PROBLEM_SCHEMA_VERSION,
            'problem': {
                'task_keywords': [problem.TaskKeyword.CLASSIFICATION, problem.TaskKeyword.MULTICLASS],
                'performance_metrics': [
                    {
                        'metric': problem.PerformanceMetric.ACCURACY,
                    }
                ]
            },
            'inputs': [
                {
                    'dataset_id': 'iris_dataset_1',
                    'targets': [
                        {
                            'target_index': 0,
                            'resource_id': 'learningData',
                            'column_index': 5,
                            'column_name': 'species',
                        }
                    ]
                }
            ],
        })

        self.assertEqual(problem_description.to_json_structure(), {
            'id': 'iris_problem_1',
            'digest': '1a12135422967aa0de0c4629f4f58d08d39e97f9133f7b50da71420781aa18a5',
            'version': '4.0.0',
            'location_uris': [
                problem_uri,
            ],
            'name': 'Distinguish Iris flowers',
            'description': 'Distinguish Iris flowers of three related species.',
            'schema': problem.PROBLEM_SCHEMA_VERSION,
            'problem': {
                'task_keywords': [problem.TaskKeyword.CLASSIFICATION, problem.TaskKeyword.MULTICLASS],
                'performance_metrics': [
                    {
                        'metric': problem.PerformanceMetric.ACCURACY,
                    }
                ]
            },
            'inputs': [
                {
                    'dataset_id': 'iris_dataset_1',
                    'targets': [
                        {
                            'target_index': 0,
                            'resource_id': 'learningData',
                            'column_index': 5,
                            'column_name': 'species',
                        }
                    ]
                }
            ],
        })

        self.assertEqual(problem_description.to_json_structure(), {
            'id': 'iris_problem_1',
            'digest': '1a12135422967aa0de0c4629f4f58d08d39e97f9133f7b50da71420781aa18a5',
            'version': '4.0.0',
            'location_uris': [
                problem_uri,
            ],
            'name': 'Distinguish Iris flowers',
            'description': 'Distinguish Iris flowers of three related species.',
            'schema': problem.PROBLEM_SCHEMA_VERSION,
            'problem': {
                'task_keywords': ['CLASSIFICATION', 'MULTICLASS'],
                'performance_metrics': [
                    {
                        'metric': 'ACCURACY',
                    }
                ]
            },
            'inputs': [
                {
                    'dataset_id': 'iris_dataset_1',
                    'targets': [
                        {
                            'target_index': 0,
                            'resource_id': 'learningData',
                            'column_index': 5,
                            'column_name': 'species',
                        }
                    ]
                }
            ],
        })

        pipeline_run.validate_problem(problem_description.to_json_structure(canonical=True))
        problem.PROBLEM_SCHEMA_VALIDATOR.validate(problem_description.to_json_structure(canonical=True))