def test_basic(self):
    dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'image_dataset_1', 'datasetDoc.json'))

    dataset = container.Dataset.load(utils.path_to_uri(dataset_doc_path))

    dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
    dataframe_primitive = DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults().replace({'dataframe_resource': '0'}))
    dataframe = dataframe_primitive.produce(inputs=dataset).value

    image_hyperparams_class = DummyImageReaderPrimitive.metadata.get_hyperparams()
    image_primitive = DummyImageReaderPrimitive(hyperparams=image_hyperparams_class.defaults().replace({'return_result': 'replace'}))
    images_names = image_primitive.produce(inputs=dataframe).value

    self.assertEqual(images_names.iloc[0]['filename'][0], '001_HandPhoto_left_01.jpg')
    self.assertEqual(images_names.iloc[1]['filename'][0], 'cifar10_bird_1.png')
    self.assertEqual(images_names.iloc[2]['filename'][0], 'cifar10_bird_2.png')
    self.assertEqual(images_names.iloc[3]['filename'][0], 'mnist_0_2.png')
    self.assertEqual(images_names.iloc[4]['filename'][0], 'mnist_1_1.png')

    self._test_metadata(images_names.metadata)
def test_save_d3m_problem(self):
    self.maxDiff = None

    problem_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'problems', 'iris_problem_1', 'problemDoc.json'))
    problem_uri = utils.path_to_uri(problem_doc_path)
    problem_description = problem.Problem.load(problem_uri)

    problem_path = os.path.join(os.path.abspath(self.test_dir), 'problem', 'problemDoc.json')
    saved_problem_uri = utils.path_to_uri(problem_path)
    problem_description.save(saved_problem_uri)

    saved_problem_description = problem.Problem.load(saved_problem_uri)

    original = problem_description.to_simple_structure()
    saved = saved_problem_description.to_simple_structure()

    del original['location_uris']
    del saved['location_uris']

    self.assertEqual(original, saved)
def save_container(container: typing.Any, output_dir: str) -> None:
    # Saving data.
    if isinstance(container, container_module.Dataset):
        dataset_root_metadata = container.metadata.query(())

        missing_metadata: typing.Dict = {}
        for d3m_path, (dataset_path, required) in dataset_module.D3M_TO_DATASET_FIELDS.items():
            if not required:
                continue

            if utils.get_dict_path(dataset_root_metadata, dataset_path) is None:
                # TODO: Use some better value instead of this random value?
                utils.set_dict_path(missing_metadata, dataset_path, str(uuid.uuid4()))

        if missing_metadata:
            container = container.copy()
            container.metadata = container.metadata.update((), missing_metadata)

        # Dataset saver creates any missing directories.
        dataset_uri = utils.path_to_uri(os.path.abspath(os.path.join(output_dir, 'datasetDoc.json')))
        container.save(dataset_uri)
    else:
        # We do not want to override anything.
        os.makedirs(output_dir, exist_ok=False)
        dataframe_path = os.path.join(output_dir, 'data.csv')

        if isinstance(container, container_module.DataFrame):
            container.to_csv(dataframe_path)
        elif isinstance(container, (container_module.List, container_module.ndarray)):
            container = container_module.DataFrame(container)
            container.to_csv(dataframe_path)
        else:
            raise exceptions.NotSupportedError(
                "Value with type '{value_type}' cannot be saved as a container type.".format(value_type=type(container)),
            )

    # Saving metadata. This is just for debugging purposes, so we are
    # using "to_json_structure" and not "to_internal_json_structure".
    input_metadata = container.metadata.to_json_structure()
    metadata_path = os.path.join(output_dir, 'metadata.json')
    with open(metadata_path, 'w') as outfile:
        json.dump(input_metadata, outfile, indent=2, sort_keys=True, allow_nan=False)
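# A minimal usage sketch for "save_container". The helper name and output directory below are
# hypothetical, not part of the original code; "container_module" is the d3m container namespace
# already used above. Non-Dataset values end up as "data.csv" plus a "metadata.json" in the
# given directory, while a Dataset would be saved through its own saver as "datasetDoc.json".
def _example_save_container(output_dir: str = '/tmp/pipeline_output/predictions') -> None:
    # Build a small predictions DataFrame with generated metadata and save it.
    predictions = container_module.DataFrame({'d3mIndex': [0, 1, 2], 'value': [0.1, 0.2, 0.3]}, generate_metadata=True)
    save_container(predictions, output_dir)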
def test_conversion(self):
    problem_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'problems', 'iris_problem_1', 'problemDoc.json'))
    problem_uri = utils.path_to_uri(problem_doc_path)

    problem_description = problem.Problem.load(problem_uri)

    self.assertEqual(problem_description.to_simple_structure(), problem.Problem.from_json_structure(problem_description.to_json_structure(), strict_digest=True).to_simple_structure())

    # Legacy.
    self.assertEqual(utils.to_json_structure(problem_description.to_simple_structure()), problem.Problem.from_json_structure(utils.to_json_structure(problem_description.to_simple_structure()), strict_digest=True).to_simple_structure())

    self.assertIs(problem.Problem.from_json_structure(problem_description.to_json_structure(), strict_digest=True)['problem']['task_keywords'][0], problem.TaskKeyword.CLASSIFICATION)
def test_regression(self):
    dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json'))

    dataset = container.Dataset.load(utils.path_to_uri(dataset_doc_path))

    # We set semantic types like runtime would.
    dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
    dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')

    random = numpy.random.RandomState(42)

    # Create a synthetic prediction DataFrame.
    d3mIndex = dataset['learningData'].iloc[:, 0].astype(int)
    value = random.randn(len(d3mIndex))
    predictions = container.DataFrame({'d3mIndex': d3mIndex, 'value': value}, generate_metadata=True)
    shuffled_predictions = predictions.reindex(random.permutation(predictions.index)).reset_index(drop=True)

    hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams()
    metrics_class = hyperparams_class.configuration['metrics'].elements
    primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
        'metrics': [metrics_class({
            'metric': 'MEAN_SQUARED_ERROR',
            'pos_label': None,
            'k': None,
        }), metrics_class({
            'metric': 'ROOT_MEAN_SQUARED_ERROR',
            'pos_label': None,
            'k': None,
        }), metrics_class({
            'metric': 'MEAN_ABSOLUTE_ERROR',
            'pos_label': None,
            'k': None,
        }), metrics_class({
            'metric': 'R_SQUARED',
            'pos_label': None,
            'k': None,
        })],
    }))

    for name, pred in zip(['predictions', 'shuffled_predictions'], [predictions, shuffled_predictions]):
        scores = primitive.produce(inputs=pred, score_dataset=dataset).value

        self.assertEqual(scores.values.tolist(), [
            ['MEAN_SQUARED_ERROR', 3112.184932446708, 0.08521485450672399],
            ['ROOT_MEAN_SQUARED_ERROR', 55.786960236660214, 0.9721137517700256],
            ['MEAN_ABSOLUTE_ERROR', 54.579668078204385, 0.9727169385086356],
            ['R_SQUARED', -22.62418041588221, 0.9881884591239001],
        ], name)

        self.assertEqual(scores.metadata.query_column(0)['name'], 'metric', name)
        self.assertEqual(scores.metadata.query_column(1)['name'], 'value', name)
def test_multivariate(self):
    dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'multivariate_dataset_1', 'datasetDoc.json'))

    dataset = container.Dataset.load(utils.path_to_uri(dataset_doc_path))

    # We set semantic types like runtime would.
    dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/Target')
    dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/Attribute')
    dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Target')
    dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Attribute')

    random = numpy.random.RandomState(42)

    # Create a synthetic prediction DataFrame.
    d3mIndex = dataset['learningData'].iloc[:, 0].astype(int)
    amplitude = random.randn(len(d3mIndex))
    lengthscale = random.randn(len(d3mIndex))
    predictions = container.DataFrame({'d3mIndex': d3mIndex, 'amplitude': amplitude, 'lengthscale': lengthscale}, generate_metadata=True)
    shuffled_predictions = predictions.reindex(random.permutation(predictions.index)).reset_index(drop=True)

    hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams()
    metrics_class = hyperparams_class.configuration['metrics'].elements
    primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
        'metrics': [metrics_class({
            'metric': 'MEAN_SQUARED_ERROR',
            'pos_label': None,
            'k': None,
        }), metrics_class({
            'metric': 'ROOT_MEAN_SQUARED_ERROR',
            'pos_label': None,
            'k': None,
        }), metrics_class({
            'metric': 'MEAN_ABSOLUTE_ERROR',
            'pos_label': None,
            'k': None,
        })],
    }))

    for name, pred in zip(['predictions', 'shuffled_predictions'], [predictions, shuffled_predictions]):
        scores = primitive.produce(inputs=pred, score_dataset=dataset).value

        self.assertEqual(scores.values.tolist(), [
            ['MEAN_SQUARED_ERROR', 1.7627871219522482, 0.9991186066672619],
            ['ROOT_MEAN_SQUARED_ERROR', 1.3243591896125282, 0.9993378205019783],
            ['MEAN_ABSOLUTE_ERROR', 1.043095768817859, 0.9994784521628801],
        ], name)

        self.assertEqual(scores.metadata.query_column(0)['name'], 'metric', name)
        self.assertEqual(scores.metadata.query_column(1)['name'], 'value', name)
def test_object_detection_just_bounding_polygon(self):
    dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'object_dataset_1', 'datasetDoc.json'))

    dataset = container.Dataset.load(utils.path_to_uri(dataset_doc_path))

    # We set semantic types like runtime would.
    dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Target')
    dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Attribute')

    random = numpy.random.RandomState(42)

    # Create a synthetic prediction DataFrame.
    predictions = container.DataFrame([
        [0, '330,463,330,505,387,505,387,463', 0.0739],
        [0, '420,433,420,498,451,498,451,433', 0.091],
        [0, '328,465,328,540,403,540,403,465', 0.1008],
        [0, '480,477,480,522,508,522,508,477', 0.1012],
        [0, '357,460,357,537,417,537,417,460', 0.1058],
        [0, '356,456,356,521,391,521,391,456', 0.0843],
        [1, '345,460,345,547,415,547,415,460', 0.0539],
        [1, '381,362,381,513,455,513,455,362', 0.0542],
        [1, '382,366,382,422,416,422,416,366', 0.0559],
        [1, '730,463,730,583,763,583,763,463', 0.0588],
    ], columns=['d3mIndex', 'bounding_polygon_area', 'confidence'], generate_metadata=True)
    shuffled_predictions = predictions.reindex(random.permutation(predictions.index)).reset_index(drop=True)

    hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams()
    metrics_class = hyperparams_class.configuration['metrics'].elements
    primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
        'metrics': [metrics_class({
            'metric': 'OBJECT_DETECTION_AVERAGE_PRECISION',
            'pos_label': None,
            'k': None,
        })],
    }))

    for name, pred in zip(['predictions', 'shuffled_predictions'], [predictions, shuffled_predictions]):
        scores = primitive.produce(inputs=pred, score_dataset=dataset).value

        self.assertEqual(scores.values.tolist(), [
            ['OBJECT_DETECTION_AVERAGE_PRECISION', 0.125, 0.125],
        ], name)

        self.assertEqual(scores.metadata.query_column(0)['name'], 'metric')
        self.assertEqual(scores.metadata.query_column(1)['name'], 'value')
        self.assertEqual(scores.metadata.query_column(2)['name'], 'normalized')
def test_classification(self):
    dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))

    dataset = container.Dataset.load(utils.path_to_uri(dataset_doc_path))

    # We set semantic types like runtime would.
    dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
    dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')

    random = numpy.random.RandomState(42)

    # Create a synthetic prediction DataFrame.
    d3mIndex = dataset['learningData'].iloc[:, 0].astype(int)
    species = random.choice(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], len(d3mIndex))
    predictions = container.DataFrame({'d3mIndex': d3mIndex, 'species': species}, generate_metadata=True)
    shuffled_predictions = predictions.reindex(random.permutation(predictions.index)).reset_index(drop=True)

    hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams()
    metrics_class = hyperparams_class.configuration['metrics'].elements
    primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
        'metrics': [metrics_class({
            'metric': 'ACCURACY',
            'pos_label': None,
            'k': None,
        }), metrics_class({
            'metric': 'F1_MICRO',
            'pos_label': None,
            'k': None,
        }), metrics_class({
            'metric': 'F1_MACRO',
            'pos_label': None,
            'k': None,
        })],
    }))

    for name, pred in zip(['predictions', 'shuffled_predictions'], [predictions, shuffled_predictions]):
        scores = primitive.produce(inputs=pred, score_dataset=dataset).value

        self.assertEqual(scores.values.tolist(), [
            ['ACCURACY', 0.4066666666666667, 0.4066666666666667],
            ['F1_MICRO', 0.4066666666666667, 0.4066666666666667],
            ['F1_MACRO', 0.4051068540623797, 0.4051068540623797],
        ], name)

        self.assertEqual(scores.metadata.query_column(0)['name'], 'metric', name)
        self.assertEqual(scores.metadata.query_column(1)['name'], 'value', name)
def crawl_openml_task(
    datasets: typing.Dict[str, str], task_id: int, save_dir: str, *,
    data_pipeline: pipeline_module.Pipeline, data_params: typing.Dict[str, str] = None,
    context: metadata_base.Context, random_seed: int = 0, volumes_dir: str = None,
    scratch_dir: str = None, runtime_environment: pipeline_run_module.RuntimeEnvironment = None,
    dataset_resolver: typing.Callable = None, problem_resolver: typing.Callable = None,
    compute_digest: dataset_module.ComputeDigest = dataset_module.ComputeDigest.ONLY_IF_MISSING,
    strict_digest: bool = False,
) -> None:
    """
    Crawls an OpenML task and the corresponding dataset, does the split using a data
    preparation pipeline, and stores the splits as a D3M dataset and problem description.

    Parameters
    ----------
    datasets:
        A mapping between known dataset IDs and their paths. Is updated in-place.
    task_id:
        An integer representing an OpenML task ID to crawl and convert.
    save_dir:
        A directory where to save datasets and problems.
    data_pipeline:
        A data preparation pipeline used for splitting.
    data_params:
        A dictionary that contains the hyper-parameters for the data preparation pipeline.
    context:
        In which context to run pipelines.
    random_seed:
        A random seed to use for every run. This controls all randomness during the run.
    volumes_dir:
        Path to a directory with static files required by primitives.
    scratch_dir:
        Path to a directory to store any temporary files needed during execution.
    runtime_environment:
        A description of the runtime environment.
    dataset_resolver:
        A dataset resolver to use.
    problem_resolver:
        A problem description resolver to use.
    compute_digest:
        Compute a digest over the data?
    strict_digest:
        If computed digest does not match the one provided in metadata, raise an exception?
    """

    if dataset_resolver is None:
        dataset_resolver = dataset_module.get_dataset
    if problem_resolver is None:
        problem_resolver = problem_module.get_problem

    number_of_folds = runtime._get_number_of_folds(data_params)
    assert number_of_folds != 0

    problem_uri = f'https://www.openml.org/t/{task_id}'
    problem_description = problem_resolver(problem_uri, strict_digest=strict_digest)

    if len(problem_description['inputs']) != 1:
        raise exceptions.NotSupportedError("OpenML problem descriptions with multiple inputs are not supported.")

    problem_description_input = problem_description['inputs'][0]
    input_dataset_id = problem_description_input['dataset_id']

    known_datasets_set = set(datasets.keys())
    needed_splits_set = set()

    # We make sure when splitting that the output dataset has the same ID as the input dataset
    # with an additional suffix for the split type, and we take advantage of this here.
    # The naming scheme matches "runtime._get_split_dataset_id".
    if number_of_folds == 1:
        needed_splits_set.add(f'{input_dataset_id}_TRAIN')
        needed_splits_set.add(f'{input_dataset_id}_TEST')
        needed_splits_set.add(f'{input_dataset_id}_SCORE')

        dataset_view_maps = [{
            'train': [
                {
                    'from': input_dataset_id,
                    'to': f'{input_dataset_id}_TRAIN',
                },
            ],
            'test': [
                {
                    'from': input_dataset_id,
                    'to': f'{input_dataset_id}_TEST',
                },
            ],
            'score': [
                {
                    'from': input_dataset_id,
                    'to': f'{input_dataset_id}_SCORE',
                },
            ],
        }]
    else:
        dataset_view_maps = []
        for fold_index in range(number_of_folds):
            needed_splits_set.add(f'{input_dataset_id}_FOLD_{fold_index}_TRAIN')
            needed_splits_set.add(f'{input_dataset_id}_FOLD_{fold_index}_TEST')
            needed_splits_set.add(f'{input_dataset_id}_FOLD_{fold_index}_SCORE')

            dataset_view_maps.append({
                'train': [
                    {
                        'from': input_dataset_id,
                        'to': f'{input_dataset_id}_FOLD_{fold_index}_TRAIN',
                    },
                ],
                'test': [
                    {
                        'from': input_dataset_id,
                        'to': f'{input_dataset_id}_FOLD_{fold_index}_TEST',
                    },
                ],
                'score': [
                    {
                        'from': input_dataset_id,
                        'to': f'{input_dataset_id}_FOLD_{fold_index}_SCORE',
                    },
                ],
            })

    # We already have this split, we can just reuse it.
    if problem_description_input['dataset_id'] in known_datasets_set and needed_splits_set <= known_datasets_set:
        logger.debug("Copying existing splits.")

        # Copy splits.
        if number_of_folds == 1:
            view_maps = dataset_view_maps[0]
            for split_type in ['train', 'test', 'score']:
                shutil.copytree(
                    os.path.dirname(datasets[runtime._get_dataset_id_from_view_maps(view_maps, split_type, input_dataset_id)]),
                    os.path.join(save_dir, split_type.upper(), f'dataset_{split_type.upper()}'),
                )

                # Save problem description for the split. We do not copy because we copy only datasets.
                problem_path = os.path.abspath(os.path.join(save_dir, split_type.upper(), f'problem_{split_type.upper()}', 'problemDoc.json'))
                runtime._save_problem_description(problem_description, problem_path, dataset_view_maps=view_maps)
        else:
            for fold_index, view_maps in enumerate(dataset_view_maps):
                for split_type in ['train', 'test', 'score']:
                    shutil.copytree(
                        os.path.dirname(datasets[runtime._get_dataset_id_from_view_maps(view_maps, split_type, input_dataset_id)]),
                        os.path.join(save_dir, 'folds', str(fold_index), split_type.upper(), f'dataset_{split_type.upper()}'),
                    )

                    # Save problem description for the split. We do not copy because we copy only datasets.
                    problem_path = os.path.abspath(os.path.join(save_dir, 'folds', str(fold_index), split_type.upper(), f'problem_{split_type.upper()}', 'problemDoc.json'))
                    runtime._save_problem_description(problem_description, problem_path, dataset_view_maps=view_maps)

        # Copy data preparation pipeline run pickle.
        shutil.copy2(
            os.path.join(os.path.dirname(datasets[input_dataset_id]), '..', runtime.DATA_PIPELINE_RUN_FILENAME),
            os.path.join(save_dir, runtime.DATA_PIPELINE_RUN_FILENAME),
        )

        # Copy full dataset.
        shutil.copytree(
            os.path.dirname(datasets[input_dataset_id]),
            os.path.join(save_dir, input_dataset_id),
        )
    else:
        logger.debug("Running a data preparation pipeline.")

        openml_dataset_id = int(input_dataset_id.split('_')[-1])
        dataset_uri = f'https://www.openml.org/d/{openml_dataset_id}'
        dataset = dataset_resolver(
            dataset_uri,
            compute_digest=compute_digest,
            strict_digest=strict_digest,
        )

        dataset_id = dataset.metadata.query_field((), 'id')

        if input_dataset_id != dataset_id:
            raise exceptions.InvalidDatasetError(
                f"Loaded dataset (\"{dataset_id}\") does not have the expected dataset ID (\"{input_dataset_id}\").",
            )

        # Make splits and save them. This saves the pipeline run made by the data preparation pipeline, too.
        runtime.prepare_data_and_save(
            save_dir=save_dir,
            inputs=[dataset],
            data_pipeline=data_pipeline,
            problem_description=problem_description,
            data_params=data_params,
            context=context,
            random_seed=random_seed,
            volumes_dir=volumes_dir,
            scratch_dir=scratch_dir,
            runtime_environment=runtime_environment,
            # We provide "dataset_view_maps" to force split dataset IDs.
            dataset_view_maps=dataset_view_maps,
        )

        # Save full dataset.
        dataset_path = os.path.abspath(os.path.join(save_dir, dataset_id, 'datasetDoc.json'))
        dataset_uri = utils.path_to_uri(dataset_path)
        dataset.save(dataset_uri)

        # Updating known datasets.
        datasets[dataset_id] = dataset_path

        # We make sure when splitting that the output dataset has the same ID as the input dataset
        # with an additional suffix for the split type, and we take advantage of this here.
        # The naming scheme matches "runtime._get_split_dataset_id".
        if number_of_folds == 1:
            for split_type in ['TRAIN', 'TEST', 'SCORE']:
                datasets[f'{dataset_id}_{split_type}'] = os.path.join(save_dir, split_type, f'dataset_{split_type}', 'datasetDoc.json')
        else:
            for fold_index in range(number_of_folds):
                for split_type in ['TRAIN', 'TEST', 'SCORE']:
                    datasets[f'{dataset_id}_FOLD_{fold_index}_{split_type}'] = os.path.join(save_dir, 'folds', str(fold_index), split_type, f'dataset_{split_type}', 'datasetDoc.json')

    # Save problem description. For splits, the problem description is saved by "runtime.prepare_data_and_save".
    problem_path = os.path.abspath(os.path.join(save_dir, problem_description['id'], 'problemDoc.json'))
    # We do not save "dataset_view_maps" for this problem description.
    runtime._save_problem_description(problem_description, problem_path)
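# A minimal usage sketch for "crawl_openml_task", assuming a data preparation (splitting)
# pipeline has already been loaded into "data_pipeline" elsewhere. The helper name, task ID,
# output directory, and "data_params" keys below are illustrative assumptions, not taken from
# the original code.
def _example_crawl_openml_task(data_pipeline: pipeline_module.Pipeline) -> typing.Dict[str, str]:
    datasets: typing.Dict[str, str] = {}
    crawl_openml_task(
        datasets,
        task_id=59,                      # hypothetical OpenML task ID
        save_dir='/tmp/openml_crawl',    # hypothetical output directory
        data_pipeline=data_pipeline,
        data_params={'number_of_folds': '1'},  # assumption: hyper-params understood by the splitting pipeline
        context=metadata_base.Context.TESTING,
    )
    # "datasets" is updated in-place, mapping the full dataset ID and its split IDs
    # to their "datasetDoc.json" paths.
    return datasets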
def test_basic(self):
    self.maxDiff = None

    problem_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'problems', 'iris_problem_1', 'problemDoc.json'))
    problem_uri = utils.path_to_uri(problem_doc_path)

    problem_description = problem.Problem.load(problem_uri)

    self.assertEqual(problem_description.to_simple_structure(), {
        'id': 'iris_problem_1',
        'digest': '1a12135422967aa0de0c4629f4f58d08d39e97f9133f7b50da71420781aa18a5',
        'version': '4.0.0',
        'location_uris': [
            problem_uri,
        ],
        'name': 'Distinguish Iris flowers',
        'description': 'Distinguish Iris flowers of three related species.',
        'schema': problem.PROBLEM_SCHEMA_VERSION,
        'problem': {
            'task_keywords': [problem.TaskKeyword.CLASSIFICATION, problem.TaskKeyword.MULTICLASS],
            'performance_metrics': [
                {
                    'metric': problem.PerformanceMetric.ACCURACY,
                },
            ],
        },
        'inputs': [
            {
                'dataset_id': 'iris_dataset_1',
                'targets': [
                    {
                        'target_index': 0,
                        'resource_id': 'learningData',
                        'column_index': 5,
                        'column_name': 'species',
                    },
                ],
            },
        ],
    })

    self.assertEqual(problem_description.to_json_structure(), {
        'id': 'iris_problem_1',
        'digest': '1a12135422967aa0de0c4629f4f58d08d39e97f9133f7b50da71420781aa18a5',
        'version': '4.0.0',
        'location_uris': [
            problem_uri,
        ],
        'name': 'Distinguish Iris flowers',
        'description': 'Distinguish Iris flowers of three related species.',
        'schema': problem.PROBLEM_SCHEMA_VERSION,
        'problem': {
            'task_keywords': [problem.TaskKeyword.CLASSIFICATION, problem.TaskKeyword.MULTICLASS],
            'performance_metrics': [
                {
                    'metric': problem.PerformanceMetric.ACCURACY,
                },
            ],
        },
        'inputs': [
            {
                'dataset_id': 'iris_dataset_1',
                'targets': [
                    {
                        'target_index': 0,
                        'resource_id': 'learningData',
                        'column_index': 5,
                        'column_name': 'species',
                    },
                ],
            },
        ],
    })

    self.assertEqual(problem_description.to_json_structure(), {
        'id': 'iris_problem_1',
        'digest': '1a12135422967aa0de0c4629f4f58d08d39e97f9133f7b50da71420781aa18a5',
        'version': '4.0.0',
        'location_uris': [
            problem_uri,
        ],
        'name': 'Distinguish Iris flowers',
        'description': 'Distinguish Iris flowers of three related species.',
        'schema': problem.PROBLEM_SCHEMA_VERSION,
        'problem': {
            'task_keywords': ['CLASSIFICATION', 'MULTICLASS'],
            'performance_metrics': [
                {
                    'metric': 'ACCURACY',
                },
            ],
        },
        'inputs': [
            {
                'dataset_id': 'iris_dataset_1',
                'targets': [
                    {
                        'target_index': 0,
                        'resource_id': 'learningData',
                        'column_index': 5,
                        'column_name': 'species',
                    },
                ],
            },
        ],
    })

    pipeline_run.validate_problem(problem_description.to_json_structure(canonical=True))
    problem.PROBLEM_SCHEMA_VALIDATOR.validate(problem_description.to_json_structure(canonical=True))