def _get_inputs(self, problem, rinputs):
    inputs = []

    for ip in rinputs:
        dataset = None
        # Each request input either points at a D3M dataset or at a raw CSV file.
        if ip.HasField("dataset_uri"):
            dataset = D3MDatasetLoader().load(ip.dataset_uri)
        elif ip.HasField("csv_uri"):
            data = pd.read_csv(
                ip.csv_uri,
                dtype=str,
                header=0,
                na_filter=False,
                encoding='utf8',
                low_memory=False,
            )
            dataset = container.DataFrame(data)

        logging.critical("Problem %s", problem)
        if len(problem.inputs) > 0:
            # Attach target and privileged-data metadata from the problem description.
            targets = problem.inputs[0].targets
            dataset = util.add_target_metadata(dataset, targets)
            dataset = util.add_privileged_metadata(
                dataset, problem.inputs[0].privileged_data)
        inputs.append(dataset)

    return inputs
def load_data_problem(inputdir, problempath):
    print("Reading ", inputdir)
    print("Reading ", problempath)

    with open(problempath) as file:
        problem_schema = json.load(file)

    # Strip the trailing "problem_TRAIN/problemDoc.json" (29 characters) to get
    # the problem root, then point at the training dataset schema next to it.
    datasetId = problempath[:-29]
    dataset_schema = datasetId + "dataset_TRAIN/datasetDoc.json"
    problem_doc_metadata = Metadata(problem_schema)

    dataset_uri = 'file://{dataset_uri}'.format(dataset_uri=dataset_schema)
    dataset = D3MDatasetLoader().load(dataset_uri)

    problem_description = problem.parse_problem_description(problempath)
    dataset = add_target_columns_metadata(dataset, problem_description)
    dataset = add_privileged_columns_metadata(dataset, problem_description)

    taskname = get_task_name(problem_doc_metadata.query(())['about']['taskKeywords'])
    metric = problem_doc_metadata.query(())['inputs']['performanceMetrics'][0]['metric']
    posLabel = None
    if metric == "f1":
        posLabel = problem_doc_metadata.query(())['inputs']['performanceMetrics'][0]['posLabel']

    # Read the data augmentation keywords.
    keywords = getAugmentation_keywords(problem_doc_metadata)

    return (dataset, taskname, problem_description, metric, posLabel, keywords)
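# Hypothetical call, only to illustrate the layout load_data_problem expects:
# problempath must end with "problem_TRAIN/problemDoc.json" so that the slice
# above resolves to the sibling dataset_TRAIN directory. The "/input" prefix is
# a placeholder, not a path from the original code.
dataset, taskname, problem_description, metric, pos_label, keywords = load_data_problem(
    "/input", "/input/TRAIN/problem_TRAIN/problemDoc.json")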
def convert_dataset_uri_to_dataset(problem_doc, dataset_uri, mode="train"):
    # Normalize the URI: strip any existing scheme, then rebuild it from the
    # absolute path.
    if "file://" in dataset_uri:
        dataset_uri = dataset_uri[len("file://"):]
    dataset_uri = 'file://{dataset_uri}'.format(
        dataset_uri=os.path.abspath(dataset_uri))

    # Dataset
    dataset = D3MDatasetLoader()
    dataset = dataset.load(dataset_uri=dataset_uri)

    # Target metadata is only attached for the train and score splits.
    if mode in ("train", "score"):
        dataset = add_target_columns_metadata(dataset, problem_doc)

    return dataset
def load_data(data_path, problem_path) -> tuple:
    '''
    Load a D3M dataset and its problem metadata from the given paths.
    '''
    dataset = D3MDatasetLoader()
    if "file:" not in data_path:
        data_path = 'file://{dataset_path}'.format(
            dataset_path=os.path.abspath(data_path))

    with open(problem_path) as f:
        problem_doc = json.load(f)
    problem = Metadata(problem_doc)

    dataset = dataset.load(dataset_uri=data_path)
    dataset = add_target_columns_metadata(dataset, problem)

    return dataset, problem
def generate_pipeline(pipeline_path: str, dataset_path: str, problem_doc_path: str,
                      resolver: Resolver = None) -> Runtime:
    """
    Simplified interface that fits a pipeline on a dataset.

    Parameters
    ----------
    pipeline_path : str
        Path to the pipeline description (JSON or YAML).
    dataset_path : str
        Path to the datasetDoc.json.
    problem_doc_path : str
        Path to the problemDoc.json.
    resolver : Resolver
        Resolver to use.
    """
    # Pipeline description: JSON or YAML, depending on the file extension.
    if pipeline_path.endswith('.json'):
        with open(pipeline_path) as pipeline_file:
            pipeline_description = Pipeline.from_json(
                string_or_file=pipeline_file, resolver=resolver)
    else:
        with open(pipeline_path) as pipeline_file:
            pipeline_description = Pipeline.from_yaml(
                string_or_file=pipeline_file, resolver=resolver)

    # Problem doc
    problem_doc = load_problem_doc(problem_doc_path)

    # Dataset
    if 'file:' not in dataset_path:
        dataset_path = 'file://{dataset_path}'.format(
            dataset_path=os.path.abspath(dataset_path))
    dataset = D3MDatasetLoader().load(dataset_uri=dataset_path)

    # Adding target metadata to the dataset
    dataset = add_target_columns_metadata(dataset, problem_doc)

    # Building and fitting the pipeline
    pipeline_runtime = Runtime(pipeline_description)
    pipeline_runtime.fit(inputs=[dataset])

    return pipeline_runtime
def load_test_dataset_for_pipeline(config_path) -> tuple:
    '''
    Load and return the test dataset and test problem described by the
    config file test_config.json.
    '''
    test_config_path = os.path.join(config_path, "test_config.json")
    with open(test_config_path, "r") as f:
        test_config = json.load(f)

    data_path = test_config["dataset_schema"]
    problem_path = test_config["problem_schema"]

    dataset = D3MDatasetLoader()
    if "file:" not in data_path:
        data_path = 'file://{dataset_path}'.format(
            dataset_path=os.path.abspath(data_path))

    with open(problem_path) as f:
        problem_doc = json.load(f)
    problem = Metadata(problem_doc)

    dataset = dataset.load(dataset_uri=data_path)
    dataset = add_target_columns_metadata(dataset, problem)

    return dataset, problem
def test_pipeline(pipeline_runtime: Runtime, dataset_path: str) -> typing.List:
    """
    Simplified interface to test a fitted pipeline on a dataset.

    Parameters
    ----------
    pipeline_runtime : Runtime
        Runtime object of an already fitted pipeline.
    dataset_path : str
        Path to the datasetDoc.json.
    """
    # Dataset
    if 'file:' not in dataset_path:
        dataset_path = 'file://{dataset_path}'.format(
            dataset_path=os.path.abspath(dataset_path))
    dataset = D3MDatasetLoader().load(dataset_uri=dataset_path)

    return pipeline_runtime.produce(inputs=[dataset])
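# A minimal usage sketch for the two helpers above. The pipeline and dataset
# paths are placeholders, not paths taken from the original code.
fitted_runtime = generate_pipeline(
    pipeline_path="pipeline.json",
    dataset_path="TRAIN/dataset_TRAIN/datasetDoc.json",
    problem_doc_path="TRAIN/problem_TRAIN/problemDoc.json",
)
predictions = test_pipeline(fitted_runtime, "TEST/dataset_TEST/datasetDoc.json")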
def load_data_problem(inputdir, problempath):
    print("Reading ", inputdir)
    print("Reading ", problempath)

    with open(problempath) as file:
        problem_schema = json.load(file)

    datasetId = problempath[:-29]
    dataset_schema = datasetId + "dataset_TRAIN/datasetDoc.json"
    problem_doc_metadata = Metadata(problem_schema)

    dataset_uri = 'file://{dataset_uri}'.format(dataset_uri=dataset_schema)
    dataset = D3MDatasetLoader().load(dataset_uri)

    problem_description = problem.parse_problem_description(problempath)
    dataset = add_target_columns_metadata(dataset, problem_description)
    taskname = problem_doc_metadata.query(())['about']['taskType']

    return (dataset, taskname, problem_description)
def load_d3m_dataset(path) -> typing.Optional[d3m_Dataset]:
    """
    Load a D3M dataset by its dataset id, searching the known dataset paths.
    """
    logger.debug("Trying to load dataset " + str(path))

    # Create a dict that maps each dataset id to the directory containing it.
    datasets_list = dict()
    for each_path in dataset_paths:
        temp = os.listdir(each_path)
        for each in temp:
            datasets_list[each] = each_path

    if path not in datasets_list:
        return None

    loader = D3MDatasetLoader()
    dataset_path = os.path.join(datasets_list[path], path, path + "_dataset", "datasetDoc.json")
    json_file = os.path.abspath(dataset_path)
    all_dataset_uri = 'file://{}'.format(json_file)
    all_dataset = loader.load(dataset_uri=all_dataset_uri)

    logger.debug("Load " + str(path) + " success!")
    return all_dataset
def load_data_from_dir(inputdir, mode="train"):
    """
    Return the problem doc and dataset found in the given input directory.
    """
    assert mode in ["train", "test", "score"]

    if mode == "train":
        problemdir = "%s/TRAIN/problem_TRAIN" % inputdir
        datasetdir = "%s/TRAIN/dataset_TRAIN" % inputdir
    elif mode == "test":
        problemdir = "%s/TEST/problem_TEST" % inputdir
        datasetdir = "%s/TEST/dataset_TEST" % inputdir
    else:  # mode == "score"
        problemdir = "%s/SCORE/problem_TEST" % inputdir
        datasetdir = "%s/SCORE/dataset_TEST" % inputdir

    problem_doc_uri = "%s/problemDoc.json" % problemdir
    dataset_uri = "%s/datasetDoc.json" % datasetdir

    # Problem doc loading
    problem_doc = load_problem_doc(problem_doc_uri)

    # Dataset loading
    if 'file:' not in dataset_uri:
        dataset_uri = 'file://{dataset_uri}'.format(
            dataset_uri=os.path.abspath(dataset_uri))
    dataset = D3MDatasetLoader()
    dataset = dataset.load(dataset_uri=dataset_uri)

    # Target metadata is only attached for the train and score splits.
    if mode == "train" or mode == "score":
        dataset = add_target_columns_metadata(dataset, problem_doc)

    return problem_doc, dataset
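# Illustrative calls; "/input" is a placeholder for a directory laid out with
# the TRAIN/TEST/SCORE subdirectories that load_data_from_dir expects.
train_problem_doc, train_dataset = load_data_from_dir("/input", mode="train")
test_problem_doc, test_dataset = load_data_from_dir("/input", mode="test")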
h_DE = hyper_DE.defaults()  # assumed: Denormalize hyperparam defaults, mirroring h_DD; not defined in the original snippet
h_DD = hyper_DD.defaults()
h_attr = {
    'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Attribute',),
    'use_columns': (),
    'exclude_columns': (),
}
h_target = {
    'semantic_types': (
        'https://metadata.datadrivendiscovery.org/types/Target',
        'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
    ),
    'use_columns': (),
    'exclude_columns': (),
}

primitive_0 = Denormalize(hyperparams=h_DE)
primitive_1 = DatasetToDataFramePrimitive(hyperparams=h_DD)
primitive_3 = ExtractColumnsBySemanticTypesPrimitive(hyperparams=h_attr)
primitive_4 = ExtractColumnsBySemanticTypesPrimitive(hyperparams=h_target)

# global variables
dataset_file_path = "dsbox/unit_tests/resources/38_sick_data/datasetDoc.json"
dataset = D3MDatasetLoader()
dataset = dataset.load('file://{dataset_doc_path}'.format(
    dataset_doc_path=os.path.abspath(dataset_file_path)))

result0 = primitive_0.produce(inputs=dataset)
result1 = primitive_1.produce(inputs=result0.value)
X = primitive_3.produce(inputs=result1.value).value
Y = primitive_4.produce(inputs=result1.value).value
hp = GreedyHyperparameter.sample()

import unittest


class TestGreedy(unittest.TestCase):
    def setUp(self):
# initialize
from datamart.entries import Datamart
from d3m.container.dataset import Dataset, D3MDatasetLoader
from common_primitives.denormalize import Hyperparams as hyper_denormalize, DenormalizePrimitive
from d3m.base import utils as d3m_utils
import os
import pandas as pd

# Load the ISI Datamart; the URL currently lives here and may change in the future.
isi_datamart_url = "http://dsbox02.isi.edu:9999/blazegraph/namespace/datamart3/sparql"
a = Datamart(connection_url=isi_datamart_url)

# Load the D3M dataset. Here we use "DA_poverty_estimation" as an example;
# please change the path to your own dataset.
loader = D3MDatasetLoader()
path = "/Users/minazuki/Desktop/studies/master/2018Summer/data/datasets/seed_datasets_data_augmentation/DA_poverty_estimation/TRAIN/dataset_TRAIN/datasetDoc.json"
json_file = os.path.abspath(path)
all_dataset_uri = 'file://{}'.format(json_file)
all_dataset = loader.load(dataset_uri=all_dataset_uri)

# Run the denormalize primitive.
denormalize_hyperparams = hyper_denormalize.defaults()
denormalize_primitive = DenormalizePrimitive(hyperparams=denormalize_hyperparams)
all_dataset = denormalize_primitive.produce(inputs=all_dataset).value

"""
Start the search by running the search-with-data function. Because this dataset
has no columns with the "Text" semantic type, the system will report that no
columns can be augmented.
"""
search_res = a.search_with_data(query=None, supplied_data=all_dataset)
"""
import os

from d3m.container.dataset import Dataset, D3MDatasetLoader
from d3m.metadata import base as metadata_base, pipeline as pipeline_module, problem
from d3m.runtime import Runtime
from d3m import index

problem_path = 'problem1/problemDoc.json'
dataset_train_path = 'problem1/train/datasetDoc.json'
dataset_predict_path = 'problem1/predict/datasetDoc.json'
pipeline_path = 'pipeline.json'

# Loading the problem description.
problem_description = problem.parse_problem_description(problem_path)

# Loading the training and prediction datasets.
path = 'file://{uri}'.format(uri=os.path.abspath(dataset_train_path))
dataset = D3MDatasetLoader().load(dataset_uri=path)

path2 = 'file://{uri}'.format(uri=os.path.abspath(dataset_predict_path))
dataset_predict = D3MDatasetLoader().load(dataset_uri=path2)

# Loading the pipeline description file.
with open(pipeline_path, 'r') as file:
    pipeline_description = pipeline_module.Pipeline.from_json(
        string_or_file=file)

# Creating a runtime instance from the pipeline description and problem description.
runtime = Runtime(pipeline=pipeline_description,
                  problem_description=problem_description,
                  context=metadata_base.Context.TESTING)

# Fitting pipeline on input dataset.
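# The original snippet stops at the fitting comment; a plausible continuation,
# following the same Runtime API used in the snippets above, would be:
fit_result = runtime.fit(inputs=[dataset])
fit_result.check_success()

# Producing predictions on the predict split.
produce_result = runtime.produce(inputs=[dataset_predict])
produce_result.check_success()
print(produce_result.values)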