Example #1
    def _get_inputs(self, problem, rinputs):
        inputs = []

        for ip in rinputs:
            dataset = None
            if ip.HasField("dataset_uri") == True:
                dataset = D3MDatasetLoader().load(ip.dataset_uri)
            elif ip.HasField("csv_uri") == True:
                data = pd.read_csv(
                    ip.csv_uri,
                    dtype=str,
                    header=0,
                    na_filter=False,
                    encoding='utf8',
                    low_memory=False,
                )
                dataset = container.DataFrame(data)

            logging.critical("Problem %s", problem)
            if len(problem.inputs) > 0:
                targets = problem.inputs[0].targets
                dataset = util.add_target_metadata(dataset, targets)
                dataset = util.add_privileged_metadata(
                    dataset, problem.inputs[0].privileged_data)
            inputs.append(dataset)

        return inputs
Example #2
def load_data_problem(inputdir, problempath):
    print("Reading ", inputdir)
    print("Reading ", problempath)

    with open(problempath) as file:
        problem_schema = json.load(file)

    #filename = "scores.csv"
    #with open(filename, "a") as g:
    #    g.write(inputdir + "\n")
 
    # problempath ends with "problem_TRAIN/problemDoc.json" (29 characters); strip it to get the TRAIN split root
    datasetId = problempath[:-29]
    dataset_schema = datasetId + "dataset_TRAIN/datasetDoc.json"
    problem_doc_metadata = Metadata(problem_schema)
    dataset_uri = 'file://{dataset_uri}'.format(dataset_uri=dataset_schema)
    dataset = D3MDatasetLoader().load(dataset_uri)

    problem_description = problem.parse_problem_description(problempath)
    dataset = add_target_columns_metadata(dataset, problem_description)
    dataset = add_privileged_columns_metadata(dataset, problem_description)
    taskname = get_task_name(problem_doc_metadata.query(())['about']['taskKeywords'])
    metric = problem_doc_metadata.query(())['inputs']['performanceMetrics'][0]['metric']
    posLabel = None
    if metric == "f1":
        posLabel = problem_doc_metadata.query(())['inputs']['performanceMetrics'][0]['posLabel']

    # Read the data augmentation
    keywords = getAugmentation_keywords(problem_doc_metadata)
  
    return (dataset, taskname, problem_description, metric, posLabel, keywords)
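A minimal usage sketch for load_data_problem. The paths below are placeholders and assume the standard D3M TRAIN layout (problem_TRAIN/problemDoc.json alongside dataset_TRAIN/):

# Placeholder paths; point these at a real D3M seed dataset.
inputdir = "/inputs/185_baseball/TRAIN"
problempath = inputdir + "/problem_TRAIN/problemDoc.json"

dataset, taskname, problem_description, metric, pos_label, keywords = \
    load_data_problem(inputdir, problempath)
print(taskname, metric, pos_label)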
def convert_dataset_uri_to_dataset(problem_doc, dataset_uri, mode="train"):
    if "file://" in dataset_uri:
        dataset_uri = dataset_uri[len("file://"):]

    # Dataset
    dataset_uri = 'file://{dataset_uri}'.format(
        dataset_uri=os.path.abspath(dataset_uri))
    dataset = D3MDatasetLoader()
    dataset = dataset.load(dataset_uri=dataset_uri)
    if mode == "train" or mode == "score":
        dataset = add_target_columns_metadata(dataset, problem_doc)
    return dataset
Example #4
def load_data(data_path, problem_path) -> tuple:
    '''
    Load the dataset and problem doc metadata.
    '''
    dataset = D3MDatasetLoader()
    if "file:" not in data_path:
        data_path = 'file://{dataset_path}'.format(
            dataset_path=os.path.abspath(data_path))
    with open(problem_path) as f:
        problem_doc = json.load(f)
        problem = Metadata(problem_doc)
    dataset = dataset.load(dataset_uri=data_path)
    dataset = add_target_columns_metadata(dataset, problem)
    return dataset, problem
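A short usage sketch for load_data; both paths are placeholders for a real datasetDoc.json / problemDoc.json pair:

# Placeholder paths for a D3M dataset and its problem description.
dataset, problem = load_data(
    "/inputs/TRAIN/dataset_TRAIN/datasetDoc.json",
    "/inputs/TRAIN/problem_TRAIN/problemDoc.json",
)
print(dataset)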
Example #5
def generate_pipeline(pipeline_path: str,
                      dataset_path: str,
                      problem_doc_path: str,
                      resolver: Resolver = None) -> Runtime:
    """
    Simplified interface that fit a pipeline with a dataset

    Paramters
    ---------
    pipeline_path
        Path to the pipeline description
    dataset_path:
        Path to the datasetDoc.json
    problem_doc_path:
        Path to the problemDoc.json
    resolver : Resolver
        Resolver to use.
    """

    # Pipeline description
    pipeline_description = None
    if pipeline_path.endswith('.json'):
        with open(pipeline_path) as pipeline_file:
            pipeline_description = Pipeline.from_json(
                string_or_file=pipeline_file, resolver=resolver)
    else:
        with open(pipeline_path) as pipeline_file:
            pipeline_description = Pipeline.from_yaml(
                string_or_file=pipeline_file, resolver=resolver)

    # Problem Doc
    problem_doc = load_problem_doc(problem_doc_path)

    # Dataset
    if 'file:' not in dataset_path:
        dataset_path = 'file://{dataset_path}'.format(
            dataset_path=os.path.abspath(dataset_path))

    dataset = D3MDatasetLoader().load(dataset_uri=dataset_path)
    # Adding Metadata to Dataset
    dataset = add_target_columns_metadata(dataset, problem_doc)

    # Pipeline
    pipeline_runtime = Runtime(pipeline_description)
    # Fitting Pipeline
    pipeline_runtime.fit(inputs=[dataset])
    return pipeline_runtime
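A hedged usage sketch for generate_pipeline; all three paths are placeholders for a real pipeline description and a D3M training split:

# Placeholder paths; adjust to an actual pipeline, dataset, and problem doc.
fitted_runtime = generate_pipeline(
    pipeline_path="pipeline.json",
    dataset_path="TRAIN/dataset_TRAIN/datasetDoc.json",
    problem_doc_path="TRAIN/problem_TRAIN/problemDoc.json",
)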
def load_test_dataset_for_pipeline(config_path) -> tuple:
    '''
    Load and return the test dataset and test problem described by the config file test_config.json.
    '''
    test_config_path = os.path.join(config_path, "test_config.json")
    with open(test_config_path, "r") as f:
        test_config = json.load(f)
        data_path = test_config["dataset_schema"]
        problem_path = test_config["problem_schema"]
    dataset = D3MDatasetLoader()
    if "file:" not in data_path:
        data_path = 'file://{dataset_path}'.format(
            dataset_path=os.path.abspath(data_path))
    with open(problem_path) as f:
        problem_doc = json.load(f)
        problem = Metadata(problem_doc)
    dataset = dataset.load(dataset_uri=data_path)
    dataset = add_target_columns_metadata(dataset, problem)
    return dataset, problem
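For reference, the test_config.json read above only needs the two keys the function accesses; the paths in this illustration are placeholders:

# Illustrative test_config.json content (paths are placeholders):
# {
#     "dataset_schema": "/inputs/TEST/dataset_TEST/datasetDoc.json",
#     "problem_schema": "/inputs/TEST/problem_TEST/problemDoc.json"
# }
test_dataset, test_problem = load_test_dataset_for_pipeline("/inputs/config_dir")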
Example #7
def test_pipeline(pipeline_runtime: Runtime, dataset_path: str) -> typing.List:
    """
    Simplified interface test a pipeline with a dataset

    Paramters
    ---------
    pipeline_runtime
        Runtime object
    dataset_path:
        Path to the datasetDoc.json
    """

    # Dataset
    if 'file:' not in dataset_path:
        dataset_path = 'file://{dataset_path}'.format(
            dataset_path=os.path.abspath(dataset_path))
    dataset = D3MDatasetLoader().load(dataset_uri=dataset_path)

    return pipeline_runtime.produce(inputs=[dataset])
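A sketch chaining the two helpers shown above: fit a runtime with generate_pipeline, then produce predictions on the test split. Paths are placeholders:

# Placeholder paths; fit on the TRAIN split, then predict on the TEST split.
fitted_runtime = generate_pipeline(
    "pipeline.json",
    "TRAIN/dataset_TRAIN/datasetDoc.json",
    "TRAIN/problem_TRAIN/problemDoc.json",
)
predictions = test_pipeline(fitted_runtime, "TEST/dataset_TEST/datasetDoc.json")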
Example #8
def load_data_problem(inputdir, problempath):
    print("Reading ", inputdir)
    print("Reading ", problempath)

    with open(problempath) as file:
        problem_schema = json.load(file)

    datasetId = problempath[:-29]
    dataset_schema = datasetId + "dataset_TRAIN/datasetDoc.json"
    problem_doc_metadata = Metadata(problem_schema)
    dataset_uri = 'file://{dataset_uri}'.format(dataset_uri=dataset_schema)
    dataset = D3MDatasetLoader().load(dataset_uri)

    problem_description = problem.parse_problem_description(problempath)
    dataset = add_target_columns_metadata(dataset, problem_description)

    taskname = problem_doc_metadata.query(())['about']['taskType']

    return (dataset, taskname, problem_description)
Example #9
def load_d3m_dataset(path) -> typing.Optional[d3m_Dataset]:
    """
    Function used to load d3m datasets
    """
    # create a dict that maps each dataset id to the directory containing it
    logger.debug("Trying to load dataset " + str(path))
    datasets_list = dict()
    for each_path in dataset_paths:
        temp = os.listdir(each_path)
        for each in temp:
            datasets_list[each] = each_path

    if path not in datasets_list:
        return None
    loader = D3MDatasetLoader()
    dataset_path = os.path.join(datasets_list[path], path, path + "_dataset",
                                "datasetDoc.json")
    json_file = os.path.abspath(dataset_path)
    all_dataset_uri = 'file://{}'.format(json_file)
    all_dataset = loader.load(dataset_uri=all_dataset_uri)
    logger.debug("Load " + str(path) + " success!")
    return all_dataset
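A usage sketch for load_d3m_dataset. It relies on the module-level dataset_paths list of search directories that the function iterates over; the directory and dataset name below are placeholders:

# Placeholder: directories whose subfolders are D3M datasets, keyed by dataset id.
dataset_paths = ["/inputs/seed_datasets_current"]
ds = load_d3m_dataset("185_baseball")
if ds is None:
    print("dataset not found in any configured path")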
def load_data_from_dir(inputdir, mode="train"):
    """
    Returns problem_doc and dataset given input directory.
    """
    assert mode in ["train", "test", "score"]

    if mode == "train":
        problemdir = "%s/TRAIN/problem_TRAIN" % inputdir
        problem_doc_uri = "%s/problemDoc.json" % problemdir
        datasetdir = "%s/TRAIN/dataset_TRAIN" % inputdir
        dataset_uri = "%s/datasetDoc.json" % datasetdir
    if mode == "test":
        problemdir = "%s/TEST/problem_TEST" % inputdir
        problem_doc_uri = "%s/problemDoc.json" % problemdir
        datasetdir = "%s/TEST/dataset_TEST" % inputdir
        dataset_uri = "%s/datasetDoc.json" % datasetdir
    if mode == "score":
        problemdir = "%s/SCORE/problem_TEST" % inputdir
        problem_doc_uri = "%s/problemDoc.json" % problemdir
        datasetdir = "%s/SCORE/dataset_TEST" % inputdir
        dataset_uri = "%s/datasetDoc.json" % datasetdir

    # Problem doc and dataset loading
    problem_doc = load_problem_doc(problem_doc_uri)

    # Dataset
    if 'file:' not in dataset_uri:
        dataset_uri = 'file://{dataset_uri}'.format(
            dataset_uri=os.path.abspath(dataset_uri))
    dataset = D3MDatasetLoader()
    dataset = dataset.load(dataset_uri=dataset_uri)
    if mode == "train" or mode == "score":
        dataset = add_target_columns_metadata(dataset, problem_doc)

    return problem_doc, dataset
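A usage sketch for load_data_from_dir; inputdir is a placeholder that must contain the TRAIN/TEST/SCORE subdirectories laid out as above:

# Placeholder input directory with TRAIN/, TEST/, and SCORE/ splits.
problem_doc, train_dataset = load_data_from_dir("/inputs/185_baseball", mode="train")
problem_doc, test_dataset = load_data_from_dir("/inputs/185_baseball", mode="test")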
Example #11
h_DD = hyper_DD.defaults()
# The original snippet uses h_DE below without defining it; it is assumed to be
# the defaults of the Denormalize primitive's Hyperparams (imported as hyper_DE).
h_DE = hyper_DE.defaults()

h_attr = {'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Attribute',),'use_columns': (), 'exclude_columns': ()}
h_target = {'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Target','https://metadata.datadrivendiscovery.org/types/SuggestedTarget',), 'use_columns': (), 'exclude_columns': ()}

primitive_0 = Denormalize(hyperparams=h_DE)
primitive_1 = DatasetToDataFramePrimitive(hyperparams=h_DD)


primitive_3 = ExtractColumnsBySemanticTypesPrimitive(hyperparams=h_attr)
primitive_4 = ExtractColumnsBySemanticTypesPrimitive(hyperparams=h_target)

# global variables
dataset_file_path = "dsbox/unit_tests/resources/38_sick_data/datasetDoc.json"

dataset = D3MDatasetLoader()
dataset = dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=os.path.abspath(dataset_file_path)))

result0 = primitive_0.produce(inputs=dataset)
result1 = primitive_1.produce(inputs=result0.value)

X = primitive_3.produce(inputs=result1.value).value
Y = primitive_4.produce(inputs=result1.value).value

hp = GreedyHyperparameter.sample()

import unittest


class TestGreedy(unittest.TestCase):
    def setUp(self):
Example #12
# initialize
from datamart.entries import Datamart
from d3m.container.dataset import Dataset, D3MDatasetLoader
from common_primitives.denormalize import Hyperparams as hyper_denormalize, DenormalizePrimitive
from d3m.base import utils as d3m_utils
import os
import pandas as pd

# load the ISI Datamart; the URL below is current but may change in the future
isi_datamart_url = "http://dsbox02.isi.edu:9999/blazegraph/namespace/datamart3/sparql"
a = Datamart(connection_url=isi_datamart_url)
# load the D3M dataset; "DA_poverty_estimation" is used here as an example, change the path to your own dataset
loader = D3MDatasetLoader()
path = "/Users/minazuki/Desktop/studies/master/2018Summer/data/datasets/seed_datasets_data_augmentation/DA_poverty_estimation/TRAIN/dataset_TRAIN/datasetDoc.json"
json_file = os.path.abspath(path)
all_dataset_uri = 'file://{}'.format(json_file)
all_dataset = loader.load(dataset_uri=all_dataset_uri)
# run the denormalize primitive
denormalize_hyperparams = hyper_denormalize.defaults()
denormalize_primitive = DenormalizePrimitive(hyperparams=denormalize_hyperparams)
all_dataset = denormalize_primitive.produce(inputs=all_dataset).value


"""
start search, run search with data function.
Here because the dataset do not have any "Text" semantic type columns,
the system will said that no columns can be augment
"""
search_res = a.search_with_data(query=None, supplied_data=all_dataset)

"""
Example #13
import os

from d3m.container.dataset import Dataset, D3MDatasetLoader
from d3m.metadata import base as metadata_base, pipeline as pipeline_module, problem
from d3m.runtime import Runtime
from d3m import index

problem_path = 'problem1/problemDoc.json'
dataset_train_path = 'problem1/train/datasetDoc.json'
dataset_predict_path = 'problem1/predict/datasetDoc.json'
pipeline_path = 'pipeline.json'

# Loading problem description.
problem_description = problem.parse_problem_description(problem_path)

# Loading dataset.
path = 'file://{uri}'.format(uri=os.path.abspath(dataset_train_path))
dataset = D3MDatasetLoader().load(dataset_uri=path)

path2 = 'file://{uri}'.format(uri=os.path.abspath(dataset_predict_path))
dataset_predict = D3MDatasetLoader().load(dataset_uri=path2)

# Loading pipeline description file.
with open(pipeline_path, 'r') as file:
    pipeline_description = pipeline_module.Pipeline.from_json(
        string_or_file=file)

# Creating an instance on runtime with pipeline description and problem description.
runtime = Runtime(pipeline=pipeline_description,
                  problem_description=problem_description,
                  context=metadata_base.Context.TESTING)

# Fitting pipeline on input dataset.
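# Assumed completion of the truncated snippet: fit the runtime on the training
# dataset, then produce predictions for the predict split.
fit_result = runtime.fit(inputs=[dataset])
fit_result.check_success()

produce_result = runtime.produce(inputs=[dataset_predict])
produce_result.check_success()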