Example No. 1
    def __init__(self):

        pipeline_description = Pipeline()
        pipeline_description.add_input(name="inputs")

        # Ts formatter
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_transformation.time_series_formatter.DistilTimeSeriesFormatter"
        ))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="inputs.0",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # DS to DF on formatted ts DS
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_transformation.dataset_to_dataframe.Common"))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.0.produce",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # Grouping Field Compose
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_transformation.grouping_field_compose.Common"))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.1.produce",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # Storc primitive -> KMeans
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.clustering.k_means.Sloth"))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.2.produce",
        )
        step.add_hyperparameter(name="nclusters",
                                argument_type=ArgumentType.VALUE,
                                data=3)
        step.add_output("produce")
        pipeline_description.add_step(step)

        # Final Output
        pipeline_description.add_output(name="output predictions",
                                        data_reference="steps.3.produce")

        self.pipeline = pipeline_description
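
A minimal sketch (not part of the original example) of how a pipeline description built this way is typically serialized afterwards; it uses the same Pipeline.to_json() call that appears in Example No. 24 below, and the file name is illustrative:

from d3m.metadata.pipeline import Pipeline

def save_pipeline_description(pipeline: Pipeline, path: str = "pipeline.json") -> None:
    # Pipeline.to_json() returns the canonical JSON serialization of the
    # pipeline description, including steps, arguments and hyperparameters.
    with open(path, "w") as f:
        f.write(pipeline.to_json())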
Example No. 2
    def __init__(self, epochs: int = 10, n_steps: int = 20):

        pipeline_description = Pipeline()
        pipeline_description.add_input(name="inputs")

        # Denormalize primitive
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_transformation.denormalize.Common"))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="inputs.0",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # DS to DF on input DS
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_transformation.dataset_to_dataframe.Common"))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.0.produce",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # RetinaNet primitive
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.object_detection.retina_net.ObjectDetectionRN"))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.1.produce",
        )
        step.add_argument(
            name="outputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.1.produce",
        )
        step.add_hyperparameter(name="n_epochs",
                                argument_type=ArgumentType.VALUE,
                                data=epochs)
        step.add_hyperparameter(name="n_steps",
                                argument_type=ArgumentType.VALUE,
                                data=n_steps)
        step.add_hyperparameter(name="weights_path",
                                argument_type=ArgumentType.VALUE,
                                data="/scratch_dir/")
        step.add_output("produce")
        pipeline_description.add_step(step)

        # Final Output
        pipeline_description.add_output(name="output predictions",
                                        data_reference="steps.2.produce")

        self.pipeline = pipeline_description
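Example No. 3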
    def _gen_pipeline(self):
        pipeline = meta_pipeline.Pipeline()
        pipeline.add_input(name='inputs')

        # Step 0: dataset_to_dataframe
        step_0 = PrimitiveStep(primitive=index.get_primitive(
            'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
        step_0.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_0.add_output('produce')
        pipeline.add_step(step_0)

        # Step 1: dataset_to_dataframe
        step_1 = PrimitiveStep(primitive=index.get_primitive(
            'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
        step_1.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_1.add_hyperparameter(name='dataframe_resource',
                                  argument_type=ArgumentType.VALUE,
                                  data='1')
        step_1.add_output('produce')
        pipeline.add_step(step_1)

        # Step 2: dataset_to_dataframe
        step_2 = PrimitiveStep(primitive=index.get_primitive(
            'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
        step_2.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_2.add_hyperparameter(name='dataframe_resource',
                                  argument_type=ArgumentType.VALUE,
                                  data='2')
        step_2.add_output('produce')
        pipeline.add_step(step_2)

        step_3 = meta_pipeline.PrimitiveStep(
            primitive_description=EuclideanNomination.metadata.query())
        step_3.add_argument(name='inputs_1',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.1.produce')
        step_3.add_argument(name='inputs_2',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_3.add_argument(name='reference',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.0.produce')

        step_3.add_output('produce')
        pipeline.add_step(step_3)

        # Adding output step to the pipeline
        pipeline.add_output(name='Predictions',
                            data_reference='steps.3.produce')

        return pipeline
Example No. 4
    def test_register(self):
        FooBarPrimitive = create_primitive(
            'e2fc24f8-5b32-4759-be5b-8126a42522a3',
            'd3m.primitives.foo.bar.FooBarPrimitive')

        # To hide any logging or stdout output.
        with self.assertLogs(level=logging.DEBUG) as cm:
            with utils.redirect_to_logging():
                index.register_primitive(
                    'd3m.primitives.foo.bar.FooBarPrimitive', FooBarPrimitive)

                # Just to log something, otherwise "assertLogs" can fail.
                logging.getLogger().debug("Start test.")

        index.get_primitive('d3m.primitives.foo.bar.FooBarPrimitive')
Example No. 5
    def ListPrimitives(self, request, context):
        '''
        List all primitives known to TA2, their IDs, versions, names, and digests. Using this
        information a TA3 should know which primitives may be put into a pipeline template.
        To narrow down potential primitives to use a TA3 can also ask a TA2 to do a solution
        search and then observe which primitives the TA2 is using. If more metadata about primitives
        is needed, then a TA3 can use the results of this call to map primitives to metadata
        (from Python code or primitive annotations) on its own.
        '''

        list_primitives = []
        source_primitives = []
        primitives = index.search()

        for prim in primitives:
            try:
                p = index.get_primitive(prim)
                source_primitives.append(p)
            except Exception:
                # Skip primitives that fail to load.
                continue

        for p in source_primitives:
            meta = p.metadata.to_json_structure()
            list_primitives.append(
                primitive_pb2.Primitive(id=meta['id'],
                                        version=meta['version'],
                                        python_path=meta['python_path'],
                                        name=meta['name'],
                                        digest=meta['digest']))
        return core_pb2.ListPrimitivesResponse(primitives=list_primitives)
Example No. 6
    def test_entrypoint(self):
        working_set_entries = copy.copy(pkg_resources.working_set.entries)
        working_set_entry_keys = copy.copy(
            pkg_resources.working_set.entry_keys)
        working_set_by_key = copy.copy(pkg_resources.working_set.by_key)

        try:
            distribution = pkg_resources.Distribution(__file__)
            entry_point = pkg_resources.EntryPoint.parse(
                'foo2.bar2.FooBar2Primitive = test_index:FooBar2Primitive',
                dist=distribution)
            distribution._ep_map = {
                'd3m.primitives': {
                    'foo2.bar2.FooBar2Primitive': entry_point
                }
            }
            pkg_resources.working_set.add(distribution)

            python_path = 'd3m.primitives.foo2.bar2.FooBar2Primitive'

            self.assertIn(python_path, index.search())

            self.assertIs(index.get_primitive(python_path), FooBar2Primitive)

        finally:
            pkg_resources.working_set.entries = working_set_entries
            pkg_resources.working_set.entry_keys = working_set_entry_keys
            pkg_resources.working_set.by_key = working_set_by_key
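
The entry-point mechanism exercised above mirrors how primitives are normally published. A hedged sketch of the corresponding setup.py declaration, using the same entry-point string the test parses (the package name and version are illustrative):

# setup.py (sketch; the package name and version are assumptions)
from setuptools import setup

setup(
    name="example-primitives",
    version="0.1.0",
    # d3m discovers primitives through the "d3m.primitives" entry-point group.
    entry_points={
        "d3m.primitives": [
            "foo2.bar2.FooBar2Primitive = test_index:FooBar2Primitive",
        ],
    },
)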
Example No. 7
def available_primitives():
    primitives_info = []

    with d3m_utils.silence():
        for primitive_path in d3m_index.search():
            if primitive_path in PrimitivesList.BlockList:
                continue

            try:
                primitive = d3m_index.get_primitive(primitive_path)
                metadata = primitive.metadata.query()
                primitive_info = {
                    'id': metadata['id'],
                    'version': metadata['version'],
                    'python_path': metadata['python_path'],
                    'name': metadata['name'],
                    'digest': metadata.get('digest', None)
                }
                primitives_info.append(primitive_info)
            except Exception:
                continue
    return primitives_info
Example No. 8
    def __init__(self):

        pipeline_description = Pipeline()
        pipeline_description.add_input(name="inputs")

        # DS to DF on input DS
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_transformation.dataset_to_dataframe.Common"))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="inputs.0",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # column parser
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_transformation.column_parser.Common"))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.0.produce",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # Duke primitive
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_cleaning.text_summarization.Duke"))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.1.produce",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # Final Output
        pipeline_description.add_output(name="output predictions",
                                        data_reference="steps.2.produce")

        self.pipeline = pipeline_description
Example No. 9
    def get_dataframe(self, input_data):
        # denormalize
        denormalize = index.get_primitive(
            'd3m.primitives.data_transformation.denormalize.Common')
        hyperparams_class = denormalize.metadata.get_hyperparams()
        primitive = denormalize(hyperparams=hyperparams_class.defaults())
        dataset = primitive.produce(inputs=input_data[0]).value

        # Add Target column into dataset
        dataset = self.mark_columns(dataset)

        # dataset to dataframe
        dataset_dataframe = index.get_primitive(
            'd3m.primitives.data_transformation.dataset_to_dataframe.Common')
        hyperparams_class = dataset_dataframe.metadata.get_hyperparams()
        primitive = dataset_dataframe(hyperparams=hyperparams_class.defaults())
        dataframe = primitive.produce(inputs=dataset).value

        return dataframe
Example No. 10
def class_hyperparameter_generator(primitive_name, parameter_name,
                                   definition):
    from d3m import index
    g = None
    try:
        # Cast the definition to the hyperparameter's declared structural type.
        hyperparams = index.get_primitive(primitive_name).metadata.query()[
            "primitive_code"]["hyperparams"]
        g = hyperparams[parameter_name]['structural_type'](definition)
    except Exception:
        _logger.error(f"Hyperparameter not valid for {primitive_name}!")
    return g
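
A hedged usage sketch for class_hyperparameter_generator; the primitive path and hyperparameter name come from Example No. 1, while the integer structural_type of nclusters is an assumption here:

# If nclusters is declared with structural_type int, the string "3" is cast to
# int("3") == 3; on a bad name or value the function logs an error and returns None.
n_clusters = class_hyperparameter_generator(
    "d3m.primitives.clustering.k_means.Sloth", "nclusters", "3")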
Example No. 11
def get_preprocessor(input_data, problem, treatment):
    metadata = input_data.metadata
    task_description = schemas_utils.get_task_description(
        problem['problem']['task_keywords'])
    task_type = task_description['task_type']
    semi = task_description['semi']
    data_types = task_description['data_types']
    task = pipeline_utils.infer_primitive_family(task_type=task_type,
                                                 data_types=data_types,
                                                 is_semi=semi)
    main_resource = pipeline_utils.get_tabular_resource_id(dataset=input_data)

    # Loading primitives
    primitives = {
        'DatasetToDataFrame':
        'd3m.primitives.data_transformation.dataset_to_dataframe.Common',
        'ColumnParser':
        'd3m.primitives.data_transformation.column_parser.Common',
        'ExtractColumnsBySemanticTypes':
        'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common',
        'Denormalize':
        'd3m.primitives.data_transformation.denormalize.Common',
        'Imputer':
        'd3m.primitives.data_cleaning.imputer.SKlearn',
        'SimpleProfiler':
        'd3m.primitives.schema_discovery.profiler.Common',
        'TextEncoder':
        'd3m.primitives.data_transformation.encoder.DistilTextEncoder',
    }
    loaded_primitives = dict()

    for primitive_name, primitive_path in primitives.items():
        try:
            loaded_primitives[primitive_name] = index.get_primitive(primitive_path)
        except Exception as e:
            print("Cannot load primitive {}: {}".format(primitive_path, e))

    candidates = []
    for preprocessor in preprocessors:
        if preprocessor.check_task_treatment(task, treatment) \
                and preprocessor.check_expected_data_types(data_types) \
                and preprocessor.check_unsupported_data_types(data_types):
            candidates.append(
                preprocessor(metadata, main_resource, data_types,
                             loaded_primitives, problem))
    if not candidates:
        candidates.append(
            TabularPreprocessor(metadata, main_resource, data_types,
                                loaded_primitives))
    return candidates
Example No. 12
def load_hyperparameters(primitive_name):
    primitive = index.get_primitive(primitive_name)
    hyperparameters_metadata = primitive.metadata.query(
    )['primitive_code']['hyperparams']
    hyperparameter_class = typing.get_type_hints(
        primitive.__init__)['hyperparams']
    hyperparameters = {}

    if hyperparameter_class:
        for hp_name, hp_value in hyperparameter_class.configuration.items():
            semantic_types = hyperparameters_metadata[hp_name]['semantic_types']
            if 'https://metadata.datadrivendiscovery.org/types/TuningParameter' in semantic_types:
                hyperparameters[hp_name] = hp_value

    return hyperparameters
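
A short usage sketch for load_hyperparameters, listing the tuning hyperparameters of a primitive used elsewhere in these examples (the exact set returned depends on the installed primitive version):

tunable = load_hyperparameters(
    "d3m.primitives.classification.xgboost_gbtree.Common")
for name, definition in tunable.items():
    # Each value is the d3m hyperparameter definition object for that name.
    print(name, definition)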
Example No. 13
    def get_x(self, dataframe):
        # reading images
        image_reader = index.get_primitive(
            'd3m.primitives.data_preprocessing.image_reader.Common')
        hyperparams_class = image_reader.metadata.get_hyperparams()
        primitive = image_reader(hyperparams=hyperparams_class.defaults().replace(
            {'return_result': 'replace'}))
        columns_to_use = primitive._get_columns(dataframe.metadata)
        column_index = columns_to_use[0]
        temp = [
            primitive._read_filename(
                column_index,
                dataframe.metadata.query((row_index, column_index)), value)
            for row_index, value in enumerate(dataframe.iloc[:, column_index])
        ]
        x = np.array(temp, dtype=np.float64)
        return x
Example No. 14
    def get_y(self, dataframe):
        # extract targets
        get_columns_semantic = index.get_primitive(
            'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'
        )
        hyperparams_class = get_columns_semantic.metadata.get_hyperparams()
        primitive = get_columns_semantic(hyperparams=hyperparams_class.defaults().replace({
            'semantic_types':
            ('https://metadata.datadrivendiscovery.org/types/TrueTarget',
             'https://metadata.datadrivendiscovery.org/types/Target',
             'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
             'https://metadata.datadrivendiscovery.org/types/PredictedTarget')
        }))
        targets = primitive.produce(inputs=dataframe).value
        y = np.array(targets, dtype=np.int64)
        return y
Example No. 15
def update_pipeline(
    pipeline_to_update, filename=None
):
    """
    This function updates the pipeline's digests and version numbers

    Parameters
    ----------
    pipeline_json_structure: the pipeline in JSON form (WITHOUT) digests.  This or the `filename` parameter is mandatory
    filename: the filename of the pipeline json, so we can read it in

    :return a pipeline with updated digests
    """
    if pipeline_to_update is None and filename is None:
        raise ValueError("No pipeline json was given")
    elif pipeline_to_update is None:
        with open(filename, "r") as file:
            # NOTE: must be a pipeline with no digests, or recent digests
            # NOTE: reading this in as straight JSON doesn't work so we have to use the pipeline_module
            pipeline_to_update = pipeline_module.Pipeline.from_json(string_or_file=file).to_json_structure()
    else:
        try:
            pipeline_to_update = pipeline_module.Pipeline.from_json(json.dumps(pipeline_to_update)).to_json_structure()
        except Exception:
            # Fall back to the structure as given.
            pass
    for step in pipeline_to_update['steps']:
        # Rebuild the step from the installed primitive to get its current id and version.
        primitive = pipeline_module.PrimitiveStep(
            primitive=d3m_index.get_primitive(
                step["primitive"]["python_path"]
            )
        )
        check_step = primitive.to_json_structure()

        # Verify that both the id and the version are up to date.
        id_matches = check_step["primitive"]["id"] == step["primitive"]["id"]
        if not id_matches:
            step["primitive"]["id"] = check_step["primitive"]["id"]
        version_matches = check_step["primitive"]["version"] == step["primitive"]["version"]
        if not version_matches:
            step["primitive"]["version"] = check_step["primitive"]["version"]

    return pipeline_to_update
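
A hedged usage sketch for update_pipeline, assuming a pipeline JSON file on disk; both file names are illustrative only:

import json

updated = update_pipeline(None, filename="pipeline.json")
with open("pipeline_updated.json", "w") as f:
    json.dump(updated, f, indent=2)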
Example No. 16
def add_classifier(pipeline_description, dataset_to_dataframe_step, attributes, targets):
    lr = PrimitiveStep(primitive=SKLogisticRegression)
    lr.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                    data_reference=attributes)
    lr.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER,
                    data_reference=targets)
    lr.add_output('produce')
    pipeline_description.add_step(lr)

    construct_pred = PrimitiveStep(
        primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common'))
    construct_pred.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
                                data_reference=pipeline_utils.int_to_step(lr.index))
    construct_pred.add_argument(name='reference', argument_type=ArgumentType.CONTAINER,
                                data_reference=dataset_to_dataframe_step)
    construct_pred.add_output('produce')
    pipeline_description.add_step(construct_pred)
    # Final Output
    pipeline_description.add_output(name='output predictions',
                                    data_reference=pipeline_utils.int_to_step(construct_pred.index))
Example No. 17
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep
from d3m.metadata import hyperparams
import numpy as np

# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest
#                                             extract_columns_by_semantic_types(targets)    ->            ^

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: dataset_to_dataframe
primitive_0 = index.get_primitive(
    'd3m.primitives.data_transformation.dataset_to_dataframe.Common')
step_0 = PrimitiveStep(primitive=primitive_0)
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: column_parser
primitive_1 = index.get_primitive(
    'd3m.primitives.data_transformation.column_parser.Common')
step_1 = PrimitiveStep(primitive=primitive_1)
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_output('produce')
Example No. 18
from d3m import index
from d3m.metadata.base import ArgumentType, Context
from d3m.metadata.pipeline import Pipeline, PrimitiveStep
import sys

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

#d3m.primitives.data_transformation.column_parser.Common
#d3m.primitives.data_cleaning.column_type_profiler.Simon

# Step 0: dataset_to_dataframe
step_0 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: Column profiler
step_1 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.schema_discovery.profiler.Common'))
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: DISTIL/NK pca feature selection
Example No. 19
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep
from d3m.metadata import hyperparams

# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest
#                                             extract_columns_by_semantic_types(targets)    ->            ^

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: dataset_to_dataframe
primitive_0 = index.get_primitive(
    'd3m.primitives.tods.data_processing.dataset_to_dataframe')
step_0 = PrimitiveStep(primitive=primitive_0)
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: column_parser
primitive_1 = index.get_primitive(
    'd3m.primitives.data_transformation.column_parser.Common')
step_1 = PrimitiveStep(primitive=primitive_1)
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_output('produce')
pipeline_description.add_step(step_1)
Example No. 20
    def __init__(
        self,
        epochs: int = 5000,
        attention_lstm: bool = True,
    ):

        pipeline_description = Pipeline()
        pipeline_description.add_input(name="inputs")

        # Ts formatter
        step = PrimitiveStep(
            primitive=index.get_primitive(
                "d3m.primitives.data_transformation.time_series_formatter.DistilTimeSeriesFormatter"
            )
        )
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="inputs.0",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # DS to DF on formatted ts DS
        step = PrimitiveStep(
            primitive=index.get_primitive(
                "d3m.primitives.data_transformation.dataset_to_dataframe.Common"
            )
        )
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.0.produce",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # DS to DF on input DS
        step = PrimitiveStep(
            primitive=index.get_primitive(
                "d3m.primitives.data_transformation.dataset_to_dataframe.Common"
            )
        )
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="inputs.0",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # column parser on input DF
        step = PrimitiveStep(
            primitive=index.get_primitive(
                "d3m.primitives.data_transformation.column_parser.Common"
            )
        )
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.2.produce",
        )
        step.add_output("produce")
        step.add_hyperparameter(
            name="parse_semantic_types",
            argument_type=ArgumentType.VALUE,
            data=[
                "http://schema.org/Boolean",
                "http://schema.org/Integer",
                "http://schema.org/Float",
                "https://metadata.datadrivendiscovery.org/types/FloatVector",
            ],
        )
        pipeline_description.add_step(step)

        # parse target semantic types
        step = PrimitiveStep(
            primitive=index.get_primitive(
                "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common"
            )
        )
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.3.produce",
        )
        step.add_hyperparameter(
            name="semantic_types",
            argument_type=ArgumentType.VALUE,
            data=[
                "https://metadata.datadrivendiscovery.org/types/Target",
            ],
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # LSTM FCN
        step = PrimitiveStep(
            primitive=index.get_primitive(
                "d3m.primitives.time_series_classification.convolutional_neural_net.LSTM_FCN"
            )
        )
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.1.produce",
        )
        step.add_argument(
            name="outputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.4.produce",
        )
        step.add_hyperparameter(
            name="epochs", argument_type=ArgumentType.VALUE, data=epochs
        )
        step.add_hyperparameter(
            name="attention_lstm", argument_type=ArgumentType.VALUE, data=attention_lstm
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # construct predictions
        step = PrimitiveStep(
            primitive=index.get_primitive(
                "d3m.primitives.data_transformation.construct_predictions.Common"
            )
        )
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.5.produce",
        )
        step.add_argument(
            name="reference",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.2.produce",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # Final Output
        pipeline_description.add_output(
            name="output predictions", data_reference="steps.6.produce"
        )

        self.pipeline = pipeline_description
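Example No. 21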
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name="inputs")

# Step 0: DS to DF on input DS
step_0 = PrimitiveStep(primitive=index.get_primitive(
    "d3m.primitives.data_transformation.dataset_to_dataframe.Common"))
step_0.add_argument(name="inputs",
                    argument_type=ArgumentType.CONTAINER,
                    data_reference="inputs.0")
step_0.add_output("produce")
pipeline_description.add_step(step_0)

# Step 1: Simple Profiler Column Role Annotation
step_1 = PrimitiveStep(primitive=index.get_primitive(
    "d3m.primitives.schema_discovery.profiler.Common"))
step_1.add_argument(
    name="inputs",
    argument_type=ArgumentType.CONTAINER,
    data_reference="steps.0.produce",
)
step_1.add_output("produce")
pipeline_description.add_step(step_1)

# Step 2: column parser on input DF
step_2 = PrimitiveStep(primitive=index.get_primitive(
    "d3m.primitives.data_transformation.column_parser.Common"))
Example No. 22
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest
#                                             extract_columns_by_semantic_types(targets)    ->            ^

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: dataset_to_dataframe
primitive_0 = index.get_primitive(
    'd3m.primitives.tods.data_processing.dataset_to_dataframe')
step_0 = PrimitiveStep(primitive=primitive_0)
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: Column Parser
primitive_1 = index.get_primitive(
    'd3m.primitives.data_transformation.column_parser.Common')
step_1 = PrimitiveStep(primitive=primitive_1)
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_output('produce')
pipeline_description.add_step(step_1)
Example No. 23
def generate_only():
    # Creating pipeline
    pipeline_description = Pipeline()
    pipeline_description.add_input(name='inputs')

    # Step 0: dataset_to_dataframe
    step_0 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
    step_0.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline_description.add_step(step_0)

    # Step 1: profiler
    step_1 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.schema_discovery.profiler.Common'))
    step_1.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.0.produce')
    step_1.add_output('produce')
    pipeline_description.add_step(step_1)

    # Step 2: column_parser
    step_2 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.column_parser.Common'))
    step_2.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_2.add_output('produce')
    pipeline_description.add_step(step_2)

    # Step 3: DFS Single Table
    step_3 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.feature_construction.deep_feature_synthesis.SingleTableFeaturization'
    ))
    step_3.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_3.add_output('produce')
    pipeline_description.add_step(step_3)

    # Step 4: learn model
    step_4 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.regression.xgboost_gbtree.Common'))
    step_4.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.3.produce')
    step_4.add_argument(name='outputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_4.add_output('produce')
    pipeline_description.add_step(step_4)

    # Step 5: construct output
    step_5 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.construct_predictions.Common'))
    step_5.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.4.produce')
    step_5.add_argument(name='reference',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_5.add_output('produce')
    pipeline_description.add_step(step_5)

    # Final Output
    pipeline_description.add_output(name='output predictions',
                                    data_reference='steps.5.produce')

    # Generate .yml file for the pipeline
    import featuretools_ta1
    from pipeline_tests.utils import generate_pipeline
    dataset_name = 'LL1_retail_sales_total_MIN_METADATA'
    dataset_path = '/featuretools_ta1/datasets/seed_datasets_current'
    primitive_name = 'd3m.primitives.feature_construction.deep_feature_synthesis.SingleTableFeaturization'
    version = featuretools_ta1.__version__
    test_name = os.path.splitext(os.path.basename(__file__))[0][5:]
    yml, pipeline_run_file = generate_pipeline(
        primitive_name=primitive_name,
        pipeline_description=pipeline_description,
        dataset_name=dataset_name,
        test_name=test_name)

    # fit-score command
    fs_cmd = 'python3 -m d3m runtime -d /featuretools_ta1/datasets/ fit-score -p {}'.format(
        yml)
    fs_cmd += ' -r {}/{}/{}_problem/problemDoc.json'.format(
        dataset_path, dataset_name, dataset_name)
    fs_cmd += ' -i {}/{}/TRAIN/dataset_TRAIN/datasetDoc.json'.format(
        dataset_path, dataset_name)
    fs_cmd += ' -t {}/{}/TEST/dataset_TEST/datasetDoc.json'.format(
        dataset_path, dataset_name)
    fs_cmd += ' -a {}/{}/SCORE/dataset_SCORE/datasetDoc.json'.format(
        dataset_path, dataset_name)
    fs_cmd += ' -O {}'.format(pipeline_run_file)

    # Run pipeline to save pipeline_run file
    os.system(fs_cmd)

    # Create and return command for running from pipeline_run file:
    pipeline_run_cmd = 'python3 -m d3m --pipelines-path /featuretools_ta1/MIT_FeatureLabs/{}/{}/pipelines/'.format(
        primitive_name, version)
    pipeline_run_cmd += ' runtime -d /featuretools_ta1/datasets/ fit-score'
    pipeline_run_cmd += ' -u {}'.format(pipeline_run_file)

    return pipeline_run_cmd
Example No. 24
def build_pipeline(pipeline_info, pipeline_mapping, stdout=None):

    default_stdout = sys.stdout
    if stdout is not None:
        sys.stdout = stdout

    # Creating pipeline
    pipeline_description = Pipeline()
    pipeline_description.add_input(name='inputs')

    for primitive_info in pipeline_info:
        print(primitive_info.python_path)
        print(primitive_info.hyperparameter)
        print(primitive_info.ancestors)

        if primitive_info.python_path == 'HEAD':
            dataset_fullname = primitive_info.hyperparameter['dataset_folder']
            print(dataset_fullname)
            continue

        elif primitive_info.python_path == 'ENDING':

            ancestors = primitive_info.ancestors
            end_step_num = pipeline_mapping[ancestors['inputs']] - 1
            pipeline_description.add_output(name='output predictions', data_reference='steps.' + str(end_step_num) + '.produce')

        else:
            # print(primitive_info.python_path)
            primitive = index.get_primitive(primitive_info.python_path)
            step = PrimitiveStep(primitive=primitive)

            hyperparameters = primitive_info.hyperparameter
            ancestors = primitive_info.ancestors

            # add input arguments
            # print(ancestors)

            if ancestors['inputs'] != 0:
                for ances_key in ancestors.keys():
                    print(ances_key, ancestors[ances_key], pipeline_mapping[ancestors[ances_key]] - 1)

                    step_num = pipeline_mapping[ancestors[ances_key]] - 1
                    step.add_argument(name=ances_key, argument_type=ArgumentType.CONTAINER, data_reference='steps.' + str(step_num) + '.produce')

            else:
                step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')

            # add hyperparameters
            for hyper in hyperparameters.keys():
                # print(hyper, hyperparameters[hyper], type(hyperparameters[hyper]))

                hyper_value = hyperparameters[hyper]

                step.add_hyperparameter(name=hyper, argument_type=ArgumentType.VALUE, data=hyper_value)

            step.add_output('produce')
            pipeline_description.add_step(step)

            # print('\n')

    # Output to json
    data = pipeline_description.to_json()
    with open('example_pipeline.json', 'w') as f:
        f.write(data)
        print(data)

    # yaml = pipeline_description.to_yaml()
    # with open('example_pipeline.yml', 'w') as f:
    #     f.write(yaml)
    # print(yaml)

    sys.stdout.flush()
    sys.stdout = default_stdout
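
A hedged usage sketch for build_pipeline; PrimitiveInfo below is a stand-in (an assumption, not the caller's real type), since the function only reads the python_path, hyperparameter and ancestors attributes:

from collections import namedtuple

PrimitiveInfo = namedtuple("PrimitiveInfo",
                           "python_path hyperparameter ancestors")

pipeline_info = [
    PrimitiveInfo("HEAD", {"dataset_folder": "some_dataset"}, {}),
    PrimitiveInfo(
        "d3m.primitives.data_transformation.dataset_to_dataframe.Common",
        {}, {"inputs": 0}),
    PrimitiveInfo("ENDING", {}, {"inputs": 1}),
]
# Maps each ancestor id to its 1-based step position in the pipeline.
pipeline_mapping = {1: 1}
build_pipeline(pipeline_info, pipeline_mapping)  # writes example_pipeline.json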
Example No. 25
    def _gen_pipeline(self):
        #pipeline context is just metadata, ignore for now
        pipeline = meta_pipeline.Pipeline()
        #define inputs.  This will be read in automatically as a Dataset object.
        pipeline.add_input(name='inputs')

        # Step 0: DatasetToDataFrame
        step_0 = meta_pipeline.PrimitiveStep(
            primitive_description=DatasetToDataFramePrimitive.metadata.query())
        step_0.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_0.add_output('produce')
        pipeline.add_step(step_0)

        # Step 1: Simple Profiler Column Role Annotation
        step_1 = meta_pipeline.PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.schema_discovery.profiler.Common"))
        step_1.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.0.produce",
        )
        step_1.add_output("produce")
        pipeline.add_step(step_1)

        # Step 2: ColumnParser
        step_2 = meta_pipeline.PrimitiveStep(
            primitive_description=ColumnParserPrimitive.metadata.query())
        step_2.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.1.produce')
        step_2.add_output('produce')
        pipeline.add_step(step_2)

        # Step 3: Extract Attributes
        step_3 = meta_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_3.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_3.add_output('produce')
        step_3.add_hyperparameter(
            name='semantic_types',
            argument_type=ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
        pipeline.add_step(step_3)

        # Step 4: Extract Targets
        step_4 = meta_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_4.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_4.add_output('produce')
        step_4.add_hyperparameter(
            name='semantic_types',
            argument_type=ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
        pipeline.add_step(step_4)

        #Transform attributes dataframe into an ndarray
        step_5 = meta_pipeline.PrimitiveStep(
            primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
        step_5.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference=
            'steps.3.produce'  #inputs here are the outputs from step 3
        )
        step_5.add_output('produce')
        pipeline.add_step(step_5)

        #Run RandomizedPolyPCA
        step_6 = meta_pipeline.PrimitiveStep(
            primitive_description=RandomizedPolyPCA.metadata.query())
        step_6.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference=
            'steps.5.produce'  #inputs here are the outputs from step 5
        )
        step_6.add_hyperparameter(name='n_components',
                                  argument_type=ArgumentType.VALUE,
                                  data=15)
        step_6.add_hyperparameter(name='degree',
                                  argument_type=ArgumentType.VALUE,
                                  data=2)
        step_6.add_output('produce')
        pipeline.add_step(step_6)

        # convert numpy-formatted attribute data to a dataframe
        step_7 = meta_pipeline.PrimitiveStep(
            primitive_description=NDArrayToDataFramePrimitive.metadata.query())
        step_7.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference=
            'steps.6.produce'  # inputs here are the outputs from step 6
        )
        step_7.add_output('produce')
        pipeline.add_step(step_7)

        #Gradient boosting regression on the low-rank data (inputs and outputs for sklearn wrappers are both dataframes)
        step_8 = meta_pipeline.PrimitiveStep(
            primitive_description=d3m.primitives.regression.gradient_boosting.
            SKlearn.metadata.query())
        step_8.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.7.produce')
        step_8.add_argument(name='outputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.4.produce')
        step_8.add_hyperparameter(name='n_estimators',
                                  argument_type=ArgumentType.VALUE,
                                  data=50000)
        step_8.add_hyperparameter(name='learning_rate',
                                  argument_type=ArgumentType.VALUE,
                                  data=0.002)
        step_8.add_hyperparameter(name='max_depth',
                                  argument_type=ArgumentType.VALUE,
                                  data=2)
        #step_8.add_hyperparameter(
        #    name = 'loss',
        #    argument_type = ArgumentType.VALUE,
        #    data = 'ls'
        #)
        step_8.add_output('produce')
        pipeline.add_step(step_8)

        #finally generate a properly-formatted output dataframe from the prediction outputs using the input dataframe as a reference
        step_9 = meta_pipeline.PrimitiveStep(
            primitive_description=ConstructPredictionsPrimitive.metadata.query(
            ))
        step_9.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference=
            'steps.8.produce'  # inputs here are the prediction column
        )
        step_9.add_argument(
            name='reference',
            argument_type=ArgumentType.CONTAINER,
            data_reference=
            'steps.1.produce'  # inputs here are the dataframed input dataset
        )
        step_9.add_output('produce')
        pipeline.add_step(step_9)

        # Adding output step to the pipeline
        pipeline.add_output(name='output', data_reference='steps.9.produce')

        return pipeline
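Example No. 26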
from d3m import index
from d3m.metadata.base import ArgumentType, Context
from d3m.metadata.pipeline import Pipeline, PrimitiveStep
import sys

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: Denormalize primitive
step_0 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.denormalize.Common'))
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: dataset_to_dataframe
step_1 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_hyperparameter(name='dataframe_resource',
                          argument_type=ArgumentType.VALUE,
                          data='learningData')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: DISTIL/NK VAR primitive
Example No. 27
    def __init__(self):

        pipeline_description = Pipeline()
        pipeline_description.add_input(name="inputs")

        # DS to DF on input DS
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_transformation.dataset_to_dataframe.Common"))
        step.add_argument(name="inputs",
                          argument_type=ArgumentType.CONTAINER,
                          data_reference="inputs.0")
        step.add_output("produce")
        pipeline_description.add_step(step)

        # Simon
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_cleaning.column_type_profiler.Simon"))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.0.produce",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # column parser on input DF
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_transformation.column_parser.Common"))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.1.produce",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # XG Boost
        step = PrimitiveStep(primitive=index.get_primitive(
            'd3m.primitives.classification.xgboost_gbtree.Common'))
        step.add_argument(name='inputs',
                          argument_type=ArgumentType.CONTAINER,
                          data_reference='steps.2.produce')
        step.add_argument(name='outputs',
                          argument_type=ArgumentType.CONTAINER,
                          data_reference='steps.2.produce')
        step.add_output('produce')
        step.add_hyperparameter(name='add_index_columns',
                                argument_type=ArgumentType.VALUE,
                                data=True)
        pipeline_description.add_step(step)

        # construct predictions
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_transformation.construct_predictions.Common"))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.3.produce",
        )
        step.add_argument(
            name="reference",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.0.produce",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # Final Output
        pipeline_description.add_output(name="output predictions",
                                        data_reference="steps.4.produce")

        self.pipeline = pipeline_description
Example No. 28
from d3m import index
from d3m.metadata.base import ArgumentType, Context
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: Denormalize primitive
step_0 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.denormalize.Common'))
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: dataset_to_dataframe
step_1 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_hyperparameter(name='dataframe_resource',
                          argument_type=ArgumentType.VALUE,
                          data='learningData')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: DISTIL/NK data cleaning
step_2 = PrimitiveStep(primitive=index.get_primitive(
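Example No. 29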
from d3m import index
from d3m.metadata.base import ArgumentType, Context
from d3m.metadata.pipeline import Pipeline, PrimitiveStep
import sys

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: Denormalize primitive
step_0 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.denormalize.Common'))
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: DISTIL/NK Storc primitive
step_1 = PrimitiveStep(
    primitive=index.get_primitive('d3m.primitives.clustering.k_means.Sloth'))
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_hyperparameter(name='nclusters',
                          argument_type=ArgumentType.VALUE,
                          data=10)
step_1.add_hyperparameter(name='long_format',
                          argument_type=ArgumentType.VALUE,
                          data=True)
step_1.add_output('produce')
Example No. 30
    def _gen_pipeline(self):
        pipeline = d3m_pipeline.Pipeline()
        #define inputs.  This will be read in automatically as a Dataset object.
        pipeline.add_input(name='inputs')

        #step 0: Denormalize: join multiple tabular resource?
        # Why is there no entry point for Denormalize?

        #step 0: Dataset -> Dataframe
        step_0 = d3m_pipeline.PrimitiveStep(
            primitive_description=DatasetToDataFramePrimitive.metadata.query())
        step_0.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_0.add_output('produce')
        pipeline.add_step(step_0)

        # Step 1: Simple Profiler Column Role Annotation
        step_1 = d3m_pipeline.PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.schema_discovery.profiler.Common"))
        step_1.add_argument(
            name="inputs",
            argument_type=d3m_base.ArgumentType.CONTAINER,
            data_reference="steps.0.produce",
        )
        step_1.add_output("produce")
        pipeline.add_step(step_1)

        #step 2: ColumnParser
        step_2 = d3m_pipeline.PrimitiveStep(
            primitive_description=ColumnParserPrimitive.metadata.query())
        step_2.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.1.produce')
        step_2.add_output('produce')
        pipeline.add_step(step_2)

        #step 3: Imputer
        step_3 = d3m_pipeline.PrimitiveStep(primitive=index.get_primitive(
            'd3m.primitives.data_cleaning.imputer.SKlearn'))
        step_3.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_3.add_hyperparameter(name='use_semantic_types',
                                  argument_type=d3m_base.ArgumentType.VALUE,
                                  data=True)
        step_3.add_output('produce')
        pipeline.add_step(step_3)

        #step 4: Extract attributes from dataset into a dedicated dataframe
        step_4 = d3m_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_4.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.3.produce')
        step_4.add_output('produce')
        step_4.add_hyperparameter(
            name='semantic_types',
            argument_type=d3m_base.ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
        pipeline.add_step(step_4)

        #step 5: Binary encoding for categorical features
        step_5 = d3m_pipeline.PrimitiveStep(
            primitive_description=BinaryEncoderPrimitive.metadata.query())
        step_5.add_hyperparameter(name='min_binary',
                                  argument_type=d3m_base.ArgumentType.VALUE,
                                  data=2)
        step_5.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.4.produce')
        step_5.add_output('produce')
        pipeline.add_step(step_5)

        #step 6: Extract Targets
        step_6 = d3m_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_6.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_6.add_hyperparameter(
            name='semantic_types',
            argument_type=d3m_base.ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
        step_6.add_output('produce')
        pipeline.add_step(step_6)

        #step 7: transform targets dataframe into an ndarray
        step_7 = d3m_pipeline.PrimitiveStep(
            primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
        step_7.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.6.produce')
        step_7.add_output('produce')
        pipeline.add_step(step_7)

        #step 8: transform features dataframe into an ndarray
        step_8 = d3m_pipeline.PrimitiveStep(
            primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
        step_8.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.5.produce')
        step_8.add_output('produce')
        pipeline.add_step(step_8)
        attributes = 'steps.8.produce'
        targets = 'steps.7.produce'

        #step 9: run SparsePCA on the encoded attributes
        step_9 = d3m_pipeline.PrimitiveStep(
            primitive_description=SparsePCA.metadata.query())
        step_9.add_argument(
            name='inputs',
            argument_type=d3m_base.ArgumentType.CONTAINER,
            data_reference=attributes  #inputs here are the outputs from step 8
        )
        step_9.add_hyperparameter(name='n_components',
                                  argument_type=d3m_base.ArgumentType.VALUE,
                                  data=4)
        step_9.add_hyperparameter(name='beta',
                                  argument_type=d3m_base.ArgumentType.VALUE,
                                  data=1e-8)
        step_9.add_hyperparameter(name='alpha',
                                  argument_type=d3m_base.ArgumentType.VALUE,
                                  data=1e-3)
        step_9.add_hyperparameter(name='degree',
                                  argument_type=d3m_base.ArgumentType.VALUE,
                                  data=2)
        step_9.add_output('produce')
        pipeline.add_step(step_9)

        #step 10: convert the numpy-formatted PCA outputs to a dataframe
        step_10 = d3m_pipeline.PrimitiveStep(
            primitive_description=NDArrayToDataFramePrimitive.metadata.query())
        step_10.add_argument(name='inputs',
                             argument_type=d3m_base.ArgumentType.CONTAINER,
                             data_reference='steps.9.produce')
        step_10.add_output('produce')
        pipeline.add_step(step_10)

        #step 11: horizontally concatenate the PCA features with the encoded attributes
        step_11 = d3m_pipeline.PrimitiveStep(
            primitive_description=HorizontalConcatPrimitive.metadata.query())
        step_11.add_argument(name='left',
                             argument_type=d3m_base.ArgumentType.CONTAINER,
                             data_reference='steps.10.produce')
        step_11.add_argument(name='right',
                             argument_type=d3m_base.ArgumentType.CONTAINER,
                             data_reference='steps.5.produce')
        step_11.add_output('produce')
        pipeline.add_step(step_11)

        #step 12: gradient boosting regression on the concatenated features (inputs and outputs for sklearn wrappers are both dataframes)
        step_12 = d3m_pipeline.PrimitiveStep(
            primitive_description=d3m.primitives.regression.gradient_boosting.
            SKlearn.metadata.query())
        step_12.add_argument(name='inputs',
                             argument_type=d3m_base.ArgumentType.CONTAINER,
                             data_reference='steps.11.produce')
        step_12.add_argument(name='outputs',
                             argument_type=d3m_base.ArgumentType.CONTAINER,
                             data_reference='steps.6.produce')
        step_12.add_hyperparameter(name='n_estimators',
                                   argument_type=d3m_base.ArgumentType.VALUE,
                                   data=10000)
        step_12.add_hyperparameter(name='learning_rate',
                                   argument_type=d3m_base.ArgumentType.VALUE,
                                   data=0.001)
        step_12.add_hyperparameter(name='max_depth',
                                   argument_type=d3m_base.ArgumentType.VALUE,
                                   data=2)
        step_12.add_output('produce')
        pipeline.add_step(step_12)

        #step 13: generate a properly-formatted output dataframe from the dataframed prediction outputs using the input dataframe as a reference
        step_13 = d3m_pipeline.PrimitiveStep(
            primitive_description=ConstructPredictionsPrimitive.metadata.query(
            ))
        step_13.add_argument(
            name='inputs',
            argument_type=d3m_base.ArgumentType.CONTAINER,
            data_reference=
            'steps.12.produce'  #inputs here are the prediction column
        )
        step_13.add_argument(
            name='reference',
            argument_type=d3m_base.ArgumentType.CONTAINER,
            data_reference=
            'steps.1.produce'  #inputs here are the dataframe input dataset
        )
        step_13.add_output('produce')
        pipeline.add_step(step_13)

        # Final Output
        pipeline.add_output(name='output', data_reference='steps.13.produce')

        return pipeline