Example #1
    def _gen_pipeline(self):
        pipeline = meta_pipeline.Pipeline()
        pipeline.add_input(name='inputs')

        step_0 = meta_pipeline.PrimitiveStep(primitive_description=DataConversion.metadata.query())
        step_0.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference='inputs.0'
        )

        step_0.add_output('produce')
        pipeline.add_step(step_0)

        step_1 = meta_pipeline.PrimitiveStep(primitive_description=AdjacencySpectralEmbedding.metadata.query())
        step_1.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference='steps.0.produce'
        )
        step_1.add_hyperparameter(
            name='which_elbow',
            argument_type=ArgumentType.VALUE,
            data=1
        )
        step_1.add_hyperparameter(
            name='max_dimension',
            argument_type=ArgumentType.VALUE,
            data=2
        )
        step_1.add_hyperparameter(
            name='use_attributes',
            argument_type=ArgumentType.VALUE,
            data=False
        )

        step_1.add_output('produce')
        pipeline.add_step(step_1)

        step_2 = meta_pipeline.PrimitiveStep(primitive_description=RankClassification.metadata.query())
        step_2.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference='steps.1.produce'
        )

        step_2.add_output('produce')
        pipeline.add_step(step_2)

        # Adding output step to the pipeline
        pipeline.add_output(name='Predictions', data_reference='steps.2.produce')

        return pipeline
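
A minimal usage sketch for the example above (hedged: `GraphClassificationPipeline` is a hypothetical name for the class defining `_gen_pipeline`, and the d3m imports are assumed to be in scope):

import json

# Generate the pipeline and dump the standard D3M pipeline description.
pipeline = GraphClassificationPipeline()._gen_pipeline()
print(json.dumps(pipeline.to_json_structure(), indent=2))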
Example #2
    def _gen_pipeline(self):
        pipeline = meta_pipeline.Pipeline()
        pipeline.add_input(name='inputs')

        step_0 = meta_pipeline.PrimitiveStep(
            primitive_description=LoadGraphs.metadata.query())
        step_0.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')

        step_0.add_output('produce')
        pipeline.add_step(step_0)

        step_1 = meta_pipeline.PrimitiveStep(
            primitive_description=LargestConnectedComponent.metadata.query())
        step_1.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.0.produce')

        step_1.add_output('produce')
        pipeline.add_step(step_1)

        step_2 = meta_pipeline.PrimitiveStep(
            primitive_description=LaplacianSpectralEmbedding.metadata.query())
        step_2.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.1.produce')
        step_2.add_hyperparameter(name='max_dimension',
                                  argument_type=ArgumentType.VALUE,
                                  data=5)
        step_2.add_hyperparameter(name='use_attributes',
                                  argument_type=ArgumentType.VALUE,
                                  data=True)

        step_2.add_output('produce')
        pipeline.add_step(step_2)

        step_3 = meta_pipeline.PrimitiveStep(
            primitive_description=GaussianClustering.metadata.query())
        step_3.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_3.add_hyperparameter(name='max_clusters',
                                  argument_type=ArgumentType.VALUE,
                                  data=10)

        step_3.add_output('produce')
        pipeline.add_step(step_3)

        # Adding output step to the pipeline
        pipeline.add_output(name='Predictions',
                            data_reference='steps.3.produce')

        return pipeline
Example #3
    def _gen_pipeline(self):
        pipeline = meta_pipeline.Pipeline()
        pipeline.add_input(name='inputs')

        # Step 0: dataset_to_dataframe (default resource)
        step_0 = PrimitiveStep(primitive=index.get_primitive(
            'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
        step_0.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_0.add_output('produce')
        pipeline.add_step(step_0)

        # Step 1: dataset_to_dataframe (resource '1')
        step_1 = PrimitiveStep(primitive=index.get_primitive(
            'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
        step_1.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_1.add_hyperparameter(name='dataframe_resource',
                                  argument_type=ArgumentType.VALUE,
                                  data='1')
        step_1.add_output('produce')
        pipeline.add_step(step_1)

        # Step 2: dataset_to_dataframe (resource '2')
        step_2 = PrimitiveStep(primitive=index.get_primitive(
            'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
        step_2.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_2.add_hyperparameter(name='dataframe_resource',
                                  argument_type=ArgumentType.VALUE,
                                  data='2')
        step_2.add_output('produce')
        pipeline.add_step(step_2)

        step_3 = meta_pipeline.PrimitiveStep(
            primitive_description=EuclideanNomination.metadata.query())
        step_3.add_argument(name='inputs_1',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.1.produce')
        step_3.add_argument(name='inputs_2',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_3.add_argument(name='reference',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.0.produce')

        step_3.add_output('produce')
        pipeline.add_step(step_3)

        # Adding output step to the pipeline
        pipeline.add_output(name='Predictions',
                            data_reference='steps.3.produce')

        return pipeline
Example #4
    def _gen_pipeline(self):
        pipeline = meta_pipeline.Pipeline(context=Context.TESTING)
        pipeline.add_input(name='inputs')

        step_0 = meta_pipeline.PrimitiveStep(primitive_description=LargestConnectedComponent.metadata.query())
        step_0.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference='inputs.0'
        )

        step_0.add_output('produce')
        pipeline.add_step(step_0)

        step_1 = meta_pipeline.PrimitiveStep(primitive_description=OutOfSampleLaplacianSpectralEmbedding.metadata.query())
        step_1.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference='steps.0.produce'
        )

        step_1.add_output('produce')
        pipeline.add_step(step_1)

        step_2 = meta_pipeline.PrimitiveStep(primitive_description=GaussianClustering.metadata.query())
        step_2.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference='steps.1.produce'
        )

        step_2.add_output('produce')
        pipeline.add_step(step_2)

        # Adding output step to the pipeline
        pipeline.add_output(name='Predictions', data_reference='steps.2.produce')

        return pipeline
Example #5
    def _gen_pipeline(self):
        pipeline = meta_pipeline.Pipeline()
        pipeline.add_input(name='inputs')

        step_0 = meta_pipeline.PrimitiveStep(primitive_description=SpectralGraphClustering.metadata.query())

        step_0.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference='inputs.0'
        )
        step_0.add_output('produce')
        pipeline.add_step(step_0)

        # Adding output step to the pipeline
        pipeline.add_output(name='results', data_reference='steps.0.produce')

        return pipeline
Example #6
    def _gen_pipeline(self):
        pipeline = meta_pipeline.Pipeline()
        pipeline.add_input(name='inputs')

        step_0 = meta_pipeline.PrimitiveStep(
            primitive_description=SeededGraphMatching.metadata.query())
        step_0.add_argument(
            name='inputs',
            argument_type=meta_pipeline.metadata_base.ArgumentType.CONTAINER,
            data_reference='inputs.0')
        step_0.add_output('produce')
        pipeline.add_step(step_0)

        # Adding output step to the pipeline
        pipeline.add_output(name='Predictions',
                            data_reference='steps.0.produce')

        return pipeline
Example #7
def update_pipeline(pipeline_to_update, filename=None):
    """
    This function updates the pipeline's digests and version numbers

    Parameters
    ----------
    pipeline_json_structure: the pipeline in JSON form (WITHOUT) digests.  This or the `filename` parameter is mandatory
    filename: the filename of the pipeline json, so we can read it in

    :return a pipeline with updated digests
    """
    if pipeline_to_update is None and filename is None:
        raise ValueError("No pipeline json was given")
    elif pipeline_to_update is None:
        with open(filename, "r") as file:
            # NOTE: must be a pipeline with no digests, or recent digests
            # NOTE: reading this in as straight JSON doesn't work so we have to use the pipeline_module
            pipeline_to_update = pipeline_module.Pipeline.from_json(string_or_file=file).to_json_structure()
    else:
        try:
            pipeline_to_update = pipeline_module.Pipeline.from_json(json.dumps(pipeline_to_update)).to_json_structure()
        except Exception:
            # if the structure cannot be re-validated, fall back to it as given
            pass
    for step in pipeline_to_update['steps']:
        # rebuild the step from the currently installed primitive to get its up-to-date metadata
        primitive = pipeline_module.PrimitiveStep(
            primitive=d3m_index.get_primitive(
                step["primitive"]["python_path"]
            )
        )
        check_step = primitive.to_json_structure()

        # verify that the step's id and version match the installed primitive
        id_matches = check_step["primitive"]["id"] == step["primitive"]["id"]
        if not id_matches:
            step["primitive"]["id"] = check_step["primitive"]["id"]
        version_matches = check_step["primitive"]["version"] == step["primitive"]["version"]
        if not version_matches:
            step["primitive"]["version"] = check_step["primitive"]["version"]

    return pipeline_to_update
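
A hedged usage sketch for update_pipeline (the filename 'pipeline.json' is hypothetical; `json` is assumed to be imported as in the function above):

# Read a pipeline description saved without digests and refresh it.
updated = update_pipeline(None, filename='pipeline.json')

# Or pass an already-loaded JSON structure directly.
with open('pipeline.json') as file:
    structure = json.load(file)
updated = update_pipeline(structure)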
Example #8
    def _gen_pipeline(self):
        pipeline = meta_pipeline.Pipeline(context=Context.TESTING)
        pipeline.add_input(name='inputs')

        step_0 = meta_pipeline.PrimitiveStep(
            primitive_description=SeededGraphMatching.metadata.query())
        step_0.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_0.add_hyperparameter(name='reps',
                                  argument_type=ArgumentType.VALUE,
                                  data=10)
        step_0.add_hyperparameter(name='threshold',
                                  argument_type=ArgumentType.VALUE,
                                  data=0.1)
        step_0.add_output('produce')
        pipeline.add_step(step_0)

        # Adding output step to the pipeline
        pipeline.add_output(name='Predictions',
                            data_reference='steps.0.produce')

        return pipeline
Example #9
    def _gen_pipeline(self):
        # pipeline context is just metadata; ignore for now
        pipeline = meta_pipeline.Pipeline()
        # define inputs. This will be read in automatically as a Dataset object.
        pipeline.add_input(name='inputs')

        # Step 0: DatasetToDataFrame
        step_0 = meta_pipeline.PrimitiveStep(
            primitive_description=DatasetToDataFramePrimitive.metadata.query())
        step_0.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_0.add_output('produce')
        pipeline.add_step(step_0)

        # Step 1: Simple Profiler Column Role Annotation
        step_1 = meta_pipeline.PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.schema_discovery.profiler.Common"))
        step_1.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.0.produce",
        )
        step_1.add_output("produce")
        pipeline.add_step(step_1)

        # Step 2: ColumnParser
        step_2 = meta_pipeline.PrimitiveStep(
            primitive_description=ColumnParserPrimitive.metadata.query())
        step_2.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.1.produce')
        step_2.add_output('produce')
        pipeline.add_step(step_2)

        # Step 3: Extract Attributes
        step_3 = meta_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_3.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_3.add_output('produce')
        step_3.add_hyperparameter(
            name='semantic_types',
            argument_type=ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
        pipeline.add_step(step_3)

        # Step 4: Impute missing attributes
        step_4 = meta_pipeline.PrimitiveStep(
            primitive_description=SimpleImputerPrimitive.metadata.query())
        step_4.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.3.produce')
        step_4.add_output('produce')
        pipeline.add_step(step_4)

        # Step 5: Convert attributes to ndarray
        step_5 = meta_pipeline.PrimitiveStep(
            primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
        step_5.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.4.produce')
        step_5.add_output('produce')
        pipeline.add_step(step_5)

        # Step 6: Extract Targets
        step_6 = meta_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_6.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_6.add_output('produce')
        step_6.add_hyperparameter(
            name='semantic_types',
            argument_type=ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
        pipeline.add_step(step_6)

        # Step 7: Transform targets into an ndarray
        step_7 = meta_pipeline.PrimitiveStep(
            primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
        step_7.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.6.produce')
        step_7.add_output('produce')
        pipeline.add_step(step_7)

        # Step 8: use TensorMachinesBinaryClassification
        step_8 = meta_pipeline.PrimitiveStep(
            primitive_description=TensorMachinesBinaryClassification.metadata.
            query())
        step_8.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference='steps.5.produce'  # the attribute ndarray from step 5
        )
        step_8.add_argument(
            name='outputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference='steps.7.produce'  # the target ndarray from step 7
        )
        step_8.add_output('produce')
        pipeline.add_step(step_8)

        # Step 9: convert numpy-formatted prediction outputs to a dataframe
        step_9 = meta_pipeline.PrimitiveStep(
            primitive_description=NDArrayToDataFramePrimitive.metadata.query())
        step_9.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.8.produce')
        step_9.add_output('produce')
        pipeline.add_step(step_9)

        # Step 10: generate a properly-formatted output dataframe from the prediction outputs using the input dataframe as a reference
        step_10 = meta_pipeline.PrimitiveStep(
            primitive_description=ConstructPredictionsPrimitive.metadata.query(
            ))
        step_10.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference='steps.9.produce'  # the prediction column
        )
        step_10.add_argument(
            name='reference',
            argument_type=ArgumentType.CONTAINER,
            data_reference='steps.1.produce'  # the profiled input dataframe from step 1
        )
        step_10.add_output('produce')
        pipeline.add_step(step_10)

        # Adding output step to the pipeline
        pipeline.add_output(name='output', data_reference='steps.10.produce')

        return pipeline
Example #10
def generate_imputer_pipeline(task_type, random_id=False):
    if random_id:
        pipeline_id = str(uuid.uuid4())
    elif task_type == 'classification':
        pipeline_id = '168d3fbf-a3fe-456a-93a3-d2720ef8cb42'
    elif task_type == 'regression':
        pipeline_id = 'faeb3eb9-648f-4059-b067-791ebff47bc4'
    else:
        raise ValueError('Invalid task_type: {}'.format(task_type))

    d3m_index.register_primitive(
        RandomSamplingImputer.metadata.query()['python_path'],
        RandomSamplingImputer
    )

    pipeline = pipeline_module.Pipeline(pipeline_id)
    pipeline.add_input(name='inputs')
    step_counter = 0


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.dataset_to_dataframe.Common'
        )
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference='inputs.0'
    )
    step.add_output('produce')
    pipeline.add_step(step)
    raw_data_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.schema_discovery.profiler.Common'
        )
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=raw_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    profiled_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.column_parser.Common'
        )
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=profiled_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    parsed_data_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'
        )
    )
    step.add_hyperparameter(
        name='semantic_types', argument_type=metadata_base.ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/Attribute']
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=parsed_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    raw_attributes_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'
        )
    )
    step.add_hyperparameter(
        name='semantic_types', argument_type=metadata_base.ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=parsed_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    true_targets_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_preprocessing.random_sampling_imputer.BYU'
        )
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=raw_attributes_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    imputed_attributes_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    if task_type == 'regression':
        step = pipeline_module.PrimitiveStep(
            primitive=d3m_index.get_primitive(
                'd3m.primitives.regression.random_forest.SKlearn'
            )
        )
        step.add_hyperparameter(
            name='use_semantic_types',
            argument_type=metadata_base.ArgumentType.VALUE, data=True
        )
        step.add_argument(
            name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=imputed_attributes_data_reference
        )
        step.add_argument(
            name='outputs', argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=true_targets_data_reference
        )
        step.add_output('produce')
        pipeline.add_step(step)
        step_counter += 1


    elif task_type == 'classification':
        step = pipeline_module.PrimitiveStep(
            primitive=d3m_index.get_primitive(
                'd3m.primitives.classification.random_forest.SKlearn'
            )
        )
        step.add_hyperparameter(
            name='use_semantic_types',
            argument_type=metadata_base.ArgumentType.VALUE, data=True
        )
        step.add_argument(
            name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=imputed_attributes_data_reference
        )
        step.add_argument(
            name='outputs', argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=true_targets_data_reference
        )
        step.add_output('produce')
        pipeline.add_step(step)
        step_counter += 1

    else:
        raise ValueError('Invalid task_type: {}'.format(task_type))


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.construct_predictions.Common'
        )
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference='steps.{}.produce'.format(step_counter - 1)
    )
    step.add_argument(
        name='reference', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=raw_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    step_counter += 1


    pipeline.add_output(
        name='predictions',
        data_reference='steps.{}.produce'.format(step_counter - 1)
    )

    return pipeline
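
A hedged usage sketch (assumes the RandomSamplingImputer primitive and the d3m imports used above are installed and in scope):

# Build the fixed-id classification variant; pass random_id=True for a fresh uuid.
imputer_pipeline = generate_imputer_pipeline('classification')
print(imputer_pipeline.id)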
Example #11
def generate_profiler_pipeline(task_type, random_id=False):
    if random_id:
        pipeline_id = str(uuid.uuid4())
    elif task_type == 'classification':
        pipeline_id = 'f4ebb9c9-ef15-491d-9a39-595c20f3e78e'
    elif task_type == 'regression':
        pipeline_id = '9f5f6042-6582-494a-bc4b-92c7797a6614'
    else:
        raise ValueError('Invalid task_type: {}'.format(task_type))

    d3m_index.register_primitive(
        SemanticProfilerPrimitive.metadata.query()['python_path'],
        SemanticProfilerPrimitive
    )

    pipeline = pipeline_module.Pipeline(pipeline_id)
    pipeline.add_input(name='inputs')
    step_counter = 0


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.dataset_to_dataframe.Common'
        )
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference='inputs.0'
    )
    step.add_output('produce')
    pipeline.add_step(step)
    raw_data_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.schema_discovery.profiler.BYU'
        )
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=raw_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    profiled_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.column_parser.Common'
        )
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=profiled_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    parsed_data_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'
        )
    )
    step.add_hyperparameter(
        name='semantic_types', argument_type=metadata_base.ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/Attribute']
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=parsed_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    raw_attributes_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'
        )
    )
    step.add_hyperparameter(
        name='semantic_types', argument_type=metadata_base.ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=parsed_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    true_targets_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_cleaning.imputer.SKlearn'
        )
    )
    step.add_hyperparameter(
        name='use_semantic_types',
        argument_type=metadata_base.ArgumentType.VALUE, data=True
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=raw_attributes_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    imputed_attributes_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    if task_type == 'regression':
        step = pipeline_module.PrimitiveStep(
            primitive=d3m_index.get_primitive(
                'd3m.primitives.regression.random_forest.SKlearn'
            )
        )
        step.add_hyperparameter(
            name='use_semantic_types',
            argument_type=metadata_base.ArgumentType.VALUE, data=True
        )
        step.add_argument(
            name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=imputed_attributes_data_reference
        )
        step.add_argument(
            name='outputs', argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=true_targets_data_reference
        )
        step.add_output('produce')
        pipeline.add_step(step)
        step_counter += 1


    elif task_type == 'classification':
        step = pipeline_module.PrimitiveStep(
            primitive=d3m_index.get_primitive(
                'd3m.primitives.classification.random_forest.SKlearn'
            )
        )
        step.add_hyperparameter(
            name='use_semantic_types',
            argument_type=metadata_base.ArgumentType.VALUE, data=True
        )
        step.add_argument(
            name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=imputed_attributes_data_reference
        )
        step.add_argument(
            name='outputs', argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=true_targets_data_reference
        )
        step.add_output('produce')
        pipeline.add_step(step)
        step_counter += 1

    else:
        raise ValueError('Invalid task_type: {}'.format(task_type))


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.construct_predictions.Common'
        )
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference='steps.{}.produce'.format(step_counter - 1)
    )
    step.add_argument(
        name='reference', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=raw_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    step_counter += 1


    pipeline.add_output(
        name='predictions',
        data_reference='steps.{}.produce'.format(step_counter - 1)
    )

    return pipeline
Example #12
    def _gen_pipeline():
        # pipeline context is just metadata; ignore for now
        pipeline = meta_pipeline.Pipeline()
        # define inputs. This will be read in automatically as a Dataset object.
        pipeline.add_input(name='inputs')

        # DatasetToDataFrame
        step_0 = meta_pipeline.PrimitiveStep(
            primitive_description=DatasetToDataFramePrimitive.metadata.query())
        step_0.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_0.add_output('produce')
        pipeline.add_step(step_0)

        # Profiler to infer semantic types
        step_1 = meta_pipeline.PrimitiveStep(
            primitive_description=SimpleProfilerPrimitive.metadata.query())
        step_1.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.0.produce')
        step_1.add_output('produce')
        pipeline.add_step(step_1)

        # ColumnParser
        step_2 = meta_pipeline.PrimitiveStep(
            primitive_description=ColumnParserPrimitive.metadata.query())
        step_2.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.1.produce')
        step_2.add_output('produce')
        pipeline.add_step(step_2)

        # Extract Attributes
        step_3 = meta_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_3.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_3.add_output('produce')
        step_3.add_hyperparameter(
            name='semantic_types',
            argument_type=ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
        step_3.add_hyperparameter(name='exclude_columns',
                                  argument_type=ArgumentType.VALUE,
                                  data=[0, 1, 6, 7])
        pipeline.add_step(step_3)

        # Impute missing data and nans
        step_4 = meta_pipeline.PrimitiveStep(
            primitive_description=SKImputer.metadata.query())
        step_4.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.3.produce')
        step_4.add_output('produce')
        step_4.add_hyperparameter(name='use_semantic_types',
                                  argument_type=ArgumentType.VALUE,
                                  data=True)
        step_4.add_hyperparameter(name='return_result',
                                  argument_type=ArgumentType.VALUE,
                                  data='replace')
        pipeline.add_step(step_4)

        # Extract Targets
        step_5 = meta_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_5.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_5.add_output('produce')
        step_5.add_hyperparameter(
            name='semantic_types',
            argument_type=ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
        pipeline.add_step(step_5)

        # Transform attributes dataframe into an ndarray
        step_6 = meta_pipeline.PrimitiveStep(
            primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
        step_6.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.4.produce')
        step_6.add_output('produce')
        pipeline.add_step(step_6)

        # Run GRASTA
        step_7 = meta_pipeline.PrimitiveStep(
            primitive_description=GRASTA.metadata.query())
        step_7.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.6.produce')
        step_7.add_output('produce')
        pipeline.add_step(step_7)

        # Convert numpy-formatted attribute data to a dataframe
        step_8 = meta_pipeline.PrimitiveStep(
            primitive_description=NDArrayToDataFramePrimitive.metadata.query())
        step_8.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.7.produce')
        step_8.add_output('produce')
        pipeline.add_step(step_8)

        # Linear Regression on low-rank data (inputs and outputs for sklearns are both dataframes)
        step_9 = meta_pipeline.PrimitiveStep(
            primitive_description=SKLinearSVR.metadata.query())
        step_9.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.8.produce')
        step_9.add_argument(name='outputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.5.produce')
        step_9.add_output('produce')
        pipeline.add_step(step_9)

        # Finally generate a properly-formatted output dataframe from the prediction outputs using the input dataframe as a reference
        step_10 = meta_pipeline.PrimitiveStep(
            primitive_description=ConstructPredictionsPrimitive.metadata.query(
            ))
        step_10.add_argument(name='inputs',
                             argument_type=ArgumentType.CONTAINER,
                             data_reference='steps.9.produce')
        step_10.add_argument(name='reference',
                             argument_type=ArgumentType.CONTAINER,
                             data_reference='steps.0.produce')
        step_10.add_output('produce')
        pipeline.add_step(step_10)

        # Adding output step to the pipeline
        pipeline.add_output(name='output', data_reference='steps.10.produce')

        return pipeline
Example #13
def generate_metafeature_pipeline(task_type, random_id=False):
    if random_id:
        pipeline_id = str(uuid.uuid4())
    elif task_type == 'classification':
        pipeline_id = 'baa68a80-3a7d-472d-8d4f-54918cc1bd8f'
    elif task_type == 'regression':
        pipeline_id = '28e413f9-6085-4e34-b2c2-a5182a322a4b'
    else:
        raise ValueError('Invalid task_type: {}'.format(task_type))

    d3m_index.register_primitive(
        MetafeatureExtractor.metadata.query()['python_path'],
        MetafeatureExtractor
    )

    pipeline = pipeline_module.Pipeline(pipeline_id)
    pipeline.add_input(name='inputs')
    step_counter = 0


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.dataset_to_dataframe.Common'
        )
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference='inputs.0'
    )
    step.add_output('produce')
    pipeline.add_step(step)
    raw_data_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.schema_discovery.profiler.Common'
        )
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=raw_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    profiled_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.column_parser.Common'
        )
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=profiled_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    parsed_data_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.metalearning.metafeature_extractor.BYU'
        )
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=parsed_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    metafeatures_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'
        )
    )
    step.add_hyperparameter(
        name='semantic_types', argument_type=metadata_base.ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/Attribute']
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=parsed_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    raw_attributes_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'
        )
    )
    step.add_hyperparameter(
        name='semantic_types', argument_type=metadata_base.ArgumentType.VALUE,
        data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=parsed_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    true_targets_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_cleaning.imputer.SKlearn'
        )
    )
    step.add_hyperparameter(
        name='use_semantic_types',
        argument_type=metadata_base.ArgumentType.VALUE, data=True
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=raw_attributes_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    imputed_attributes_data_reference = 'steps.{}.produce'.format(step_counter)
    step_counter += 1


    if task_type == 'regression':
        step = pipeline_module.PrimitiveStep(
            primitive=d3m_index.get_primitive(
                'd3m.primitives.regression.random_forest.SKlearn'
            )
        )
        step.add_hyperparameter(
            name='use_semantic_types',
            argument_type=metadata_base.ArgumentType.VALUE, data=True
        )
        step.add_argument(
            name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=imputed_attributes_data_reference
        )
        step.add_argument(
            name='outputs', argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=true_targets_data_reference
        )
        step.add_output('produce')
        pipeline.add_step(step)
        step_counter += 1


    elif task_type == 'classification':
        step = pipeline_module.PrimitiveStep(
            primitive=d3m_index.get_primitive(
                'd3m.primitives.classification.random_forest.SKlearn'
            )
        )
        step.add_hyperparameter(
            name='use_semantic_types',
            argument_type=metadata_base.ArgumentType.VALUE, data=True
        )
        step.add_argument(
            name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=imputed_attributes_data_reference
        )
        step.add_argument(
            name='outputs', argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=true_targets_data_reference
        )
        step.add_output('produce')
        pipeline.add_step(step)
        step_counter += 1

    else:
        raise ValueError('Invalid task_type: {}'.format(task_type))


    step = pipeline_module.PrimitiveStep(
        primitive=d3m_index.get_primitive(
            'd3m.primitives.data_transformation.construct_predictions.Common'
        )
    )
    step.add_argument(
        name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference='steps.{}.produce'.format(step_counter - 1)
    )
    step.add_argument(
        name='reference', argument_type=metadata_base.ArgumentType.CONTAINER,
        data_reference=raw_data_data_reference
    )
    step.add_output('produce')
    pipeline.add_step(step)
    step_counter += 1


    pipeline.add_output(
        name='predictions',
        data_reference='steps.{}.produce'.format(step_counter - 1)
    )

    return pipeline
Example #14
big_pipeline.add_step(step_0)
step_0_output = step_0.add_output('output')

step_1 = pipeline_module.FittedPipelineStep(fitted1.id, fitted1)
step_1.add_input(pipeline_input)
big_pipeline.add_step(step_1)
step_1_output = step_1.add_output('output')

step_2 = pipeline_module.FittedPipelineStep(fitted2.id, fitted2)
step_2.add_input(pipeline_input)
big_pipeline.add_step(step_2)
step_2_output = step_2.add_output('output')

concat_step = pipeline_module.PrimitiveStep({
    "python_path": "d3m.primitives.dsbox.HorizontalConcat",
    "id": "dsbox-horizontal-concat",
    "version": "1.3.0",
    "name": "DSBox horizontal concat"
})
concat_step.add_argument(name='inputs1',
                         argument_type=pipeline_module.ArgumentType.CONTAINER,
                         data_reference=step_0_output)
concat_step.add_argument(name='inputs2',
                         argument_type=pipeline_module.ArgumentType.CONTAINER,
                         data_reference=step_1_output)
# concat_step.add_argument(name='inputs2', argument_type=pipeline_module.ArgumentType.CONTAINER, data_reference=step_2_output)
big_pipeline.add_step(concat_step)
concat_step_output = concat_step.add_output('produce')

# concat_step = pipeline_module.PrimitiveStep({
#     "python_path": "d3m.primitives.dsbox.VerticalConcat",
#     "id": "dsbox-vertical-concat",
Example #15
    def _gen_pipeline():
        # pipeline context is just metadata; ignore for now
        pipeline = meta_pipeline.Pipeline()
        # define inputs.  This will be read in automatically as a Dataset object.
        pipeline.add_input(name='inputs')

        # step 0: Dataset -> Dataframe
        step_0 = meta_pipeline.PrimitiveStep(primitive_description=DatasetToDataFramePrimitive.metadata.query())
        step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
        step_0.add_output('produce')
        pipeline.add_step(step_0)

        # Profiler to infer semantic types
        step_1 = meta_pipeline.PrimitiveStep(primitive_description=SimpleProfilerPrimitive.metadata.query())
        step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
        step_1.add_output('produce')
        pipeline.add_step(step_1)

        # Dataframe -> Column Parsing
        step_2 = meta_pipeline.PrimitiveStep(primitive_description=ColumnParserPrimitive.metadata.query())
        step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
        step_2.add_output('produce')
        pipeline.add_step(step_2)

        # Column -> Column Extract attributes
        step_3 = meta_pipeline.PrimitiveStep(primitive_description=ExtractColumnsBySemanticTypesPrimitive.metadata.query())
        step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce')
        step_3.add_output('produce')
        step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
        pipeline.add_step(step_3)

        # Attribute Dataframe -> NDArray
        step_4 = meta_pipeline.PrimitiveStep(primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
        step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.3.produce')
        step_4.add_output('produce')
        pipeline.add_step(step_4)

        # NDARRAY -> Cluster
        step_5 = meta_pipeline.PrimitiveStep(primitive_description=SSC_CVX.metadata.query())
        step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce')
        step_5.add_hyperparameter(name='n_clusters', argument_type=ArgumentType.VALUE, data=100)
        step_5.add_output('produce')
        pipeline.add_step(step_5)

        # Cluster -> Dataframe
        step_6 = meta_pipeline.PrimitiveStep(primitive_description=NDArrayToDataFramePrimitive.metadata.query())
        step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce')
        step_6.add_output('produce')
        pipeline.add_step(step_6)

        # Dataframe -> combine with original
        step_7 = meta_pipeline.PrimitiveStep(primitive_description=ConstructPredictionsPrimitive.metadata.query())
        step_7.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.6.produce')
        step_7.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
        step_7.add_output('produce')
        pipeline.add_step(step_7)

        # Final Output
        pipeline.add_output(name='output', data_reference='steps.7.produce')

        return pipeline
Example #16
def preprocessing_pipeline():
    preprocessing_pipeline = pipeline_module.Pipeline(
        'big', context=pipeline_module.PipelineContext.TESTING)
    initial_input = preprocessing_pipeline.add_input(name="inputs")
    denormalize_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.dsbox.Denormalize").metadata.query()))
    denormalize_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=initial_input)
    preprocessing_pipeline.add_step(denormalize_step)
    denormalize_step_output = denormalize_step.add_output('produce')
    to_dataframe_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.datasets.DatasetToDataFrame").metadata.query())
    )
    to_dataframe_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=denormalize_step_output)
    preprocessing_pipeline.add_step(to_dataframe_step)
    to_dataframe_step_output = to_dataframe_step.add_output("produce")
    extract_attribute_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.data.ExtractColumnsBySemanticTypes").metadata.
            query()))
    extract_attribute_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=to_dataframe_step_output)
    preprocessing_pipeline.add_step(extract_attribute_step)
    extract_attribute_step_output = extract_attribute_step.add_output(
        "produce")
    extract_attribute_step.add_hyperparameter(
        name='semantic_types',
        argument_type=pipeline_module.ArgumentType.VALUE,
        data=(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
            'https://metadata.datadrivendiscovery.org/types/Attribute',
        ))
    profiler_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.dsbox.Profiler").metadata.query()))
    profiler_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=extract_attribute_step_output)
    preprocessing_pipeline.add_step(profiler_step)
    profiler_step_output = profiler_step.add_output("produce")
    clean_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.dsbox.CleaningFeaturizer").metadata.query()))
    clean_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=profiler_step_output)
    preprocessing_pipeline.add_step(clean_step)
    clean_step_output = clean_step.add_output("produce")
    corex_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.dsbox.CorexText").metadata.query()))
    corex_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=clean_step_output)
    preprocessing_pipeline.add_step(corex_step)
    corex_step_output = corex_step.add_output("produce")
    encoder_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.dsbox.Encoder").metadata.query()))
    encoder_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=corex_step_output)
    preprocessing_pipeline.add_step(encoder_step)
    encoder_step_output = encoder_step.add_output("produce")
    impute_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.dsbox.MeanImputation").metadata.query()))
    impute_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=encoder_step_output)
    preprocessing_pipeline.add_step(impute_step)
    impute_step_output = impute_step.add_output("produce")
    scaler_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.dsbox.IQRScaler").metadata.query()))
    scaler_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=impute_step_output)
    preprocessing_pipeline.add_step(scaler_step)
    scaler_step_output = scaler_step.add_output("produce")
    extract_target_step = pipeline_module.PrimitiveStep(
        dict(
            d3m_index.get_primitive(
                "d3m.primitives.data.ExtractColumnsBySemanticTypes").metadata.
            query()))
    extract_target_step.add_argument(
        name="inputs",
        argument_type=pipeline_module.ArgumentType.CONTAINER,
        data_reference=to_dataframe_step_output)
    preprocessing_pipeline.add_step(extract_target_step)
    extract_target_step_output = extract_target_step.add_output("produce")
    extract_target_step.add_hyperparameter(
        name='semantic_types',
        argument_type=pipeline_module.ArgumentType.VALUE,
        data=('https://metadata.datadrivendiscovery.org/types/Target',
              'https://metadata.datadrivendiscovery.org/types/TrueTarget'))
    # preprocessing_pipeline.add_output(name="produce", data_reference=scaler_step_output)
    return preprocessing_pipeline, scaler_step_output, initial_input, extract_target_step_output
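
A hedged sketch of consuming the references returned above; the random-forest estimator step is an illustrative choice mirroring the step-building pattern used elsewhere in this section, not part of the original function:

pipeline, attributes_ref, input_ref, targets_ref = preprocessing_pipeline()

# Append an estimator fed by the preprocessed attributes and extracted targets.
estimator_step = pipeline_module.PrimitiveStep(
    dict(
        d3m_index.get_primitive(
            "d3m.primitives.regression.random_forest.SKlearn").metadata.query()))
estimator_step.add_argument(
    name="inputs",
    argument_type=pipeline_module.ArgumentType.CONTAINER,
    data_reference=attributes_ref)
estimator_step.add_argument(
    name="outputs",
    argument_type=pipeline_module.ArgumentType.CONTAINER,
    data_reference=targets_ref)
pipeline.add_step(estimator_step)
estimator_output = estimator_step.add_output("produce")
pipeline.add_output(name="predictions", data_reference=estimator_output)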
Example #17
    def _gen_pipeline(self):
        # pipeline context is just metadata; ignore for now
        pipeline = meta_pipeline.Pipeline()
        # define inputs. This will be read in automatically as a Dataset object.
        pipeline.add_input(name='inputs')

        # Step 0: DatasetToDataFrame
        step_0 = meta_pipeline.PrimitiveStep(
            primitive_description=DatasetToDataFramePrimitive.metadata.query())
        step_0.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_0.add_output('produce')
        pipeline.add_step(step_0)

        # Step 1: ColumnParser
        step_1 = meta_pipeline.PrimitiveStep(
            primitive_description=ColumnParserPrimitive.metadata.query())
        step_1.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.0.produce')
        step_1.add_output('produce')
        pipeline.add_step(step_1)

        # Step 2: Extract Attributes
        step_2 = meta_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_2.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.1.produce')
        step_2.add_output('produce')
        step_2.add_hyperparameter(
            name='semantic_types',
            argument_type=ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
        pipeline.add_step(step_2)

        # Step 3: Extract Targets
        step_3 = meta_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_3.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.1.produce')
        step_3.add_output('produce')
        step_3.add_hyperparameter(
            name='semantic_types',
            argument_type=ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
        pipeline.add_step(step_3)

        #Transform attributes dataframe into an ndarray
        step_4 = meta_pipeline.PrimitiveStep(
            primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
        step_4.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference=
            'steps.2.produce'  #inputs here are the outputs from step 2
        )
        step_4.add_output('produce')
        pipeline.add_step(step_4)

        #Run L1LowRank
        step_5 = meta_pipeline.PrimitiveStep(
            primitive_description=L1LowRank.metadata.query())
        step_5.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference=
            'steps.4.produce'  #inputs here are the outputs from step 4
        )
        step_5.add_output('produce')
        pipeline.add_step(step_5)

        # convert numpy-formatted attribute data to a dataframe
        step_6 = meta_pipeline.PrimitiveStep(
            primitive_description=NDArrayToDataFramePrimitive.metadata.query())
        step_6.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference=
            'steps.5.produce'  # inputs here are the outputs from step 5
        )
        step_6.add_output('produce')
        pipeline.add_step(step_6)

        #Ridge Regression on low-rank data (inputs and outputs for sklearn primitives are both dataframes)
        step_7 = meta_pipeline.PrimitiveStep(
            primitive_description=SKRidge.metadata.query())
        step_7.add_hyperparameter(name='max_iter',
                                  argument_type=ArgumentType.VALUE,
                                  data=10000)
        step_7.add_hyperparameter(name='tol',
                                  argument_type=ArgumentType.VALUE,
                                  data=0.01)
        step_7.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.6.produce')
        step_7.add_argument(name='outputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.3.produce')
        step_7.add_output('produce')
        pipeline.add_step(step_7)

        #finally generate a properly-formatted output dataframe from the prediction outputs using the input dataframe as a reference
        step_8 = meta_pipeline.PrimitiveStep(
            primitive_description=ConstructPredictionsPrimitive.metadata.query(
            ))
        step_8.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference=
            'steps.7.produce'  # inputs here are the prediction column
        )
        step_8.add_argument(
            name='reference',
            argument_type=ArgumentType.CONTAINER,
            data_reference=
            'steps.0.produce'  # inputs here are the dataframed input dataset
        )
        step_8.add_output('produce')
        pipeline.add_step(step_8)

        # Adding output step to the pipeline
        pipeline.add_output(name='output', data_reference='steps.8.produce')

        return pipeline
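Note that _gen_pipeline only builds a pipeline description; nothing is executed here. A minimal sketch of what a caller typically does with the result, assuming the d3m package is installed (PipelineCase is a hypothetical stand-in for whichever test class defines _gen_pipeline):

    # PipelineCase is a hypothetical holder of the _gen_pipeline method above.
    pipeline = PipelineCase()._gen_pipeline()
    # d3m pipeline descriptions can be serialized for the runtime or for submission.
    print(pipeline.to_json())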
Example #18
    def _gen_pipeline(self):
        #pipeline context is just metadata, ignore for now
        pipeline = meta_pipeline.Pipeline()
        #define inputs.  This will be read in automatically as a Dataset object.
        pipeline.add_input(name='inputs')

        # Dataset -> Dataframe
        step_0 = meta_pipeline.PrimitiveStep(
            primitive_description=DatasetToDataFramePrimitive.metadata.query())
        step_0.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_0.add_output('produce')
        pipeline.add_step(step_0)

        # Profiler to infer semantic types
        step_1 = meta_pipeline.PrimitiveStep(
            primitive_description=SimpleProfilerPrimitive.metadata.query())
        step_1.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.0.produce')
        step_1.add_output('produce')
        pipeline.add_step(step_1)

        # ColumnParser
        step_2 = meta_pipeline.PrimitiveStep(
            primitive_description=ColumnParserPrimitive.metadata.query())
        step_2.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.1.produce')
        step_2.add_output('produce')
        pipeline.add_step(step_2)

        # Extract Attributes
        step_3 = meta_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_3.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_3.add_output('produce')
        step_3.add_hyperparameter(
            name='semantic_types',
            argument_type=ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
        pipeline.add_step(step_3)

        # Extract Targets
        step_4 = meta_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_4.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_4.add_output('produce')
        step_4.add_hyperparameter(
            name='semantic_types',
            argument_type=ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
        pipeline.add_step(step_4)

        # Impute missing data and nans
        step_5 = meta_pipeline.PrimitiveStep(
            primitive_description=SKImputer.metadata.query())
        step_5.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.3.produce')
        step_5.add_output('produce')
        step_5.add_hyperparameter(name='use_semantic_types',
                                  argument_type=ArgumentType.VALUE,
                                  data=True)
        step_5.add_hyperparameter(name='return_result',
                                  argument_type=ArgumentType.VALUE,
                                  data='replace')
        pipeline.add_step(step_5)

        # Transform attributes dataframe into an ndarray
        step_6 = meta_pipeline.PrimitiveStep(
            primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
        step_6.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.5.produce')
        step_6.add_output('produce')
        pipeline.add_step(step_6)

        # Transform targets dataframe into an ndarray
        step_7 = meta_pipeline.PrimitiveStep(
            primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
        step_7.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.4.produce')
        step_7.add_output('produce')
        pipeline.add_step(step_7)

        # OWLRegression
        step_8 = meta_pipeline.PrimitiveStep(
            primitive_description=OWLRegression.metadata.query())
        step_8.add_hyperparameter(name='normalize',
                                  argument_type=ArgumentType.VALUE,
                                  data=True)
        step_8.add_hyperparameter(name='learning_rate',
                                  argument_type=ArgumentType.VALUE,
                                  data=2e-1)
        step_8.add_hyperparameter(name='tol',
                                  argument_type=ArgumentType.VALUE,
                                  data=1e-3)
        step_8.add_hyperparameter(name='weight_max_val',
                                  argument_type=ArgumentType.VALUE,
                                  data=175)
        step_8.add_hyperparameter(name='weight_max_off',
                                  argument_type=ArgumentType.VALUE,
                                  data=0)
        step_8.add_hyperparameter(name='weight_min_val',
                                  argument_type=ArgumentType.VALUE,
                                  data=0)
        step_8.add_hyperparameter(name='weight_min_off',
                                  argument_type=ArgumentType.VALUE,
                                  data=13)
        step_8.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.6.produce')
        step_8.add_argument(name='outputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.7.produce')
        step_8.add_output('produce')
        pipeline.add_step(step_8)

        # Convert numpy-formatted prediction outputs to a dataframe
        step_9 = meta_pipeline.PrimitiveStep(
            primitive_description=NDArrayToDataFramePrimitive.metadata.query())
        step_9.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.8.produce')
        step_9.add_output('produce')
        pipeline.add_step(step_9)

        # Generate a properly-formatted output dataframe from the dataframed prediction outputs using the input dataframe as a reference
        step_10 = meta_pipeline.PrimitiveStep(
            primitive_description=ConstructPredictionsPrimitive.metadata.query(
            ))
        step_10.add_argument(name='inputs',
                             argument_type=ArgumentType.CONTAINER,
                             data_reference='steps.9.produce')
        step_10.add_argument(name='reference',
                             argument_type=ArgumentType.CONTAINER,
                             data_reference='steps.0.produce')
        step_10.add_output('produce')
        pipeline.add_step(step_10)

        # Final Output
        pipeline.add_output(name='output', data_reference='steps.10.produce')

        return pipeline
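All of these generators repeat the same add_argument / add_output / add_step boilerplate. A small helper (hypothetical, not part of the d3m API) can collapse the pattern; since steps are indexed in insertion order, the data reference for a freshly added step is always 'steps.<index>.produce':

    def add_primitive_step(pipeline, primitive_class, inputs_ref, hyperparams=None):
        # Build a step from the primitive's metadata, as the examples above do.
        step = meta_pipeline.PrimitiveStep(
            primitive_description=primitive_class.metadata.query())
        step.add_argument(name='inputs',
                          argument_type=ArgumentType.CONTAINER,
                          data_reference=inputs_ref)
        for name, value in (hyperparams or {}).items():
            step.add_hyperparameter(name=name,
                                    argument_type=ArgumentType.VALUE,
                                    data=value)
        step.add_output('produce')
        pipeline.add_step(step)
        # Steps are numbered in insertion order.
        return 'steps.{}.produce'.format(len(pipeline.steps) - 1)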
Example #19
    def _gen_pipeline(self):
        pipeline = d3m_pipeline.Pipeline()
        #define inputs.  This will be read in automatically as a Dataset object.
        pipeline.add_input(name='inputs')

        # Note: a Denormalize step could join multiple tabular resources here,
        # but this pipeline starts directly from DatasetToDataFrame.

        #step 0: Dataset -> Dataframe
        step_0 = d3m_pipeline.PrimitiveStep(
            primitive_description=DatasetToDataFramePrimitive.metadata.query())
        step_0.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_0.add_output('produce')
        pipeline.add_step(step_0)

        # Step 1: Simple Profiler Column Role Annotation
        step_1 = d3m_pipeline.PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.schema_discovery.profiler.Common"))
        step_1.add_argument(
            name="inputs",
            argument_type=d3m_base.ArgumentType.CONTAINER,
            data_reference="steps.0.produce",
        )
        step_1.add_output("produce")
        pipeline.add_step(step_1)

        #step 2: ColumnParser
        step_2 = d3m_pipeline.PrimitiveStep(
            primitive_description=ColumnParserPrimitive.metadata.query())
        step_2.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.1.produce')
        step_2.add_output('produce')
        pipeline.add_step(step_2)

        #step 3: Extract attributes from dataset into a dedicated dataframe
        step_3 = d3m_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_3.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_3.add_output('produce')
        step_3.add_hyperparameter(
            name='semantic_types',
            argument_type=d3m_base.ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
        pipeline.add_step(step_3)

        #step 4: Extract Targets
        step_4 = d3m_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_4.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_4.add_output('produce')
        step_4.add_hyperparameter(
            name='semantic_types',
            argument_type=d3m_base.ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
        pipeline.add_step(step_4)

        #step 5: transform targets dataframe into an ndarray
        step_5 = d3m_pipeline.PrimitiveStep(
            primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
        step_5.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.4.produce')
        step_5.add_output('produce')
        pipeline.add_step(step_5)

        #step 6 : transform features dataframe into an ndarray
        step_6 = d3m_pipeline.PrimitiveStep(
            primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
        step_6.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.3.produce')
        step_6.add_output('produce')
        pipeline.add_step(step_6)
        attributes = 'steps.6.produce'
        targets = 'steps.5.produce'

        #step 7: call RFMPreconditionedGaussianKRR for regression
        step_7 = d3m_pipeline.PrimitiveStep(
            primitive_description=RFMPreconditionedGaussianKRR.metadata.query(
            ))
        step_7.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference=attributes)
        step_7.add_argument(name='outputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference=targets)
        step_7.add_output('produce')
        pipeline.add_step(step_7)

        #step 8: convert numpy-formatted prediction outputs to a dataframe
        step_8 = d3m_pipeline.PrimitiveStep(
            primitive_description=NDArrayToDataFramePrimitive.metadata.query())
        step_8.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.7.produce')
        step_8.add_output('produce')
        pipeline.add_step(step_8)

        #step 9: generate a properly-formatted output dataframe from the dataframed prediction outputs using the input dataframe as a reference
        step_9 = d3m_pipeline.PrimitiveStep(
            primitive_description=ConstructPredictionsPrimitive.metadata.query(
            ))
        step_9.add_argument(
            name='inputs',
            argument_type=d3m_base.ArgumentType.CONTAINER,
            data_reference=
            'steps.8.produce'  #inputs here are the prediction column
        )
        step_9.add_argument(
            name='reference',
            argument_type=d3m_base.ArgumentType.CONTAINER,
            data_reference=
            'steps.1.produce'  #reference here is the profiled input dataframe
        )
        step_9.add_output('produce')
        pipeline.add_step(step_9)

        # Final Output
        pipeline.add_output(name='output', data_reference='steps.9.produce')

        return pipeline
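To fit and run a generated pipeline, the d3m reference runtime is the usual entry point. A rough sketch under the assumption of a recent d3m release (the Runtime signature has changed across versions, so treat the details as approximate):

    from d3m import container
    from d3m.metadata import base as metadata_base
    from d3m.runtime import Runtime

    # Hypothetical dataset path; datasetDoc.json is the D3M dataset schema file.
    dataset = container.dataset.Dataset.load('file:///path/to/datasetDoc.json')
    runtime = Runtime(pipeline=pipeline, context=metadata_base.Context.TESTING)
    runtime.fit(inputs=[dataset])
    predictions = runtime.produce(inputs=[dataset]).values['outputs.0']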
Example #20
    def generate_ensemble_pipeline(self):
        if not self.pids:
            raise ValueError(
                "No candidate pipeline ids found, unable to generate the ensemble pipeline."
            )
        elif len(self.pids) == 1:
            raise ValueError(
                "Only 1 candidate pipeline id found, unable to generate the ensemble pipeline."
            )
        step_outputs = []
        self.big_pipeline, pipeline_output, pipeline_input, target = self.preprocessing_pipeline()
        for each_pid in self.pids:
            each_dsbox_fitted = FittedPipeline.load(self.pipeline_files_dir,
                                                    each_pid)
            each_runtime = each_dsbox_fitted.runtime
            each_fitted = runtime_module.FittedPipeline(
                each_pid,
                each_runtime,
                context=pipeline_module.PipelineContext.TESTING)
            each_step = pipeline_module.FittedPipelineStep(
                each_fitted.id, each_fitted)
            each_step.add_input(pipeline_input)
            self.big_pipeline.add_step(each_step)
            step_outputs.append(each_step.add_output('output'))

        concat_step = pipeline_module.PrimitiveStep({
            "python_path": "d3m.primitives.data_preprocessing.horizontal_concat.DSBOX",
            "id": "dsbox-horizontal-concat",
            "version": "1.3.0",
            "name": "DSBox horizontal concat"
        })
        for i in range(len(self.pids) - 1):
            each_concat_step = copy.deepcopy(concat_step)
            if i == 0:
                each_concat_step.add_argument(
                    name='inputs1',
                    argument_type=pipeline_module.ArgumentType.CONTAINER,
                    data_reference=step_outputs[i])
            else:
                each_concat_step.add_argument(
                    name='inputs1',
                    argument_type=pipeline_module.ArgumentType.CONTAINER,
                    data_reference=concat_step_output)
            each_concat_step.add_argument(
                name='inputs2',
                argument_type=pipeline_module.ArgumentType.CONTAINER,
                data_reference=step_outputs[i + 1])
            each_concat_step.add_hyperparameter(
                name="column_name",
                argument_type=pipeline_module.ArgumentType.VALUE,
                data=i)

            self.big_pipeline.add_step(each_concat_step)
            # update concat_step_output
            concat_step_output = each_concat_step.add_output('produce')

        encode_res_step = pipeline_module.PrimitiveStep(
            dict(
                d3m_index.get_primitive(
                    "d3m.primitives.data_preprocessing.encoder.DSBOX").
                metadata.query()))
        encode_res_step.add_argument(
            name="inputs",
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=concat_step_output)
        self.big_pipeline.add_step(encode_res_step)
        encode_res_step_output = encode_res_step.add_output("produce")

        concat_step1 = pipeline_module.PrimitiveStep(
            dict(
                d3m_index.get_primitive(
                    "d3m.primitives.data.HorizontalConcat").metadata.query()))
        concat_step1.add_argument(
            name="left",
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=encode_res_step_output)
        concat_step1.add_argument(
            name="right",
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=pipeline_output)
        concat_step1.add_hyperparameter(
            name="use_index",
            argument_type=pipeline_module.ArgumentType.VALUE,
            data=False)
        self.big_pipeline.add_step(concat_step1)
        concat_output1 = concat_step1.add_output("produce")

        model_step = pipeline_module.PrimitiveStep(
            dict(
                d3m_index.get_primitive(
                    self.final_step_primitive).metadata.query()))
        model_step.add_argument(
            name="inputs",
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=concat_output1)
        model_step.add_argument(
            name="outputs",
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=target)
        self.big_pipeline.add_step(model_step)
        big_output = model_step.add_output("produce")
        final_output = self.big_pipeline.add_output(name="final",
                                                    data_reference=big_output)
        self._logger.info("Ensemble pipeline created successfully")
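The concat loop above is a left fold: the first iteration joins the first two candidate outputs, and every later iteration joins the running concatenation with the next output. The same shape in plain Python, with placeholder references:

    from functools import reduce

    outputs = ['steps.0.output', 'steps.1.output', 'steps.2.output']  # hypothetical
    chained = reduce(lambda left, right: ('concat', left, right), outputs)
    # ('concat', ('concat', 'steps.0.output', 'steps.1.output'), 'steps.2.output')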
Example #21
    def _gen_pipeline(self):
        #pipeline context is just metadata, ignore for now
        pipeline = meta_pipeline.Pipeline()
        #define inputs.  This will be read in automatically as a Dataset object.
        pipeline.add_input(name='inputs')

        # Step 0: DatasetToDataFrame
        step_0 = meta_pipeline.PrimitiveStep(
            primitive_description=DatasetToDataFramePrimitive.metadata.query())
        step_0.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_0.add_output('produce')
        pipeline.add_step(step_0)

        # Step 1: Simple Profiler Column Role Annotation
        step_1 = meta_pipeline.PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.schema_discovery.profiler.Common"))
        step_1.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.0.produce",
        )
        step_1.add_output("produce")
        pipeline.add_step(step_1)

        # Step 2: ColumnParser
        step_2 = meta_pipeline.PrimitiveStep(
            primitive_description=ColumnParserPrimitive.metadata.query())
        step_2.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.1.produce')
        step_2.add_output('produce')
        pipeline.add_step(step_2)

        # Step 3: Extract Attributes
        step_3 = meta_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_3.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_3.add_output('produce')
        step_3.add_hyperparameter(
            name='semantic_types',
            argument_type=ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
        pipeline.add_step(step_3)

        # Step 4: Extract Targets
        step_4 = meta_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_4.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_4.add_output('produce')
        step_4.add_hyperparameter(
            name='semantic_types',
            argument_type=ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
        pipeline.add_step(step_4)

        #Transform attributes dataframe into an ndarray
        step_5 = meta_pipeline.PrimitiveStep(
            primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
        step_5.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference=
            'steps.3.produce'  #inputs here are the outputs from step 3
        )
        step_5.add_output('produce')
        pipeline.add_step(step_5)

        #Run RandomizedPolyPCA
        step_6 = meta_pipeline.PrimitiveStep(
            primitive_description=RandomizedPolyPCA.metadata.query())
        step_6.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference=
            'steps.5.produce'  #inputs here are the outputs from step 5
        )
        step_6.add_hyperparameter(name='n_components',
                                  argument_type=ArgumentType.VALUE,
                                  data=15)
        step_6.add_hyperparameter(name='degree',
                                  argument_type=ArgumentType.VALUE,
                                  data=2)
        step_6.add_output('produce')
        pipeline.add_step(step_6)

        # convert numpy-formatted attribute data to a dataframe
        step_7 = meta_pipeline.PrimitiveStep(
            primitive_description=NDArrayToDataFramePrimitive.metadata.query())
        step_7.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference=
            'steps.6.produce'  # inputs here are the outputs from step 6
        )
        step_7.add_output('produce')
        pipeline.add_step(step_7)

        #Gradient boosting regression on low-rank data (inputs and outputs for sklearn primitives are both dataframes)
        step_8 = meta_pipeline.PrimitiveStep(
            primitive_description=d3m.primitives.regression.gradient_boosting.
            SKlearn.metadata.query())
        step_8.add_argument(name='inputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.7.produce')
        step_8.add_argument(name='outputs',
                            argument_type=ArgumentType.CONTAINER,
                            data_reference='steps.4.produce')
        step_8.add_hyperparameter(name='n_estimators',
                                  argument_type=ArgumentType.VALUE,
                                  data=50000)
        step_8.add_hyperparameter(name='learning_rate',
                                  argument_type=ArgumentType.VALUE,
                                  data=0.002)
        step_8.add_hyperparameter(name='max_depth',
                                  argument_type=ArgumentType.VALUE,
                                  data=2)
        #step_8.add_hyperparameter(
        #    name = 'loss',
        #    argument_type = ArgumentType.VALUE,
        #    data = 'ls'
        #)
        step_8.add_output('produce')
        pipeline.add_step(step_8)

        #finally generate a properly-formatted output dataframe from the prediction outputs using the input dataframe as a reference
        step_9 = meta_pipeline.PrimitiveStep(
            primitive_description=ConstructPredictionsPrimitive.metadata.query(
            ))
        step_9.add_argument(
            name='inputs',
            argument_type=ArgumentType.CONTAINER,
            data_reference=
            'steps.8.produce'  # inputs here are the prediction column
        )
        step_9.add_argument(
            name='reference',
            argument_type=ArgumentType.CONTAINER,
            data_reference=
            'steps.1.produce'  # reference here is the profiled input dataframe
        )
        step_9.add_output('produce')
        pipeline.add_step(step_9)

        # Adding output step to the pipeline
        pipeline.add_output(name='output', data_reference='steps.9.produce')

        return pipeline
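Step 8 wraps scikit-learn's gradient boosting, so the hyperparameters map one-to-one onto the underlying estimator. For reference, the equivalent direct construction outside the D3M wrapper would be:

    from sklearn.ensemble import GradientBoostingRegressor

    model = GradientBoostingRegressor(n_estimators=50000,
                                      learning_rate=0.002,
                                      max_depth=2)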
Example #22
    def preprocessing_pipeline(self):
        preprocessing_pipeline = pipeline_module.Pipeline(
            'big', context=pipeline_module.PipelineContext.TESTING)
        initial_input = preprocessing_pipeline.add_input(name="inputs")
        denormalize_step = pipeline_module.PrimitiveStep(
            dict(
                d3m_index.get_primitive(
                    "d3m.primitives.normalization.denormalize.DSBOX").metadata.
                query()))
        denormalize_step.add_argument(
            name="inputs",
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=initial_input)
        preprocessing_pipeline.add_step(denormalize_step)
        denormalize_step_output = denormalize_step.add_output('produce')
        to_dataframe_step = pipeline_module.PrimitiveStep(
            dict(
                d3m_index.get_primitive(
                    "d3m.primitives.data_transformation.dataset_to_dataframe.Common"
                ).metadata.query()))
        to_dataframe_step.add_argument(
            name="inputs",
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=denormalize_step_output)
        preprocessing_pipeline.add_step(to_dataframe_step)
        to_dataframe_step_output = to_dataframe_step.add_output("produce")
        extract_attribute_step = pipeline_module.PrimitiveStep(
            dict(
                d3m_index.get_primitive(
                    "d3m.primitives.data_transformation.extract_columns_by_semantic_types.DataFrameCommon"
                ).metadata.query()))
        extract_attribute_step.add_argument(
            name="inputs",
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=to_dataframe_step_output)
        preprocessing_pipeline.add_step(extract_attribute_step)
        extract_attribute_step_output = extract_attribute_step.add_output(
            "produce")
        extract_attribute_step.add_hyperparameter(
            name='semantic_types',
            argument_type=pipeline_module.ArgumentType.VALUE,
            data=(
                'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
                'https://metadata.datadrivendiscovery.org/types/Attribute',
            ))
        profiler_step = pipeline_module.PrimitiveStep(
            dict(
                d3m_index.get_primitive(
                    "d3m.primitives.schema_discovery.profiler.DSBOX").metadata.
                query()))
        profiler_step.add_argument(
            name="inputs",
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=extract_attribute_step_output)
        preprocessing_pipeline.add_step(profiler_step)
        profiler_step_output = profiler_step.add_output("produce")
        clean_step = pipeline_module.PrimitiveStep(
            dict(
                d3m_index.get_primitive(
                    "d3m.primitives.data_cleaning.cleaning_featurizer.DSBOX").
                metadata.query()))
        clean_step.add_argument(
            name="inputs",
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=profiler_step_output)
        preprocessing_pipeline.add_step(clean_step)
        clean_step_output = clean_step.add_output("produce")
        corex_step = pipeline_module.PrimitiveStep(
            dict(
                d3m_index.get_primitive(
                    "d3m.primitives.feature_construction.corex_text.CorexText"
                ).metadata.query()))
        corex_step.add_argument(
            name="inputs",
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=clean_step_output)
        preprocessing_pipeline.add_step(corex_step)
        corex_step_output = corex_step.add_output("produce")
        encoder_step = pipeline_module.PrimitiveStep(
            dict(
                d3m_index.get_primitive(
                    "d3m.primitives.data_preprocessing.encoder.DSBOX").
                metadata.query()))
        encoder_step.add_argument(
            name="inputs",
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=corex_step_output)
        preprocessing_pipeline.add_step(encoder_step)
        encoder_step_output = encoder_step.add_output("produce")
        impute_step = pipeline_module.PrimitiveStep(
            dict(
                d3m_index.get_primitive(
                    "d3m.primitives.data_preprocessing.mean_imputation.DSBOX").
                metadata.query()))
        impute_step.add_argument(
            name="inputs",
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=encoder_step_output)
        preprocessing_pipeline.add_step(impute_step)
        impute_step_output = impute_step.add_output("produce")
        scaler_step = pipeline_module.PrimitiveStep(
            dict(
                d3m_index.get_primitive(
                    "d3m.primitives.normalization.iqr_scaler.DSBOX").metadata.
                query()))
        scaler_step.add_argument(
            name="inputs",
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=impute_step_output)
        preprocessing_pipeline.add_step(scaler_step)
        scaler_step_output = scaler_step.add_output("produce")
        extract_target_step = pipeline_module.PrimitiveStep(
            dict(
                d3m_index.get_primitive(
                    "d3m.primitives.data_transformation.extract_columns_by_semantic_types.DataFrameCommon"
                ).metadata.query()))
        extract_target_step.add_argument(
            name="inputs",
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=to_dataframe_step_output)
        preprocessing_pipeline.add_step(extract_target_step)
        extract_target_step_output = extract_target_step.add_output("produce")
        extract_target_step.add_hyperparameter(
            name='semantic_types',
            argument_type=pipeline_module.ArgumentType.VALUE,
            data=('https://metadata.datadrivendiscovery.org/types/Target',
                  'https://metadata.datadrivendiscovery.org/types/TrueTarget'))
        # preprocessing_pipeline.add_output(name="produce", data_reference=scaler_step_output)
        return preprocessing_pipeline, scaler_step_output, initial_input, extract_target_step_output
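The four returned values are exactly the hooks an ensemble builder needs: the pipeline to extend, the reference to the scaled feature matrix, the pipeline-level input, and the reference to the extracted targets. Example #20 above consumes them this way; condensed, with illustrative names:

    pipeline, features_ref, input_ref, target_ref = self.preprocessing_pipeline()
    # features_ref feeds downstream model steps as 'inputs', target_ref as
    # 'outputs', and input_ref is the Dataset entry point for fitted sub-pipelines.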
Example #23
    def _gen_pipeline(self):
        pipeline = d3m_pipeline.Pipeline()
        #define inputs.  This will be read in automatically as a Dataset object.
        pipeline.add_input(name='inputs')

        # Note: a Denormalize step could join multiple tabular resources here,
        # but this pipeline starts directly from DatasetToDataFrame.

        #step 0: Dataset -> Dataframe
        step_0 = d3m_pipeline.PrimitiveStep(
            primitive_description=DatasetToDataFramePrimitive.metadata.query())
        step_0.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='inputs.0')
        step_0.add_output('produce')
        pipeline.add_step(step_0)

        # Step 1: Simple Profiler Column Role Annotation
        step_1 = d3m_pipeline.PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.schema_discovery.profiler.Common"))
        step_1.add_argument(
            name="inputs",
            argument_type=d3m_base.ArgumentType.CONTAINER,
            data_reference="steps.0.produce",
        )
        step_1.add_output("produce")
        pipeline.add_step(step_1)

        #step 2: ColumnParser
        step_2 = d3m_pipeline.PrimitiveStep(
            primitive_description=ColumnParserPrimitive.metadata.query())
        step_2.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.1.produce')
        step_2.add_output('produce')
        pipeline.add_step(step_2)

        #step 3: Imputer
        step_3 = d3m_pipeline.PrimitiveStep(primitive=index.get_primitive(
            'd3m.primitives.data_cleaning.imputer.SKlearn'))
        step_3.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_3.add_hyperparameter(name='use_semantic_types',
                                  argument_type=d3m_base.ArgumentType.VALUE,
                                  data=True)
        step_3.add_output('produce')
        pipeline.add_step(step_3)

        #step 4: Extract attributes from dataset into a dedicated dataframe
        step_4 = d3m_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_4.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.3.produce')
        step_4.add_output('produce')
        step_4.add_hyperparameter(
            name='semantic_types',
            argument_type=d3m_base.ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
        pipeline.add_step(step_4)

        #step 5: Binary encoding for categorical features
        step_5 = d3m_pipeline.PrimitiveStep(
            primitive_description=BinaryEncoderPrimitive.metadata.query())
        step_5.add_hyperparameter(name='min_binary',
                                  argument_type=d3m_base.ArgumentType.VALUE,
                                  data=2)
        step_5.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.4.produce')
        step_5.add_output('produce')
        pipeline.add_step(step_5)

        #step 6: Extract Targets
        step_6 = d3m_pipeline.PrimitiveStep(
            primitive_description=ExtractColumnsBySemanticTypesPrimitive.
            metadata.query())
        step_6.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.2.produce')
        step_6.add_hyperparameter(
            name='semantic_types',
            argument_type=d3m_base.ArgumentType.VALUE,
            data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
        step_6.add_output('produce')
        pipeline.add_step(step_6)

        #step 7: transform targets dataframe into an ndarray
        step_7 = d3m_pipeline.PrimitiveStep(
            primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
        step_7.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.6.produce')
        step_7.add_output('produce')
        pipeline.add_step(step_7)

        #step 8 : transform features dataframe into an ndarray
        step_8 = d3m_pipeline.PrimitiveStep(
            primitive_description=DataFrameToNDArrayPrimitive.metadata.query())
        step_8.add_argument(name='inputs',
                            argument_type=d3m_base.ArgumentType.CONTAINER,
                            data_reference='steps.5.produce')
        step_8.add_output('produce')
        pipeline.add_step(step_8)
        attributes = 'steps.8.produce'
        targets = 'steps.7.produce'

        #step 9: run SparsePCA on the attributes
        step_9 = d3m_pipeline.PrimitiveStep(
            primitive_description=SparsePCA.metadata.query())
        step_9.add_argument(
            name='inputs',
            argument_type=d3m_base.ArgumentType.CONTAINER,
            data_reference=attributes  #inputs here are the outputs from step 8
        )
        step_9.add_hyperparameter(name='n_components',
                                  argument_type=d3m_base.ArgumentType.VALUE,
                                  data=4)
        step_9.add_hyperparameter(name='beta',
                                  argument_type=d3m_base.ArgumentType.VALUE,
                                  data=1e-8)
        step_9.add_hyperparameter(name='alpha',
                                  argument_type=d3m_base.ArgumentType.VALUE,
                                  data=1e-3)
        step_9.add_hyperparameter(name='degree',
                                  argument_type=d3m_base.ArgumentType.VALUE,
                                  data=2)
        step_9.add_output('produce')
        pipeline.add_step(step_9)

        #step 10: convert the numpy-formatted PCA features to a dataframe
        step_10 = d3m_pipeline.PrimitiveStep(
            primitive_description=NDArrayToDataFramePrimitive.metadata.query())
        step_10.add_argument(name='inputs',
                             argument_type=d3m_base.ArgumentType.CONTAINER,
                             data_reference='steps.9.produce')
        step_10.add_output('produce')
        pipeline.add_step(step_10)

        #step 11: horizontally concatenate the PCA features with the encoded attributes
        step_11 = d3m_pipeline.PrimitiveStep(
            primitive_description=HorizontalConcatPrimitive.metadata.query())
        step_11.add_argument(name='left',
                             argument_type=d3m_base.ArgumentType.CONTAINER,
                             data_reference='steps.10.produce')
        step_11.add_argument(name='right',
                             argument_type=d3m_base.ArgumentType.CONTAINER,
                             data_reference='steps.5.produce')
        step_11.add_output('produce')
        pipeline.add_step(step_11)

        #Gradient boosting regression on the augmented features (inputs and outputs for sklearn primitives are both dataframes)
        step_12 = d3m_pipeline.PrimitiveStep(
            primitive_description=d3m.primitives.regression.gradient_boosting.
            SKlearn.metadata.query())
        step_12.add_argument(name='inputs',
                             argument_type=d3m_base.ArgumentType.CONTAINER,
                             data_reference='steps.11.produce')
        step_12.add_argument(name='outputs',
                             argument_type=d3m_base.ArgumentType.CONTAINER,
                             data_reference='steps.6.produce')
        step_12.add_hyperparameter(name='n_estimators',
                                   argument_type=d3m_base.ArgumentType.VALUE,
                                   data=10000)
        step_12.add_hyperparameter(name='learning_rate',
                                   argument_type=d3m_base.ArgumentType.VALUE,
                                   data=0.001)
        step_12.add_hyperparameter(name='max_depth',
                                   argument_type=d3m_base.ArgumentType.VALUE,
                                   data=2)
        step_12.add_output('produce')
        pipeline.add_step(step_12)

        #step 13: generate a properly-formatted output dataframe from the dataframed prediction outputs using the input dataframe as a reference
        step_13 = d3m_pipeline.PrimitiveStep(
            primitive_description=ConstructPredictionsPrimitive.metadata.query(
            ))
        step_13.add_argument(
            name='inputs',
            argument_type=d3m_base.ArgumentType.CONTAINER,
            data_reference=
            'steps.12.produce'  #inputs here are the prediction column
        )
        step_13.add_argument(
            name='reference',
            argument_type=d3m_base.ArgumentType.CONTAINER,
            data_reference=
            'steps.1.produce'  #reference here is the profiled input dataframe
        )
        step_13.add_output('produce')
        pipeline.add_step(step_13)

        # Final Output
        pipeline.add_output(name='output', data_reference='steps.13.produce')

        return pipeline
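Steps 9 through 11 implement feature augmentation: the sparse-PCA components are concatenated alongside the original encoded attributes rather than replacing them, and the regressor sees both. A standalone sketch of the same idea using scikit-learn's SparsePCA as a stand-in (the pipeline's SparsePCA primitive takes different hyperparameters such as beta and degree, so this only illustrates the concatenation pattern):

    import numpy as np
    import pandas as pd
    from sklearn.decomposition import SparsePCA  # stand-in for the primitive above

    X = pd.DataFrame(np.random.rand(100, 8))
    pca_features = pd.DataFrame(SparsePCA(n_components=4).fit_transform(X))
    pca_features.columns = ['pc%d' % i for i in range(4)]
    augmented = pd.concat([pca_features, X], axis=1)  # low-rank features + originals
    print(augmented.shape)  # (100, 12)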
Example #24
    def generate_ensemble_pipeline(self):
        """
            Function used to generate the Pipeline for ensemble tuning
        """
        if not self.pids:
            raise ValueError(
                "No candidate pipeline ids found, unable to generate the ensemble pipeline."
            )
        elif len(self.pids) == 1:
            raise ValueError(
                "Only 1 candidate pipeline id found, unable to generate the ensemble pipeline."
            )

        step_outputs = []
        self.voting_pipeline = pipeline_module.Pipeline(
            'voting', context=pipeline_module.PipelineContext.TESTING)
        pipeline_input = self.voting_pipeline.add_input(name='inputs')

        for each_pid in self.pids:
            each_dsbox_fitted = FittedPipeline.load(self.pipeline_files_dir,
                                                    each_pid)
            each_runtime = each_dsbox_fitted.runtime
            each_fitted = runtime_module.FittedPipeline(
                each_pid,
                each_runtime,
                context=pipeline_module.PipelineContext.TESTING)
            each_step = pipeline_module.FittedPipelineStep(
                each_fitted.id, each_fitted)
            each_step.add_input(pipeline_input)
            self.voting_pipeline.add_step(each_step)
            step_outputs.append(each_step.add_output('output'))

        concat_step = pipeline_module.PrimitiveStep({
            "python_path": "d3m.primitives.data_preprocessing.vertical_concatenate.DSBOX",
            "id": "dsbox-vertical-concat",
            # "version": "1.3.0",
            "name": "DSBox vertically concat"
        })

        concat_step_output = None
        for i in range(len(self.pids) - 1):
            each_concat_step = copy.deepcopy(concat_step)
            if i == 0:
                each_concat_step.add_argument(
                    name='inputs1',
                    argument_type=pipeline_module.ArgumentType.CONTAINER,
                    data_reference=step_outputs[i])
            else:
                each_concat_step.add_argument(
                    name='inputs1',
                    argument_type=pipeline_module.ArgumentType.CONTAINER,
                    data_reference=concat_step_output)
            each_concat_step.add_argument(
                name='inputs2',
                argument_type=pipeline_module.ArgumentType.CONTAINER,
                data_reference=step_outputs[i + 1])

            self.voting_pipeline.add_step(each_concat_step)

            # update concat_step_output
            concat_step_output = each_concat_step.add_output('produce')

        vote_step = pipeline_module.PrimitiveStep({
            "python_path": "d3m.primitives.data_preprocessing.ensemble_voting.DSBOX",
            "id": "dsbox-ensemble-voting",
            "version": "1.3.0",
            "name": "DSBox ensemble voting"
        })

        vote_step.add_argument(
            name='inputs',
            argument_type=pipeline_module.ArgumentType.CONTAINER,
            data_reference=concat_step_output)
        self.voting_pipeline.add_step(vote_step)
        voting_output = vote_step.add_output('produce')

        self.voting_pipeline.add_output(name='Metafeatures',
                                        data_reference=voting_output)
        self._logger.info("Ensemble pipeline created successfully")
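The vertical-concat plus ensemble-voting pair amounts to majority voting over the candidate pipelines' stacked predictions. Conceptually (not the DSBox implementation) this is the same idea as scikit-learn's hard-voting ensemble:

    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier

    # Each member plays the role of one candidate pipeline; the ensemble votes.
    ensemble = VotingClassifier(estimators=[('lr', LogisticRegression()),
                                            ('dt', DecisionTreeClassifier())],
                                voting='hard')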