Example #1
def create_pipeline(metric: str) -> Pipeline:

    # create the basic pipeline
    kanine_pipeline = Pipeline(context=PipelineContext.TESTING)
    kanine_pipeline.add_input(name='inputs')

    # Denormalize so that we have a single dataframe in the dataset
    step = PrimitiveStep(
        primitive_description=DenormalizePrimitive.metadata.query())
    step.add_argument(name='inputs',
                      argument_type=ArgumentType.CONTAINER,
                      data_reference='inputs.0')
    step.add_output('produce')
    kanine_pipeline.add_step(step)

    # step 1 - kanine classification
    step = PrimitiveStep(primitive_description=Kanine.metadata.query())
    step.add_argument(name='inputs',
                      argument_type=ArgumentType.CONTAINER,
                      data_reference='steps.0.produce')
    step.add_argument(name='outputs',
                      argument_type=ArgumentType.CONTAINER,
                      data_reference='steps.0.produce')
    step.add_output('produce')
    kanine_pipeline.add_step(step)

    # Adding output step to the pipeline
    kanine_pipeline.add_output(name='output', data_reference='steps.1.produce')

    return kanine_pipeline
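A minimal usage sketch, assuming the d3m core package and the primitives imported by this module are installed; the metric value is purely illustrative:

kanine_pipeline = create_pipeline(metric='f1Macro')  # illustrative metric name
print(kanine_pipeline.to_json())  # a Pipeline serializes to the D3M pipeline JSON schema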
Example #2
def _prepend_pipeline(base: pipeline.Pipeline,
                      prepend: pipeline.Pipeline) -> pipeline.Pipeline:
    # wrap pipeline in a sub pipeline - d3m core node replacement function doesn't work otherwise
    subpipeline = pipeline.SubpipelineStep(pipeline=base)

    # If there isn't a placeholder, return the prepended pipeline as-is
    if not any(isinstance(s, pipeline.PlaceholderStep) for s in prepend.steps):
        return prepend

    # find the placeholder node in the prepend and replace it with the base sub pipeline
    for i, step in enumerate(prepend.steps):
        if isinstance(step, pipeline.PlaceholderStep):
            # set inputs/outputs manually since the replace doesn't infer them
            for input_ref in step.inputs:
                subpipeline.add_input(input_ref)
            for output_id in step.outputs:
                subpipeline.add_output(output_id)

            prepend.replace_step(i, subpipeline)
            return prepend

    logger.warning(
        f'Failed to prepend pipeline {prepend.id} - continuing with base unmodified'
    )
    return base
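For illustration, a caller would typically build a preprocessing pipeline that ends in a PlaceholderStep and then splice a base pipeline into it. The sketch below makes that assumption; the data references and the base pipeline are hypothetical:

from d3m.metadata import pipeline as pipeline_module

prepend = pipeline_module.Pipeline()
prepend.add_input(name='inputs')
placeholder = pipeline_module.PlaceholderStep()
placeholder.add_input('inputs.0')  # hypothetical data reference
placeholder.add_output('produce')
prepend.add_step(placeholder)
prepend.add_output(name='output', data_reference='steps.0.produce')

combined = _prepend_pipeline(base=base_pipeline, prepend=prepend)  # base_pipeline assumed to exist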
Example #3
def create_pipeline(metric: str) -> Pipeline:

    # create the basic pipeline
    vertex_nomination_pipeline = Pipeline(context=PipelineContext.TESTING)
    vertex_nomination_pipeline.add_input(name='inputs')

    # step 0 - extract the graphs
    step = PrimitiveStep(primitive_description=DistilSingleGraphLoaderPrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
    step.add_output('produce')
    step.add_output('produce_target')
    vertex_nomination_pipeline.add_step(step)

    # step 1 - predict links
    step = PrimitiveStep(primitive_description=DistilLinkPredictionPrimitive.metadata.query())
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
    step.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce_target')
    step.add_hyperparameter('metric', ArgumentType.VALUE, metric)
    step.add_output('produce')
    vertex_nomination_pipeline.add_step(step)

    # Adding output step to the pipeline
    vertex_nomination_pipeline.add_output(name='output', data_reference='steps.1.produce')

    return vertex_nomination_pipeline
Example #4
def create_pipeline(metric: str) -> Pipeline:

    # create the basic pipeline
    vertex_classification_pipeline = Pipeline(context=PipelineContext.TESTING)
    vertex_classification_pipeline.add_input(name='inputs')

    # step 0 - extract the graphs
    step = PrimitiveStep(
        primitive_description=VertexClassificationParser.metadata.query())
    step.add_argument(name='inputs',
                      argument_type=ArgumentType.CONTAINER,
                      data_reference='inputs.0')
    step.add_output('produce')
    vertex_classification_pipeline.add_step(step)

    # step 1 - classify
    step = PrimitiveStep(
        primitive_description=VertexClassification.metadata.query())
    step.add_argument(name='inputs',
                      argument_type=ArgumentType.CONTAINER,
                      data_reference='steps.0.produce')
    step.add_hyperparameter('jvm_memory', ArgumentType.VALUE, 0.6)
    step.add_output('produce')
    vertex_classification_pipeline.add_step(step)

    # Adding output step to the pipeline
    vertex_classification_pipeline.add_output(name='output',
                                              data_reference='steps.1.produce')

    return vertex_classification_pipeline
Example #5
def load_pipeline(pipeline):
    with open(pipeline) as _pipeline:
        if pipeline.endswith('.json'):
            pipeline = Pipeline.from_json(_pipeline)
        else:
            pipeline = Pipeline.from_yaml(_pipeline)

    return pipeline
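For illustration, the file extension decides which parser is used (paths are hypothetical):

json_pipeline = load_pipeline('pipelines/example.json')  # parsed with Pipeline.from_json
yaml_pipeline = load_pipeline('pipelines/example.yaml')  # any other extension falls through to Pipeline.from_yaml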
Example #6
    def create_pipeline_json(self, prim_dict):
        """
        Generate pipeline.json
        """
        name = "Pipeline for evaluation"
        pipeline_id = self.id #+ "_" + str(self.rank)
        pipeline_description = Pipeline(pipeline_id=pipeline_id, context=Context.EVALUATION, name=name)
        for ip in self.inputs:
            pipeline_description.add_input(name=ip['name'])

        num = self.num_steps()
        for i in range(num):
            p = prim_dict[self.primitives[i]]
            pdesc = {}
            pdesc['id'] = p.id
            pdesc['version'] = p.primitive_class.version
            pdesc['python_path'] = p.primitive_class.python_path
            pdesc['name'] = p.primitive_class.name
            pdesc['digest'] = p.primitive_class.digest
            step = PrimitiveStep(primitive_description=pdesc)

            for name, value in self.primitives_arguments[i].items():
                origin = value['origin']
                argument_type = ArgumentType.CONTAINER
                step.add_argument(name=name, argument_type=argument_type, data_reference=value['data'])
            step.add_output(output_id=p.primitive_class.produce_methods[0])
            if self.hyperparams[i] is not None:
                for name, value in self.hyperparams[i].items():
                    step.add_hyperparameter(name=name, argument_type=ArgumentType.VALUE, data=value)
            pipeline_description.add_step(step)

        for op in self.outputs:
            pipeline_description.add_output(data_reference=op[2], name=op[3])

        self.pipeline_description = pipeline_description
Example #7
    def __setstate__(self, state: typing.Dict) -> None:
        """
        This method is used for unpickling the object. It takes a dictionary
        of the object's saved state and restores the object to that state.

        Args:
            state: typing.Dict
                Dictionary of the object's picklable state.

        Returns:
            None
        """

        # print("[INFO] Set state called!")

        fitted = state['fitted_pipe']
        del state['fitted_pipe']

        structure = state['pipeline']
        state['pipeline'] = Pipeline.from_json_structure(structure)
        random_seed = state['random_seed']

        run = Runtime(state['pipeline'],
                      fitted_pipeline_id=state['id'],
                      random_seed=random_seed,
                      volumes_dir=FittedPipeline.runtime_setting.volumes_dir,
                      log_dir=FittedPipeline.runtime_setting.log_dir)
        run.steps_state = fitted

        state['runtime'] = run

        self.__dict__ = state
Example #8
def load_pipeline(pipeline_file: typing.Union[str, typing.Dict]):
    """
    Load a pipeline from a pipeline URI or a pipeline JSON structure.

    Parameters
    ----------
    pipeline_file: Union[str, dict]
        The URI pointing to a JSON file describing a pipeline, or a dict holding
        the pipeline's JSON structure.

    Returns
    -------
    pipeline: Pipeline
        A Pipeline object, or None if a dict input cannot be parsed.

    """
    if isinstance(pipeline_file, dict):
        try:
            with d3m_utils.silence():
                pipeline = Pipeline.from_json_structure(pipeline_file)
        except Exception:
            pipeline = None
    else:
        with d3m_utils.silence():
            pipeline = get_pipeline(pipeline_path=pipeline_file,
                                    load_all_primitives=False)
    return pipeline
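An illustrative pair of calls, assuming a pipeline description saved at a hypothetical path; the dict form is convenient when the JSON structure is already in memory:

import json

pipeline_from_uri = load_pipeline('/tmp/pipeline.json')  # resolved through get_pipeline
with open('/tmp/pipeline.json') as f:
    pipeline_from_dict = load_pipeline(json.load(f))  # parsed with Pipeline.from_json_structure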
Example #9
def load_pipeline(path, tunables=True, defaults=True):
    """Load a d3m json or yaml pipeline."""

    if not os.path.exists(path):
        base_path = os.path.abspath(os.path.dirname(__file__))

        path = os.path.join('templates', path)
        path = os.path.join(base_path, path)

    if not os.path.isfile(path):
        raise ValueError('Could not find pipeline: {}'.format(path))

    LOGGER.warning('Loading pipeline from %s', path)
    with open(path) as pipeline:
        if path.endswith(('.yml', '.yaml')):
            data = yaml.safe_load(pipeline)

        else:
            data = json.load(pipeline)

    pipeline = Pipeline.from_json_structure(data)

    if tunables:
        # extract tunable hyperparameters
        tunable_hyperparameters = extract_tunable_hyperparams(pipeline)

        return pipeline, tunable_hyperparameters

    return pipeline
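A usage sketch, assuming a template shipped next to this module (the file name is hypothetical); with tunables left at True the function returns a tuple:

pipeline, tunable_hyperparams = load_pipeline('single_table_classification.yml')
pipeline_only = load_pipeline('single_table_classification.yml', tunables=False)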
Example #10
def generate_pipeline(pipeline_path: str,
                      dataset_path: str,
                      problem_doc_path: str,
                      resolver: Resolver = None) -> Runtime:
    """
    Simplified interface that fits a pipeline with a dataset.

    Parameters
    ----------
    pipeline_path:
        Path to the pipeline description
    dataset_path:
        Path to the datasetDoc.json
    problem_doc_path:
        Path to the problemDoc.json
    resolver: Resolver
        Resolver to use.
    """

    # Pipeline description
    pipeline_description = None
    if '.json' in pipeline_path:
        with open(pipeline_path) as pipeline_file:
            pipeline_description = Pipeline.from_json(
                string_or_file=pipeline_file, resolver=resolver)
    else:
        with open(pipeline_path) as pipeline_file:
            pipeline_description = Pipeline.from_yaml(
                string_or_file=pipeline_file, resolver=resolver)

    # Problem Doc
    problem_doc = load_problem_doc(problem_doc_path)

    # Dataset
    if 'file:' not in dataset_path:
        dataset_path = 'file://{dataset_path}'.format(
            dataset_path=os.path.abspath(dataset_path))

    dataset = D3MDatasetLoader().load(dataset_uri=dataset_path)
    # Adding Metadata to Dataset
    dataset = add_target_columns_metadata(dataset, problem_doc)

    # Pipeline
    pipeline_runtime = Runtime(pipeline_description)
    # Fitting Pipeline
    pipeline_runtime.fit(inputs=[dataset])
    return pipeline_runtime
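A sketch of driving this helper end to end, with placeholder paths laid out in the usual D3M seed-dataset structure; the fitted runtime it returns can then be used to produce predictions on held-out data:

fitted_runtime = generate_pipeline(
    pipeline_path='pipeline.json',  # placeholder paths
    dataset_path='TRAIN/dataset_TRAIN/datasetDoc.json',
    problem_doc_path='TRAIN/problem_TRAIN/problemDoc.json')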
Example #11
    def _new_pipeline(pipeline, hyperparams=None):
        hyperparams = to_dicts(hyperparams) if hyperparams else dict()

        new_pipeline = Pipeline(context=Context.TESTING)
        for input_ in pipeline.inputs:
            new_pipeline.add_input(name=input_['name'])

        for step_id, old_step in enumerate(pipeline.steps):
            new_step = PrimitiveStep(primitive=old_step.primitive)
            for name, argument in old_step.arguments.items():
                new_step.add_argument(
                    name=name,
                    argument_type=argument['type'],
                    data_reference=argument['data']
                )
            for output in old_step.outputs:
                new_step.add_output(output)

            new_hyperparams = hyperparams.get(str(step_id), dict())
            for name, hyperparam in old_step.hyperparams.items():
                if name not in new_hyperparams:
                    new_step.add_hyperparameter(
                        name=name,
                        argument_type=ArgumentType.VALUE,
                        data=hyperparam['data']
                    )

            for name, value in new_hyperparams.items():
                new_step.add_hyperparameter(
                    name=name,
                    argument_type=ArgumentType.VALUE,
                    data=value
                )

            new_pipeline.add_step(new_step)

        for output in pipeline.outputs:
            new_pipeline.add_output(
                name=output['name'],
                data_reference=output['data']
            )

        new_pipeline.cv_scores = list()
        new_pipeline.score = None

        return new_pipeline
Example #12
def generate_template(pipeline_file: str) -> dict:
    with open(pipeline_file) as f:
        pipeline = Pipeline.from_json(f)

    steps = []
    for i, step in enumerate(pipeline.steps):
        if not isinstance(step, PrimitiveStep):
            raise ValueError('Can only handle PrimitiveSteps')
        step_name = f'steps.{i}'
        hyperparameters = {}
        for name, value in step.hyperparams.items():
            if value['type'] == ArgumentType.VALUE:
                hyperparameters[name] = value['data']
            else:
                raise ValueError(
                    f'Do not know how to parse hyperparam: {str(value)}')
        arguments = []
        argument_keys = set(step.arguments.keys())
        for argument_name in ['inputs', 'outputs', 'reference']:
            if argument_name in argument_keys:
                argument_keys.remove(argument_name)
                argument = step.arguments[argument_name]
                if argument['type'] == ArgumentType.CONTAINER:
                    if argument['data'] == 'inputs.0':
                        arguments.append('template_input')
                    elif argument['data'].startswith('steps.') and argument['data'].endswith('.produce'):
                        arguments.append(argument['data'][:-8])
                    else:
                        raise ValueError(
                            f"Do not know how to parse argument: {argument['data']}")
                else:
                    raise ValueError(
                        f"Do not know how to parse argument type: {argument['type']}")
        if len(argument_keys) > 0:
            for argument_name in argument_keys:
                print(argument_name, step.arguments[argument_name])
            raise ValueError(f"Unused arguments: {argument_keys}")
        primitive = OrderedDict()
        primitive['primitive'] = str(step.primitive)
        primitive['hyperparameters'] = hyperparameters
        step = OrderedDict()
        step['name'] = step_name
        step['primitives'] = [primitive]
        step['inputs'] = arguments
        steps.append(step)
    template = OrderedDict()
    template['name'] = pipeline.id if pipeline.name is None else pipeline.name
    template['taskType'] = {'TaskType'}
    template['taskSubtype'] = {'TaskSubtype'}
    template['inputType'] = {'table'}
    template['output'] = step_name
    template['steps'] = steps
    return template
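For illustration, the resulting template is a plain dictionary (the input file name is hypothetical); because the taskType/taskSubtype/inputType entries are sets, pprint is used rather than json:

from pprint import pprint

template = generate_template('pipeline.json')
pprint(template)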
Example #13
def create_pipeline(metric: str) -> Pipeline:

    # create the basic pipeline
    graph_matching_pipeline = Pipeline(context=PipelineContext.TESTING)
    graph_matching_pipeline.add_input(name='inputs')

    # step 0 - extract the graphs
    step = PrimitiveStep(
        primitive_description=DistilGraphLoaderPrimitive.metadata.query())
    step.add_argument(name='inputs',
                      argument_type=ArgumentType.CONTAINER,
                      data_reference='inputs.0')
    step.add_output('produce')
    step.add_output('produce_target')
    graph_matching_pipeline.add_step(step)

    # step 1 - match the graphs that have been seeded
    step = PrimitiveStep(
        primitive_description=DistilSeededGraphMatchingPrimitive.metadata.
        query())
    step.add_argument(name='inputs',
                      argument_type=ArgumentType.CONTAINER,
                      data_reference='steps.0.produce')
    step.add_argument(name='outputs',
                      argument_type=ArgumentType.CONTAINER,
                      data_reference='steps.0.produce_target')
    step.add_hyperparameter('metric', ArgumentType.VALUE, metric)
    step.add_output('produce')
    graph_matching_pipeline.add_step(step)

    # convert predictions to expected format
    #step = PrimitiveStep(primitive_description=ConstructPredictionsPrimitive.metadata.query())
    #step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
    #step.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce_target')
    #step.add_output('produce')
    #step.add_hyperparameter('use_columns', ArgumentType.VALUE, [0, 1])
    #graph_matching_pipeline.add_step(step)

    # Adding output step to the pipeline
    graph_matching_pipeline.add_output(name='output',
                                       data_reference='steps.1.produce')

    return graph_matching_pipeline
Example #14
def load_template():
    with open(
            join(os.path.dirname(__file__),
                 '../resource/pipelines/example_metalearningdb.json')) as fin:
        json_pipeline = json.load(fin)

    d3m_pipeline = Pipeline.from_json_structure(json_pipeline, )
    grpc_pipeline = encode_pipeline_description(d3m_pipeline, ['RAW'], '/tmp')

    return grpc_pipeline
Example #15
    def build_pipeline(self, hyperparameters):
        """
            hyperparameters example:
                {
                    'STEP5/d3m.primitives.feature_construction.deep_feature_synthesis.SingleTableFeaturization/max_percent_null': 0,
                    'STEP7/d3m.primitives.data_preprocessing.robust_scaler.SKlearn/quantile_range': (2.798121390864261, 14.852664215409096),
                }
        """
        values = hyperparameters.values
        pipeline_id = hyperparameters.get_pipeline_id()
        pipeline = copy.deepcopy(self.pipeline_candidates[pipeline_id])
        pipeline.id = str(uuid.uuid4())
        # update time
        pipeline.created = Pipeline().created

        skip_hps = set()
        # for key in sorted(values.keys()):
        for hp in hyperparameters.space:
            if hyperparameters.is_active(hp) and hp.name not in skip_hps and hp.name != PIPELINE_CHOICE:
                key = hp.name
                step, primitive_name, hp_name = hyperparameters.get_name_parts(key)
                value = values[key]
                step_idx = self.__get_step_idx_by_name(step)
                if step_idx is None:
                    raise KeyError('{} not in the pipeline'.format(primitive_name))
                primitive_step = pipeline.steps[step_idx]
                arg_type = ArgumentType.VALUE
                # In order to avoid the following error
                # Value '0' for hyper-parameter \
                # 'STEP8/d3m.primitives.classification.xgboost_gbtree.DataFrameCommon/max_delta_step' \
                # is not an instance of the structural type: typing.Union[int, NoneType]
                # Here is workaround
                if isinstance(value, np.int64):
                    value = int(value)
                elif isinstance(value, np.str_):
                    value = str(value)
                elif isinstance(value, np.bool_):
                    value = bool(value)
                if hp_name in primitive_step.hyperparams:
                    del primitive_step.hyperparams[hp_name]
                # Handle Choice
                if isinstance(hp, hyperparams.Choice):
                    choice_cls = hp.choices[value]
                    _vals = {}
                    for name in choice_cls.configuration:
                        if name == 'choice':
                            _vals[name] = value
                        else:
                            _key = os.path.join(step, primitive_name, name)
                            _vals[name] = values[_key]
                            skip_hps.add(_key)
                    value = choice_cls(_vals)
                primitive_step.add_hyperparameter(name=hp_name, argument_type=arg_type,
                                                  data=value)
        return pipeline
Example #16
    def __init__(self,
                 metadata,
                 main_resource,
                 data_types,
                 loaded_primitives,
                 problem=None,
                 start_resource='inputs.0'):
        self.metadata = metadata
        self.main_resource = main_resource
        self.data_types = data_types
        self.loaded_primitives = loaded_primitives
        self.start_resource = start_resource
        self.problem = problem
        # Creating pipeline
        pipeline_description = Pipeline(context=Context.TESTING)
        pipeline_description.add_input(name='inputs')
        self.pipeline = pipeline_description
        self.d2d_step = None
        self.attr_step = None
        self.targ_step = None
        self._generate_pipeline()
Example #17
def evaluate(pipeline, data_pipeline, dataset, metrics, problem,
             scoring_config, dataset_uri, timeout_run):
    if is_collection(dataset_uri[7:]):
        dataset = get_dataset_sample(dataset, problem)

    json_pipeline = convert.to_d3m_json(pipeline)

    if TaskKeyword.GRAPH in problem['problem'][
            'task_keywords'] and json_pipeline['description'].startswith(
                'MtLDB'):
        return {0: {'ACCURACY': 1.0}, 1: {'ACCURACY': 1.0}}

    logger.info(
        "Pipeline to be scored:\n\t%s", '\n\t'.join(
            [x['primitive']['python_path'] for x in json_pipeline['steps']]))

    d3m_pipeline = Pipeline.from_json_structure(json_pipeline, )
    if 'method' in scoring_config:
        scoring_config.pop('method')

    manager = Manager()
    return_dict = manager.dict()
    p = Process(target=worker,
                args=(d3m_pipeline, data_pipeline, scoring_pipeline, problem,
                      dataset, scoring_config, metrics, return_dict))
    p.start()
    p.join(timeout_run)
    p.terminate()

    if 'run_results' not in return_dict or 'run_scores' not in return_dict:
        raise TimeoutError('Reached timeout (%d seconds) to score a pipeline' %
                           timeout_run)

    run_results = return_dict['run_results']
    run_scores = return_dict['run_scores']

    for result in run_results:
        if result.has_error():
            raise RuntimeError(result.pipeline_run.status['message'])

    #save_pipeline_runs(run_results.pipeline_runs)
    combined_folds = d3m.runtime.combine_folds([fold for fold in run_scores])
    scores = {}

    for _, row in combined_folds.iterrows():
        if row['fold'] not in scores:
            scores[row['fold']] = {}
        scores[row['fold']][row['metric']] = row['value']

    return scores
Example #18
    def DescribeSolution(self, request, context):

        solution_id = request.solution_id
        info_dict = self.get_from_stage_outputs("GetSearchSolutionsResults",
                                                solution_id)

        # Serialize the pipeline
        pipeline_json = info_dict["pipeline_json"]
        allowed_value_types = info_dict["allowed_value_types"]
        pipeline = Pipeline.from_json(pipeline_json)
        pipeline_description = ta3ta2utils.encode_pipeline_description(
            pipeline, allowed_value_types, "/tmp")

        return core_pb2.DescribeSolutionResponse(pipeline=pipeline_description)
Example #19
def keras2pipeline(keras_model, batch_size=32):
    # Creating pipeline
    from tensorflow.python.keras.activations import softmax
    pipeline_description = Pipeline()

    pipeline_description.add_input(name='inputs')

    set_data(pipeline_description)
    set_loss(pipeline_description)

    offset = len(pipeline_description.steps)

    previous_layer_ids = get_previous_layer_ids(keras_model)

    layers = keras_model.layers

    step_id = 0
    layer_to_step_id = {}

    total_layer_num = len(layers)
    for i, layer in enumerate(layers):
        cls_name = get_layer_class_name(layer)
        if cls_name in OMIT_LAYERS:
            continue
        layer_id = get_layer_id(layer)
        if len(previous_layer_ids[layer_id]) > 0:
            layer.previous_layer_ids = tuple(
                layer_to_step_id[i] + offset for i in previous_layer_ids[layer_id]
            )
        else:
            layer.previous_layer_ids = [None]
        # JPL does not support a standalone Softmax layer, so as a workaround use softmax as the activation of the final Dense layer
        if i == total_layer_num - 2 and cls_name == 'Dense':
            layer.activation = softmax
        d3m_step = step_function[cls_name](step_id, layer)
        pipeline_description.add_step(d3m_step)
        layer_to_step_id[layer_id] = step_id
        step_id += 1

    set_learner(pipeline_description, batch_size)
    set_prediction(pipeline_description)
    pipeline_description.add_output(
        name='output predictions', data_reference=f"steps.{len(pipeline_description.steps) - 1}.produce")

    return pipeline_description
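An illustrative conversion of a small Keras model, assuming its layer types are all covered by the step_function mapping used above (the model itself is hypothetical):

from tensorflow.keras import layers, models

model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(16,)),
    layers.Dense(3),
])
d3m_pipeline = keras2pipeline(model, batch_size=64)
print(len(d3m_pipeline.steps))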
Example #20
    def load_pipeline_architecture(self, pipeline_architecture_dict):
        """
        Loads a pipeline architecture dictionary and returns a d3m Pipeline object.
        """

        pipeline_description = Pipeline(context=Context.TESTING)
        pipeline_description.add_input(name='inputs')

        # For each corresponding stage in the dictionary create a step
        steps = []
        stage_name_to_reference_name = {}
        for stage_dict in pipeline_architecture_dict:
            
            # Extract stage attributes
            primitive = stage_dict["primitive"]
            if type(primitive) == str:
                primitive = get_primitive_with_name(primitive)
            cur_stage_name = stage_dict["stage_name"]
            input_stage = stage_dict["input"]
            
            # Create primitive step
            step = PrimitiveStep(primitive_description=primitive.metadata.query())
            data_reference = "inputs.0" if input_stage == PipelineWrapper.PIPELINE_INPUT else stage_name_to_reference_name[input_stage]            
            step.add_argument(name="inputs", argument_type=ArgumentType.CONTAINER, data_reference=data_reference)
            if "hyperparameters" in stage_dict:
                for k,v in stage_dict["hyperparameters"].items():
                    step.add_hyperparameter(name=k, argument_type=ArgumentType.VALUE, data=v)
            if "arguments" in stage_dict:
                for k,v in stage_dict["arguments"].items():
                    step.add_argument(name=k, argument_type=ArgumentType.CONTAINER, data_reference=stage_name_to_reference_name[v])
            step.add_output("produce")
            pipeline_description.add_step(step)
            reference_name = next(iter(step.get_output_data_references()))

            # Update accounting
            stage_name_to_reference_name[cur_stage_name] = reference_name
            steps.append(step)

        # Output is output of the last step
        last_output_reference = next(iter(steps[-1].get_output_data_references()))
        pipeline_description.add_output(name="output", data_reference=last_output_reference)

        return pipeline_description
Example #21
    def load_schema_only(
            cls, pipeline_id: str, folder_loc: str, pipeline_schema_subdir: str
    ) -> typing.Tuple[Pipeline, typing.Dict]:
        pipeline_dir = os.path.join(folder_loc, pipeline_schema_subdir)
        subpipeline_dir = os.path.join(folder_loc, cls.subpipelines_subdir)

        pipeline_schema = os.path.join(pipeline_dir, pipeline_id + '.json')

        with open(pipeline_schema, 'r') as f:
            structure = json.load(f)

        resolver = Resolver(
            pipeline_search_paths=[pipeline_dir, subpipeline_dir])
        pipeline = Pipeline.from_json_structure(pipeline_description=structure,
                                                resolver=resolver)
        return (pipeline, structure)
Example #22
    def score(self, input_item):
        problem_doc, metric, pipeline_json, dataset_train, dataset_test = input_item[
            1:]

        # Run pipeline
        pipeline = Pipeline.from_json(pipeline_json)
        pipeline_runtime = Runtime(pipeline, context=Context.TESTING)
        pipeline_runtime.fit(inputs=[dataset_train],
                             return_values=['outputs.0'])
        score_predictions = pipeline_runtime.produce(
            inputs=[dataset_test], return_values=['outputs.0'])
        score_predictions = score_predictions.values['outputs.0']

        # Evaluate scores on score dir
        achieved_score = utils.train_utils.score(score_predictions,
                                                 dataset_test,
                                                 problem_doc,
                                                 override_metric_key=metric)
        return achieved_score
Example #23
    def fitproduce(self, input_item):
        problem_doc, pipeline_json, dataset_train, dataset_test = input_item[
            1:]

        # Run pipeline
        pipeline = Pipeline.from_json(pipeline_json)
        pipeline_runtime = Runtime(pipeline, context=Context.TESTING)
        pipeline_runtime.fit(inputs=[dataset_train],
                             return_values=['outputs.0'])
        score_predictions = pipeline_runtime.produce(
            inputs=[dataset_test], return_values=['outputs.0'])
        score_predictions = score_predictions.values['outputs.0']

        # Write predictions to output path
        path = self.get_predictions_save_path()
        utils.utils.write_predictions_to_file(score_predictions, path,
                                              problem_doc)
        path_uri = "file://%s" % path
        return path_uri
Example #24
    def _return_pipelines(self, task_type, task_subtype, data_type):
        """
        A function that returns predefined pipelines given a task type.

        Returns
        -------
            The predefined pipelines, if any are left; if a template is available,
            the new pipeline built from the template is returned as well.

        """
        # TODO incorporate task_subtype and data_type for future problems
        with open(schemas_utils.PIPELINES_DB_DIR) as file:
            possible_pipelines_dict = json.load(file)

        if task_type not in possible_pipelines_dict:
            self.pipeline_left = False
            return None

        possible_pipelines_dict = possible_pipelines_dict[task_type]

        if not possible_pipelines_dict:
            return []

        possible_pipelines = []
        for pipeline_dict in possible_pipelines_dict:
            try:
                pipeline = pipeline_utils.load_pipeline(pipeline_dict)

                # update id
                pipeline.id = str(uuid.uuid4())

                # update time
                pipeline.created = Pipeline().created

                possible_pipelines.append(pipeline)
            except Exception:
                pass

        return possible_pipelines
Example #25
from d3m import index
from d3m.metadata.base import ArgumentType, Context
from d3m.metadata.pipeline import Pipeline, PrimitiveStep
import sys

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: Denormalize primitive
step_0 = PrimitiveStep(primitive=index.get_primitive(
    'd3m.primitives.data_transformation.denormalize.Common'))
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: DISTIL/NK Storc primitive
step_1 = PrimitiveStep(
    primitive=index.get_primitive('d3m.primitives.clustering.k_means.Sloth'))
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_hyperparameter(name='nclusters',
                          argument_type=ArgumentType.VALUE,
                          data=10)
step_1.add_hyperparameter(name='long_format',
                          argument_type=ArgumentType.VALUE,
                          data=True)
step_1.add_output('produce')
Example #26
def generate_only():
    # Creating pipeline
    pipeline_description = Pipeline()
    pipeline_description.add_input(name='inputs')

    # Step 0: dataset_to_dataframe
    step_0 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
    step_0.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='inputs.0')
    step_0.add_output('produce')
    pipeline_description.add_step(step_0)

    # Step 1: profiler
    step_1 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.schema_discovery.profiler.Common'))
    step_1.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.0.produce')
    step_1.add_output('produce')
    pipeline_description.add_step(step_1)

    # Step 2: column_parser
    step_2 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.column_parser.Common'))
    step_2.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.1.produce')
    step_2.add_output('produce')
    pipeline_description.add_step(step_2)

    # Step 3: DFS Single Table
    step_3 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.feature_construction.deep_feature_synthesis.SingleTableFeaturization'
    ))
    step_3.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_3.add_output('produce')
    pipeline_description.add_step(step_3)

    # Step 4: learn model
    step_4 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.regression.xgboost_gbtree.Common'))
    step_4.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.3.produce')
    step_4.add_argument(name='outputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_4.add_output('produce')
    pipeline_description.add_step(step_4)

    # Step 5: construct output
    step_5 = PrimitiveStep(primitive=index.get_primitive(
        'd3m.primitives.data_transformation.construct_predictions.Common'))
    step_5.add_argument(name='inputs',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.4.produce')
    step_5.add_argument(name='reference',
                        argument_type=ArgumentType.CONTAINER,
                        data_reference='steps.2.produce')
    step_5.add_output('produce')
    pipeline_description.add_step(step_5)

    # Final Output
    pipeline_description.add_output(name='output predictions',
                                    data_reference='steps.5.produce')

    # Generate .yml file for the pipeline
    import featuretools_ta1
    from pipeline_tests.utils import generate_pipeline
    dataset_name = 'LL1_retail_sales_total_MIN_METADATA'
    dataset_path = '/featuretools_ta1/datasets/seed_datasets_current'
    primitive_name = 'd3m.primitives.feature_construction.deep_feature_synthesis.SingleTableFeaturization'
    version = featuretools_ta1.__version__
    test_name = os.path.splitext(os.path.basename(__file__))[0][5:]
    yml, pipeline_run_file = generate_pipeline(
        primitive_name=primitive_name,
        pipeline_description=pipeline_description,
        dataset_name=dataset_name,
        test_name=test_name)

    # fit-score command
    fs_cmd = 'python3 -m d3m runtime -d /featuretools_ta1/datasets/ fit-score -p {}'.format(
        yml)
    fs_cmd += ' -r {}/{}/{}_problem/problemDoc.json'.format(
        dataset_path, dataset_name, dataset_name)
    fs_cmd += ' -i {}/{}/TRAIN/dataset_TRAIN/datasetDoc.json'.format(
        dataset_path, dataset_name)
    fs_cmd += ' -t {}/{}/TEST/dataset_TEST/datasetDoc.json'.format(
        dataset_path, dataset_name)
    fs_cmd += ' -a {}/{}/SCORE/dataset_SCORE/datasetDoc.json'.format(
        dataset_path, dataset_name)
    fs_cmd += ' -O {}'.format(pipeline_run_file)

    # Run pipeline to save pipeline_run file
    os.system(fs_cmd)

    # Create and return command for running from pipeline_run file:
    pipeline_run_cmd = 'python3 -m d3m --pipelines-path /featuretools_ta1/MIT_FeatureLabs/{}/{}/pipelines/'.format(
        primitive_name, version)
    pipeline_run_cmd += ' runtime -d /featuretools_ta1/datasets/ fit-score'
    pipeline_run_cmd += ' -u {}'.format(pipeline_run_file)

    return pipeline_run_cmd
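The function already runs the fit-score command itself; the string it returns is meant to replay the recorded pipeline run, for example (this assumes the module-level os import used above):

pipeline_run_cmd = generate_only()
os.system(pipeline_run_cmd)  # re-execute from the saved pipeline_run file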
Example #27
    def __init__(self):

        pipeline_description = Pipeline()
        pipeline_description.add_input(name="inputs")

        # DS to DF on input DS
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_transformation.dataset_to_dataframe.Common"))
        step.add_argument(name="inputs",
                          argument_type=ArgumentType.CONTAINER,
                          data_reference="inputs.0")
        step.add_output("produce")
        pipeline_description.add_step(step)

        # Simon
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_cleaning.column_type_profiler.Simon"))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.0.produce",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # column parser on input DF
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_transformation.column_parser.Common"))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.1.produce",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # XG Boost
        step = PrimitiveStep(primitive=index.get_primitive(
            'd3m.primitives.classification.xgboost_gbtree.Common'))
        step.add_argument(name='inputs',
                          argument_type=ArgumentType.CONTAINER,
                          data_reference='steps.2.produce')
        step.add_argument(name='outputs',
                          argument_type=ArgumentType.CONTAINER,
                          data_reference='steps.2.produce')
        step.add_output('produce')
        step.add_hyperparameter(name='add_index_columns',
                                argument_type=ArgumentType.VALUE,
                                data=True)
        pipeline_description.add_step(step)

        # construct predictions
        step = PrimitiveStep(primitive=index.get_primitive(
            "d3m.primitives.data_transformation.construct_predictions.Common"))
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.3.produce",
        )
        step.add_argument(
            name="reference",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.0.produce",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # Final Output
        pipeline_description.add_output(name="output predictions",
                                        data_reference="steps.4.produce")

        self.pipeline = pipeline_description
Example #28
def build_pipeline(pipeline_info, pipeline_mapping, stdout=None):

    default_stdout = sys.stdout
    if stdout is not None:
        sys.stdout = stdout

    # Creating pipeline
    pipeline_description = Pipeline()
    pipeline_description.add_input(name='inputs')

    for primitive_info in pipeline_info:
        print(primitive_info.python_path)
        print(primitive_info.hyperparameter)
        print(primitive_info.ancestors)

        if primitive_info.python_path == 'HEAD':
            dataset_fullname = primitive_info.hyperparameter['dataset_folder']
            print(dataset_fullname)
            continue

        elif primitive_info.python_path == 'ENDING':

            ancestors = primitive_info.ancestors
            end_step_num = pipeline_mapping[ancestors['inputs']] - 1
            pipeline_description.add_output(name='output predictions', data_reference='steps.' + str(end_step_num) + '.produce')

        else:
            # print(primitive_info.python_path)
            primitive = index.get_primitive(primitive_info.python_path)
            step = PrimitiveStep(primitive=primitive)

            hyperparameters = primitive_info.hyperparameter
            ancestors = primitive_info.ancestors

            # add add_inputs
            # print(ancestors)

            if ancestors['inputs'] != 0:
                for ances_key in ancestors.keys():
                    print(ances_key, ancestors[ances_key], pipeline_mapping[ancestors[ances_key]] - 1)

                    step_num = pipeline_mapping[ancestors[ances_key]] - 1
                    step.add_argument(name=ances_key, argument_type=ArgumentType.CONTAINER, data_reference='steps.' + str(step_num) + '.produce')

            else:
                step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')

            # add add_hyperparameter
            for hyper in hyperparameters.keys():
                # print(hyper, hyperparameters[hyper], type(hyperparameters[hyper]))

                hyper_value = hyperparameters[hyper]

                step.add_hyperparameter(name=hyper, argument_type=ArgumentType.VALUE, data=hyper_value)

            step.add_output('produce')
            pipeline_description.add_step(step)

            # print('\n')

    # Output to json
    data = pipeline_description.to_json()
    with open('example_pipeline.json', 'w') as f:
        f.write(data)
        print(data)

    # yaml = pipeline_description.to_yaml()
    # with open('example_pipeline.yml', 'w') as f:
    #     f.write(yaml)
    # print(yaml)

    sys.stdout.flush()
    sys.stdout = default_stdout
Example #29
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep
from d3m.metadata import hyperparams
import numpy as np

# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest
#                                             extract_columns_by_semantic_types(targets)    ->            ^

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: dataset_to_dataframe
primitive_0 = index.get_primitive(
    'd3m.primitives.data_transformation.dataset_to_dataframe.Common')
step_0 = PrimitiveStep(primitive=primitive_0)
step_0.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: column_parser
primitive_1 = index.get_primitive(
    'd3m.primitives.data_transformation.column_parser.Common')
step_1 = PrimitiveStep(primitive=primitive_1)
step_1.add_argument(name='inputs',
                    argument_type=ArgumentType.CONTAINER,
                    data_reference='steps.0.produce')
step_1.add_output('produce')
Example #30
    def __init__(
        self,
        epochs: int = 5000,
        attention_lstm: bool = True,
    ):

        pipeline_description = Pipeline()
        pipeline_description.add_input(name="inputs")

        # Ts formatter
        step = PrimitiveStep(
            primitive=index.get_primitive(
                "d3m.primitives.data_transformation.time_series_formatter.DistilTimeSeriesFormatter"
            )
        )
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="inputs.0",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # DS to DF on formatted ts DS
        step = PrimitiveStep(
            primitive=index.get_primitive(
                "d3m.primitives.data_transformation.dataset_to_dataframe.Common"
            )
        )
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.0.produce",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # DS to DF on input DS
        step = PrimitiveStep(
            primitive=index.get_primitive(
                "d3m.primitives.data_transformation.dataset_to_dataframe.Common"
            )
        )
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="inputs.0",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # column parser on input DF
        step = PrimitiveStep(
            primitive=index.get_primitive(
                "d3m.primitives.data_transformation.column_parser.Common"
            )
        )
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.2.produce",
        )
        step.add_output("produce")
        step.add_hyperparameter(
            name="parse_semantic_types",
            argument_type=ArgumentType.VALUE,
            data=[
                "http://schema.org/Boolean",
                "http://schema.org/Integer",
                "http://schema.org/Float",
                "https://metadata.datadrivendiscovery.org/types/FloatVector",
            ],
        )
        pipeline_description.add_step(step)

        # parse target semantic types
        step = PrimitiveStep(
            primitive=index.get_primitive(
                "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common"
            )
        )
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.3.produce",
        )
        step.add_hyperparameter(
            name="semantic_types",
            argument_type=ArgumentType.VALUE,
            data=[
                "https://metadata.datadrivendiscovery.org/types/Target",
            ],
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # LSTM FCN
        step = PrimitiveStep(
            primitive=index.get_primitive(
                "d3m.primitives.time_series_classification.convolutional_neural_net.LSTM_FCN"
            )
        )
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.1.produce",
        )
        step.add_argument(
            name="outputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.4.produce",
        )
        step.add_hyperparameter(
            name="epochs", argument_type=ArgumentType.VALUE, data=epochs
        )
        step.add_hyperparameter(
            name="attention_lstm", argument_type=ArgumentType.VALUE, data=attention_lstm
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # construct predictions
        step = PrimitiveStep(
            primitive=index.get_primitive(
                "d3m.primitives.data_transformation.construct_predictions.Common"
            )
        )
        step.add_argument(
            name="inputs",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.5.produce",
        )
        step.add_argument(
            name="reference",
            argument_type=ArgumentType.CONTAINER,
            data_reference="steps.2.produce",
        )
        step.add_output("produce")
        pipeline_description.add_step(step)

        # Final Output
        pipeline_description.add_output(
            name="output predictions", data_reference="steps.6.produce"
        )

        self.pipeline = pipeline_description