Example #1
    def run(self, input_dataset, eval_datasets=None, return_pipeline=False):
        """
        Converts the internal pipeline architecture dict into a pipeline and runs it.

        Args:
        - input_dataset: Input dataset to train on.
        - eval_datasets: Datasets to produce predictions for.
        - return_pipeline: Whether to also return the pipeline that was fitted and
          produced the predictions.

        Returns:
        - If return_pipeline is False, returns just the predictions; otherwise returns
          a tuple (preds, pipeline).
        """

        pipeline = self.load_pipeline_architecture(self.pipeline_architecture_dict)
        pipeline.check()

        runtime = Runtime(pipeline, context=Context.TESTING)
        runtime.fit(inputs=[input_dataset], return_values=['outputs.0'])

        # Produce predictions for each evaluation dataset using the fitted runtime.
        all_preds = []
        for dataset in (eval_datasets or []):
            all_preds.append(runtime.produce(inputs=[dataset], return_values=['outputs.0']))
            
        results = all_preds
        if return_pipeline:
            results = (all_preds, pipeline)

        return results
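
A minimal usage sketch (not part of the original source): it assumes a hypothetical PipelineExecutor class exposing the run() method above, and uses d3m's Dataset.load to load datasets from their datasetDoc.json URIs.

from d3m.container.dataset import Dataset

# Hypothetical wrapper object holding pipeline_architecture_dict.
executor = PipelineExecutor(pipeline_architecture_dict)

# Illustrative dataset URIs; adjust to the local D3M dataset layout.
train = Dataset.load('file:///data/example/TRAIN/dataset_TRAIN/datasetDoc.json')
test = Dataset.load('file:///data/example/TEST/dataset_TEST/datasetDoc.json')

# Fit on the training dataset and collect predictions for each evaluation dataset.
preds, fitted_pipeline = executor.run(train, eval_datasets=[test], return_pipeline=True)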
Example #2
def score_pipeline(dataset_root, problem, pipeline_path):
    train_dataset = load_dataset(dataset_root, 'TRAIN')
    test_dataset = load_dataset(dataset_root, 'SCORE', 'TEST')
    pipeline = load_pipeline(pipeline_path)

    # Creating an instance of the runtime with the pipeline description and problem description.
    runtime = Runtime(
        pipeline=pipeline,
        problem_description=problem,
        context=Context.TESTING
    )

    print("Fitting the pipeline")
    fit_results = runtime.fit(inputs=[train_dataset])
    fit_results.check_success()

    # Producing results using the fitted pipeline.
    print("Producing predictions")
    produce_results = runtime.produce(inputs=[test_dataset])
    produce_results.check_success()

    predictions = produce_results.values['outputs.0']
    metrics = problem['problem']['performance_metrics']

    print("Computing the score")
    scoring_pipeline = load_pipeline('ta2/pipelines/scoring_pipeline.yml')
    scores, scoring_pipeline_run = score(
        scoring_pipeline, problem, predictions, [test_dataset], metrics,
        context=Context.TESTING, random_seed=0,
    )
    return scores.iloc[0].value
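
A hedged example of calling score_pipeline, assuming the standard D3M dataset layout on disk and a problem-loading helper like the load_problem used in Example #3 (paths and names are illustrative):

# Illustrative paths; load_problem is assumed to return a d3m problem description.
problem = load_problem('/data/185_baseball', 'TRAIN')
score_value = score_pipeline(
    dataset_root='/data/185_baseball',
    problem=problem,
    pipeline_path='pipelines/example_pipeline.json',
)
print(score_value)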
Example #3
def run_pipeline(pipeline, dataset_name, datasets_path):
    ensure_downloaded(dataset_name, datasets_path)

    root_path = os.path.join(os.path.abspath(datasets_path), dataset_name)
    train_dataset = load_dataset(root_path, 'TRAIN')
    train_problem = load_problem(root_path, 'TRAIN')

    # Creating an instance of the runtime with the pipeline description and problem description.
    runtime = Runtime(pipeline=pipeline,
                      problem_description=train_problem,
                      context=Context.TESTING)

    # Fitting pipeline on input dataset.
    fit_results = runtime.fit(inputs=[train_dataset])
    fit_results.check_success()

    # Producing results using the fitted pipeline.
    test_dataset = load_dataset(root_path, 'TEST')
    produce_results = runtime.produce(inputs=[test_dataset])
    produce_results.check_success()

    print('Pipeline ran successfully')
    output = list(produce_results.values.values())[0]
    print(output.shape)
    print(output.head())
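
A sketch of how run_pipeline could be invoked, assuming a pipeline description saved as JSON and a local datasets directory (the dataset name and paths are illustrative):

from d3m.metadata.pipeline import Pipeline

# Load an illustrative pipeline description from JSON.
with open('pipelines/example_pipeline.json', 'r') as f:
    pipeline = Pipeline.from_json(f.read())

run_pipeline(pipeline, dataset_name='185_baseball', datasets_path='/data/datasets')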
Example #4
    def __run_pipeline(self,
                       pipeline_description,
                       data,
                       volume_dir='/volumes'):
        runtime = Runtime(pipeline=pipeline_description,
                          context=metadata_base.Context.TESTING,
                          volumes_dir=volume_dir)
        fit_result = runtime.fit([data])
        return fit_result
    def fitproduce(self, input_item):
        problem_doc, pipeline_json, dataset_train, dataset_test = input_item[1:]

        # Run pipeline
        pipeline = Pipeline.from_json(pipeline_json)
        pipeline_runtime = Runtime(pipeline, context=Context.TESTING)
        pipeline_runtime.fit(inputs=[dataset_train],
                             return_values=['outputs.0'])
        score_predictions = pipeline_runtime.produce(
            inputs=[dataset_test], return_values=['outputs.0'])
        score_predictions = score_predictions.values['outputs.0']

        # Write predictions to output path
        path = self.get_predictions_save_path()
        utils.utils.write_predictions_to_file(score_predictions, path,
                                              problem_doc)
        path_uri = "file://%s" % path
        return path_uri
    def score(self, input_item):
        problem_doc, metric, pipeline_json, dataset_train, dataset_test = input_item[1:]

        # Run pipeline
        pipeline = Pipeline.from_json(pipeline_json)
        pipeline_runtime = Runtime(pipeline, context=Context.TESTING)
        pipeline_runtime.fit(inputs=[dataset_train],
                             return_values=['outputs.0'])
        score_predictions = pipeline_runtime.produce(
            inputs=[dataset_test], return_values=['outputs.0'])
        score_predictions = score_predictions.values['outputs.0']

        # Evaluate scores on score dir
        achieved_score = utils.train_utils.score(score_predictions,
                                                 dataset_test,
                                                 problem_doc,
                                                 override_metric_key=metric)
        return achieved_score
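
For orientation, a rough sketch of the input_item tuples these methods unpack (only the structure is inferred from the slicing above; the first element, the worker object, and all names are illustrative):

# fitproduce expects: (key, problem_doc, pipeline_json, dataset_train, dataset_test)
worker.fitproduce(('task-1', problem_doc, pipeline_json, dataset_train, dataset_test))

# score additionally expects a metric after problem_doc:
# (key, problem_doc, metric, pipeline_json, dataset_train, dataset_test)
achieved_score = worker.score(('task-2', problem_doc, 'f1Macro', pipeline_json,
                               dataset_train, dataset_test))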
Example #7
def score_pipeline(dataset,
                   problem,
                   pipeline_path,
                   static=None,
                   output_path=None):
    pipeline = load_pipeline(pipeline_path)

    # Creating an instance of the runtime with the pipeline description and problem description.
    runtime = Runtime(
        pipeline=pipeline,
        problem_description=problem,
        context=Context.EVALUATION,
        volumes_dir=static,
    )

    LOGGER.info("Fitting pipeline %s", pipeline_path)
    fit_results = runtime.fit(inputs=[dataset])
    fit_results.check_success()

    dataset_doc_path = dataset.metadata.query(())['location_uris'][0]
    dataset_root = dataset_doc_path[:-len(
        '/TRAIN/dataset_TRAIN/datasetDoc.json')]
    test_dataset = load_dataset(dataset_root, 'SCORE', 'TEST')

    # Producing results using the fitted pipeline.
    LOGGER.info("Producing predictions for pipeline %s", pipeline_path)
    produce_results = runtime.produce(inputs=[test_dataset])
    produce_results.check_success()

    predictions = produce_results.values['outputs.0']
    metrics = problem['problem']['performance_metrics']

    LOGGER.info("Computing the score for pipeline %s", pipeline_path)
    scoring_pipeline = load_pipeline(DEFAULT_SCORING_PIPELINE_PATH)
    scores, scoring_pipeline_run = score(
        scoring_pipeline=scoring_pipeline,
        problem_description=problem,
        predictions=predictions,
        score_inputs=[test_dataset],
        metrics=metrics,
        context=Context.EVALUATION,
        random_seed=0,
    )

    evaluated_pipeline_run = produce_results.pipeline_run
    evaluated_pipeline_run.is_standard_pipeline = True
    evaluated_pipeline_run.set_scores(scores, metrics)
    evaluated_pipeline_run.set_scoring_pipeline_run(
        scoring_pipeline_run.pipeline_run, [dataset])

    _to_yaml_run(evaluated_pipeline_run, output_path)

    return scores.iloc[0].value
Example #8
import sys
import traceback

from d3m.metadata.base import Context
from d3m.metadata.pipeline import Pipeline
from d3m.runtime import Runtime

import utils.utils
import utils.train_utils

if __name__ == "__main__":

    # Get args
    try:
        path_to_pipeline_json = sys.argv[1]
        inputdir = sys.argv[2]

        # Load datasets
        problem_doc, dataset = utils.utils.load_data_from_dir(inputdir)

        # Create pipeline
        with open(path_to_pipeline_json, "r") as f:
            pipeline = Pipeline.from_json(f.read())

        pipeline_runtime = Runtime(pipeline, context=Context.TESTING)
        pipeline_runtime.fit(inputs=[dataset], return_values=['outputs.0'])

        problem_doc_score, dataset_score = utils.utils.load_data_from_dir(inputdir, mode="score")

        score_predictions = pipeline_runtime.produce(inputs=[dataset_score], return_values=['outputs.0'])
        score_predictions = score_predictions.values['outputs.0']

        # Evaluate scores on score dir
        achieved_score = utils.train_utils.score(score_predictions, dataset_score, problem_doc_score)
        print(achieved_score)
    except Exception:
        print("N/A")

    def write_pipeline_run(self, problem_description, dataset, filename_yaml):
        runtime = Runtime(pipeline=self.pipeline_description,
                          problem_description=problem_description,
                          context=Context.TESTING,
                          is_standard_pipeline=True)
        output = runtime.fit(inputs=dataset)
        pipeline_run = output.pipeline_run
        with open(filename_yaml, "w") as out:
            pipeline_run.to_yaml(file=out)
    outputdir = sys.argv[3]

    ################
    # Load dataset #
    ################
    problem_doc_train, dataset_train = (
        utils.utils.load_data_from_dir(inputdir, mode="train"))

    ###################
    # Create pipeline #
    ###################
    with open(path_to_pipeline_json, "r") as f:
        pipeline = utils.primitive_pipeline_utils.load_pipeline(f.read())

    pipeline_runtime = Runtime(pipeline)
    pipeline_runtime.fit(inputs=[dataset_train])

    ############################
    # Try scoring on SCORE set #
    ############################
    try:
        problem_doc_score, dataset_score = utils.utils.load_data_from_dir(inputdir, mode="score")
        score_predictions = pipeline_runtime.produce(inputs=[dataset_score]).values['outputs.0']
        validation_score = utils.train_utils.get_score_on_score_set(inputdir, problem_doc_score, score_predictions)
        print("PipelineId: %s, Score: %s" % (pipeline.id, validation_score))
    except Exception:
        print("-------------------------------")
        print("Failed to evaluate on SCORE set")
        print("-------------------------------")
        traceback.print_exc()
Example #11
# Loading problem description.
problem_description = problem.parse_problem_description(problem_path)

# Loading dataset.
path = 'file://{uri}'.format(uri=os.path.abspath(dataset_train_path))
dataset = D3MDatasetLoader().load(dataset_uri=path)

path2 = 'file://{uri}'.format(uri=os.path.abspath(dataset_predict_path))
dataset_predict = D3MDatasetLoader().load(dataset_uri=path2)

# Loading pipeline description file.
with open(pipeline_path, 'r') as file:
    pipeline_description = pipeline_module.Pipeline.from_json(
        string_or_file=file)

# Creating an instance of the runtime with the pipeline description and problem description.
runtime = Runtime(pipeline=pipeline_description,
                  problem_description=problem_description,
                  context=metadata_base.Context.TESTING)

# Fitting pipeline on input dataset.
fit_results = runtime.fit(inputs=[dataset])
fit_results.check_success()

# Producing results using the fitted pipeline.
produce_results = runtime.produce(inputs=[dataset_predict])
produce_results.check_success()

print(produce_results.values)
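
For reference, Example #11 presumably relies on imports along these lines (assumed; they are not shown in the snippet):

import os

from d3m.container.dataset import D3MDatasetLoader
from d3m.metadata import base as metadata_base
from d3m.metadata import pipeline as pipeline_module
from d3m.metadata import problem
from d3m.runtime import Runtime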