Example #1
import os

from mlblocks import MLPipeline


def _load_pipeline(pipeline, hyperparams=None):
    # Accept either a path to a serialized pipeline or anything the
    # MLPipeline constructor understands (e.g. a pipeline name or dict).
    if isinstance(pipeline, str) and os.path.isfile(pipeline):
        pipeline = MLPipeline.load(pipeline)
    else:
        pipeline = MLPipeline(pipeline)

    # Optionally override the pipeline's default hyperparameters.
    if hyperparams is not None:
        pipeline.set_hyperparameters(hyperparams)

    return pipeline
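
A minimal usage sketch of the helper above; the file path, pipeline name, block key, and hyperparameter value are all hypothetical, not taken from the source:

# All names below are hypothetical illustrations.
pipeline = _load_pipeline('pipelines/my_pipeline.json')   # load from a file
pipeline = _load_pipeline('my.pipeline.name')             # build by name
pipeline = _load_pipeline(
    'my.pipeline.name',
    hyperparams={'my.primitive#1': {'n_estimators': 100}})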
Example #2
from mlblocks import MLPipeline


def _load_pipeline(pipeline):
    # Already an MLPipeline instance: return it unchanged.
    if isinstance(pipeline, MLPipeline):
        return pipeline

    # A string is treated as a path to a serialized pipeline.
    if isinstance(pipeline, str):
        return MLPipeline.load(pipeline)

    # A dict is treated as a pipeline specification.
    if isinstance(pipeline, dict):
        return MLPipeline.from_dict(pipeline)

    raise ValueError('Invalid pipeline %s' % pipeline)
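
This variant dispatches on input type; a quick sketch of the three accepted forms (the path and the primitive name are assumptions):

# Hypothetical inputs for each accepted type.
pipeline = _load_pipeline('pipelines/my_pipeline.json')            # str: load from disk
pipeline = _load_pipeline({'primitives': ['my.primitive.name']})   # dict: build from spec
pipeline = _load_pipeline(pipeline)                                # MLPipeline: pass-through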
Example #3
import json
import os
import pickle
import shutil

import numpy as np
import pandas as pd

from mlblocks import MLPipeline

# ROOT_DIR, Task, and _evaluate_pipeline are assumed to be defined
# elsewhere in this module.


def evaluate_task(task,
                  metrics=None,
                  feature_matrix=None,
                  output_path=None,
                  save_intermedia_data=True,
                  save_model=True,
                  save_hyperparameters=True):
    """Run benchmark testing on a task. Save intermedia data, trained models, and optimized
    hyperparameters. Return testing results.

    Args:
        task (Task):
            a task instance storing meta information of the task.
        metrics (list):
            a list of strings identifying the metric functions.
        feature_matrix (pd.DataFrame):
            a dataframe consisting of both feature values and target values.
        output_path (str):
            a directory path to store the intermediate data, model, and hyperparameters.
        save_intermedia_data (boolean):
            whether to store the intermediate data, including an entity set and a feature
            matrix, if the beginning stage is "data_loader" or "problem_definition".
        save_model (boolean):
            whether to store the trained model.
        save_hyperparameters (boolean):
            whether to store the hyperparameters if task.tuned is True.

    Returns:
        list:
            benchmarking results of each run.
    """
    # Load pipeline.
    pipeline = MLPipeline.load(os.path.join(ROOT_DIR, task.path_to_pipeline))

    # Set hyperparameters.
    if task.path_to_hyperparameters is not None:
        path = os.path.join(ROOT_DIR, task.path_to_hyperparameters)
        _, extension = os.path.splitext(path)
        if extension == '.json':
            with open(path) as f:
                init_hyperparameters = json.load(f)
        elif extension == '.pkl':
            # Pickle files must be opened in binary mode.
            with open(path, 'rb') as f:
                init_hyperparameters = pickle.load(f)
        else:
            raise TypeError("Unsupported file type {}.".format(extension))
        pipeline.set_hyperparameters(init_hyperparameters)

    # Load the dataset.
    if feature_matrix is None:
        if task.beginning_stage == "data_loader":
            raise NotImplementedError

        elif task.beginning_stage == "problem_definition":
            raise NotImplementedError

        elif task.beginning_stage == "featurization":
            feature_matrix = pd.read_csv(os.path.join(ROOT_DIR,
                                                      task.path_to_dataset),
                                         index_col=0)

        else:
            raise ValueError("Beginning stage should be either \"data_loader\", "
                             "\"problem_definition\" or \"featurization\".")

    # Run the pipeline task.run_num times and record each run.
    results = []
    records = []
    for i in range(task.run_num):
        scores, model, hyperparameters = _evaluate_pipeline(
            i, pipeline, feature_matrix, task.pipeline_name, task.problem_name,
            task.dataset_name, task.beginning_stage, task.tuned, metrics)
        results.append(scores)
        records.append((model, hyperparameters))

    # Store the output results.
    if output_path is not None:
        # Initialize the output directory.
        if os.path.exists(output_path):
            shutil.rmtree(output_path)
        os.mkdir(output_path)

        # Save task meta information.
        task.save_as(os.path.join(output_path, "meta.json"))

        # Select the best run according to the F1 Macro score.
        metric = 'F1 Macro'
        best_index = np.argmax([scores[metric] for scores in results])
        model, hyperparameters = records[best_index]

        # Save the pipeline model if required.
        if save_model:
            with open(os.path.join(output_path, "model.pkl"), 'wb') as f:
                pickle.dump(model, f)

        # Save pipeline hyperparameters if required.
        if save_hyperparameters and hyperparameters is not None:
            with open(os.path.join(output_path, "hyperparameters.pkl"),
                      'wb') as f:
                pickle.dump(hyperparameters, f)

    return results
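
A hedged usage sketch, assuming a Task instance has already been prepared elsewhere; the output path is hypothetical, and the 'F1 Macro' key follows the scoring code above:

# 'task' is assumed to be an existing Task instance.
results = evaluate_task(task, output_path='output/my_task')
best = max(results, key=lambda scores: scores['F1 Macro'])
print(best)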