Code Example #1
File: splitter.py Project: byu-dml/dsbox-primitives
    def _split_row(self, input_dataset):
        """
            Inner function to sample part of the row of the input dataset
            adapted from d3m's common-primitives

        Returns
        -------
        Dataset
            The sampled Dataset
        """
        row_length = input_dataset[self._main_resource_id].shape[0]
        all_indexes_list = range(1, row_length)
        sample_indices = random.sample(all_indexes_list,
                                       self._threshold_row_length)
        # We store rows as sets, but later on we sort them when we cut DataFrames.
        row_indices_to_keep_sets: typing.Dict[
            str, typing.Set[int]] = collections.defaultdict(set)
        row_indices_to_keep_sets[self._main_resource_id] = set(sample_indices)

        # We sort indices to get deterministic outputs from sets (which do not have deterministic order).
        self._row_remained = {
            resource_id: sorted(indices)
            for resource_id, indices in row_indices_to_keep_sets.items()
        }
        output_dataset = Dataset.select_rows(input_dataset, self._row_remained)

        return output_dataset
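
The row-sampling step above can be illustrated on a plain pandas DataFrame; this is a minimal, self-contained sketch (the DataFrame and threshold are hypothetical), not part of the splitter primitive itself.

# Minimal sketch of the sampling idea used by _split_row, on a plain pandas
# DataFrame. The data and threshold below are hypothetical; the primitive
# operates on a d3m Dataset resource and its metadata instead.
import random

import pandas as pd

df = pd.DataFrame({'value': range(100)})
threshold_row_length = 10

# Draw distinct row positions, then sort them so the output keeps the
# original row order (random.sample and sets have no deterministic order).
sample_indices = sorted(random.sample(range(df.shape[0]), threshold_row_length))
sampled_df = df.iloc[sample_indices]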
Code Example #2
    def _get_sample_uri(self, dataset_uri, problem):
        logger.info('About to sample dataset %s', dataset_uri)
        task_keywords = problem['problem']['task_keywords']

        if any(tk in [TaskKeyword.OBJECT_DETECTION, TaskKeyword.FORECASTING] for tk in task_keywords):
            logger.info('Not doing sampling for task %s', '_'.join([x.name for x in task_keywords]))
            return None

        dataset = Dataset.load(dataset_uri)

        if is_collection(dataset_uri[7:]):
            logger.info('Not doing sampling for collections')
            return None

        dataset_sample_folder = 'file://%s/temp/dataset_sample/' % os.environ.get('D3MOUTPUTDIR')
        dataset_sample_uri = None

        if os.path.exists(dataset_sample_folder[6:]):
            shutil.rmtree(dataset_sample_folder[6:])

        dataset_sample = get_dataset_sample(dataset, problem, dataset_sample_folder)

        if isinstance(dataset_sample, str):  # Was the dataset sampled?
            dataset_sample_uri = dataset_sample

        return dataset_sample_uri
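
The function above turns 'file://' URIs into filesystem paths by slicing the string (dataset_uri[7:], dataset_sample_folder[6:]); a hedged alternative, shown only for illustration and not what _get_sample_uri does, is to let the standard library perform the conversion:

# Hedged alternative to slicing 'file://' off a URI; illustration only, using
# just the standard library.
from urllib.parse import urlparse
from urllib.request import url2pathname

def file_uri_to_path(uri):
    # e.g. 'file:///output/temp/dataset_sample/' -> '/output/temp/dataset_sample/'
    return url2pathname(urlparse(uri).path)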
Code Example #3
    def _find_timeseries_metadata(
        cls, dataset: container.Dataset
    ) -> typing.Optional[metadata_base.DataMetadata]:
        # loop over the dataset to find the resource that contains the timeseries file col info
        for resource_id, resource in dataset.items():
            metadata = dataset.metadata.query((resource_id, "ALL_ELEMENTS", 0))
            if "file_columns" in metadata:
                return metadata
        return None
Code Example #4
def execute(pipeline_id, dataset, problem, results_path, msg_queue, db):
    # Get pipeline from database

    pipeline = (
        db.query(database.Pipeline)
        .filter(database.Pipeline.id == pipeline_id)
        .options(joinedload(database.Pipeline.modules),
                 joinedload(database.Pipeline.connections))
    ).one()

    logger.info('About to execute pipeline, id=%s, dataset=%r', pipeline_id,
                dataset)

    # Load data
    dataset = Dataset.load(dataset)
    logger.info('Loaded dataset')

    json_pipeline = convert.to_d3m_json(pipeline)
    logger.info(
        'Pipeline to be executed:\n%s', '\n'.join(
            [x['primitive']['python_path'] for x in json_pipeline['steps']]))

    d3m_pipeline = d3m.metadata.pipeline.Pipeline.from_json_structure(
        json_pipeline)

    runtime = d3m.runtime.Runtime(pipeline=d3m_pipeline,
                                  problem_description=problem,
                                  context=metadata_base.Context.TESTING)

    manager = Manager()
    return_dict = manager.dict()
    p = Process(target=worker, args=(runtime, dataset, return_dict))
    p.start()
    p.join(180)  # Maximum 3 minutes
    fit_results = return_dict['fit_results']
    fit_results.check_success()

    if results_path is not None:
        logger.info('Storing fit results at %s', results_path)
        fit_results.values['outputs.0'].to_csv(results_path)
    else:
        logger.info('NOT storing fit results')

    return fit_results.values
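
execute() runs the fit in a separate process so the whole pipeline run can be bounded by a wall-clock timeout; a minimal, self-contained sketch of that pattern with a toy worker (the d3m runtime is not involved here):

# Self-contained sketch of the run-with-timeout pattern used in execute(),
# with a toy worker instead of the d3m runtime. Names are illustrative only.
from multiprocessing import Manager, Process


def toy_worker(x, return_dict):
    return_dict['result'] = x * x


if __name__ == '__main__':
    manager = Manager()
    return_dict = manager.dict()
    p = Process(target=toy_worker, args=(3, return_dict))
    p.start()
    p.join(180)  # wait at most 3 minutes, as above
    if p.is_alive():
        # The worker exceeded the timeout; terminate it instead of reading results.
        p.terminate()
        p.join()
    result = return_dict.get('result')  # None if the worker never finished in time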
Code Example #5
def test(pipeline_id, dataset, storage_dir, steps_to_expose, msg_queue, db):
    dataset = Dataset.load(dataset)
    logger.info('Loaded dataset')

    with open(os.path.join(storage_dir, 'fitted_solution_%s.pkl' % pipeline_id),
              'rb') as fin:
        runtime = pickle.load(fin)

    results = runtime.produce(inputs=[dataset],
                              outputs_to_expose=steps_to_expose)
    results.check_success()

    logger.info('Storing produce results at %s', storage_dir)
    for step_id in results.values:
        if step_id in steps_to_expose and isinstance(results.values[step_id], DataFrame):
            results.values[step_id].to_csv(join(storage_dir, 'produce_%s_%s.csv' % (pipeline_id, step_id)))
Code Example #6
def train(pipeline_id, dataset, problem, storage_dir, steps_to_expose, msg_queue, db):
    # Get pipeline from database
    pipeline = (
        db.query(database.Pipeline)
            .filter(database.Pipeline.id == pipeline_id)
            .options(joinedload(database.Pipeline.modules),
                     joinedload(database.Pipeline.connections))
    ).one()

    logger.info('About to train pipeline, id=%s, dataset=%r',
                pipeline_id, dataset)

    # Load data
    dataset = Dataset.load(dataset)
    logger.info('Loaded dataset')

    # Training step - fit pipeline on training data
    logger.info('Running training')

    d3m_pipeline = d3m.metadata.pipeline.Pipeline.from_json_structure(
        convert.to_d3m_json(pipeline),
    )

    expose_outputs = len(steps_to_expose) > 0

    fitted_pipeline, predictions, results = d3m.runtime.fit(d3m_pipeline, [dataset], problem_description=problem,
                                                            context=metadata_base.Context.TESTING,
                                                            volumes_dir=os.environ.get('D3MSTATICDIR', None),
                                                            random_seed=0,
                                                            expose_produced_outputs=expose_outputs)

    results.check_success()

    logger.info('Storing fit results at %s', storage_dir)
    for step_id in results.values:
        if step_id in steps_to_expose and isinstance(results.values[step_id], DataFrame):
            results.values[step_id].to_csv(join(storage_dir, 'fit_%s_%s.csv' % (pipeline_id, step_id)))

    with open(join(storage_dir, 'fitted_solution_%s.pkl' % pipeline_id), 'wb') as fout:
        pickle.dump(fitted_pipeline, fout)
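
train() persists the fitted pipeline with pickle, and test() in Code Example #5 loads it back before calling produce; a minimal sketch of that save/load round trip with a hypothetical object and file name:

# Minimal sketch of the pickle round trip between train() above and test() in
# Code Example #5. The object and file name are hypothetical stand-ins for the
# fitted d3m runtime and the per-pipeline path under storage_dir.
import pickle

fitted = {'model': 'placeholder'}  # stands in for the fitted pipeline runtime

with open('fitted_solution_example.pkl', 'wb') as fout:
    pickle.dump(fitted, fout)

with open('fitted_solution_example.pkl', 'rb') as fin:
    restored = pickle.load(fin)

assert restored == fitted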
Code Example #7
def sample_rows(
    dataset: container.Dataset,
    main_resource_id: str,
    main_resource_indices_to_keep: typing.Set[int],
    relations_graph: typing.Dict[str,
                                 typing.List[typing.Tuple[str, bool, int, int,
                                                          typing.Dict]]],
    *,
    delete_recursive: bool = False,
) -> container.Dataset:
    # We store rows as sets, but later on we sort them when we select rows.
    row_indices_to_keep_sets: typing.Dict[
        str, typing.Set[int]] = collections.defaultdict(set)
    row_indices_to_keep_sets[main_resource_id] = main_resource_indices_to_keep

    # If "delete_recursive" is set to "False", we do not populate "row_indices_to_keep_sets"
    # with other resources, making "select_rows" simply keep them.
    if delete_recursive:
        # We sort to be deterministic.
        for main_resource_row_index in sorted(
                row_indices_to_keep_sets[main_resource_id]):
            queue = []
            queue.append((main_resource_id, [main_resource_row_index]))
            while queue:
                current_resource_id, current_row_indices = queue.pop(0)
                current_resource = dataset[current_resource_id]

                for edge_resource_id, edge_direction, edge_from_index, edge_to_index, custom_state in relations_graph[
                        current_resource_id]:
                    # All rows from the main resource we want are already there.
                    # TODO: What to do if we get a reference to the row in the main resource which is not part of this sample?
                    #       This means that probably the sample is invalid. We should not be generating such samples which do not
                    #       preserve reference loops and their consistency. Otherwise it is not really possible to denormalize
                    #       such Dataset properly: a reference is referencing a row in the main resource which does not exist.
                    if edge_resource_id == main_resource_id:
                        continue

                    edge_resource = dataset[edge_resource_id]

                    to_column_values = edge_resource.iloc[:, edge_to_index]
                    for from_column_value in current_resource.iloc[
                            current_row_indices, edge_from_index]:
                        # We assume here that "index" corresponds to the default index with row indices.
                        rows_with_value = edge_resource.index[
                            to_column_values == from_column_value]
                        # We sort to be deterministic.
                        new_rows_list = sorted(
                            set(rows_with_value) -
                            row_indices_to_keep_sets[edge_resource_id])
                        row_indices_to_keep_sets[edge_resource_id].update(
                            new_rows_list)
                        queue.append((edge_resource_id, new_rows_list))

    # We sort indices to get deterministic outputs from sets (which do not have deterministic order).
    # We also want to keep the original row order rather than change it.
    # Sorting by row index values assures that.
    row_indices_to_keep = {
        resource_id: sorted(indices)
        for resource_id, indices in row_indices_to_keep_sets.items()
    }

    return dataset.select_rows(row_indices_to_keep)
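
A hedged usage sketch for sample_rows(): the dataset URI, resource IDs, column indices, and relations graph below are hypothetical placeholders; in practice the graph is derived from the Dataset's foreign-key metadata.

# Hedged usage sketch for sample_rows(). The URI, resource IDs, column indices
# and relations graph are hypothetical; normally the graph comes from the
# Dataset's foreign-key metadata.
from d3m import container

dataset = container.Dataset.load('file:///path/to/datasetDoc.json')  # hypothetical path

# One edge from 'learningData' to 'timeseries', following the tuple layout in
# the annotation above: (resource id, direction flag, from column, to column, state).
relations_graph = {
    'learningData': [('timeseries', True, 1, 0, {})],
    'timeseries': [],
}

sampled = sample_rows(
    dataset,
    'learningData',
    {0, 5, 10},  # row indices of the main resource to keep
    relations_graph,
    delete_recursive=True,
)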
Code Example #8
def get_tabular_resource(
    dataset: container.Dataset,
    resource_id: typing.Optional[str],
    *,
    pick_entry_point: bool = True,
    pick_one: bool = True,
    has_hyperparameter: bool = True,
) -> typing.Tuple[str, container.DataFrame]:
    if resource_id is None and pick_entry_point:
        for dataset_resource_id in dataset.keys():
            if dataset.metadata.has_semantic_type(
                    (dataset_resource_id,),
                    'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'):
                resource_id = dataset_resource_id
                break

    if resource_id is None and pick_one:
        tabular_resource_ids = [
            dataset_resource_id
            for dataset_resource_id, dataset_resource in dataset.items()
            if isinstance(dataset_resource, container.DataFrame)
        ]
        if len(tabular_resource_ids) == 1:
            resource_id = tabular_resource_ids[0]

    if resource_id is None:
        if has_hyperparameter:
            if pick_entry_point and pick_one:
                raise ValueError(
                    "A Dataset with multiple tabular resources without an entry point and no resource specified as a hyper-parameter."
                )
            elif pick_entry_point:
                raise ValueError(
                    "A Dataset without an entry point and no resource specified as a hyper-parameter."
                )
            elif pick_one:
                raise ValueError(
                    "A Dataset with multiple tabular resources and no resource specified as a hyper-parameter."
                )
            else:
                raise ValueError("No resource specified as a hyper-parameter.")
        else:
            if pick_entry_point and pick_one:
                raise ValueError(
                    "A Dataset with multiple tabular resources without an entry point."
                )
            elif pick_entry_point:
                raise ValueError("A Dataset without an entry point.")
            elif pick_one:
                raise ValueError("A Dataset with multiple tabular resources.")
            else:
                raise ValueError("No resource specified.")

    else:
        resource = dataset[resource_id]

    if not isinstance(resource, container.DataFrame):
        raise TypeError(
            "The Dataset resource '{resource_id}' is not a DataFrame, but '{type}'."
            .format(
                resource_id=resource_id,
                type=type(resource),
            ))

    return resource_id, resource
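
A hedged usage sketch for get_tabular_resource(), assuming a Dataset has already been loaded (the URI below is a hypothetical placeholder):

# Hedged usage sketch for get_tabular_resource(). The dataset URI is a
# hypothetical placeholder.
from d3m import container

dataset = container.Dataset.load('file:///path/to/datasetDoc.json')

# With resource_id=None the function falls back to the dataset entry point, or
# to the single tabular resource if exactly one exists; otherwise it raises.
resource_id, dataframe = get_tabular_resource(dataset, None)
print(resource_id, dataframe.shape)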
Code Example #9
def tune(pipeline_id, metrics, problem, dataset_uri, sample_dataset_uri, report_rank, timeout_tuning, timeout_run,
         msg_queue, db):
    timeout_tuning = timeout_tuning * 0.9  # FIXME: Save 10% of timeout to score the best config
    # Load pipeline from database
    pipeline = (
        db.query(database.Pipeline)
        .filter(database.Pipeline.id == pipeline_id)
        .options(joinedload(database.Pipeline.modules),
                 joinedload(database.Pipeline.connections))
    ).one()

    logger.info('About to tune pipeline, id=%s, dataset=%r, timeout=%d secs', pipeline_id, dataset_uri, timeout_tuning)
    tunable_primitives = {}

    for primitive in pipeline.modules:
        if is_tunable(primitive.name):
            tunable_primitives[primitive.id] = primitive.name

    if len(tunable_primitives) == 0:
        logger.info('No primitives to be tuned for pipeline %s', pipeline_id)
        sys.exit(1)

    logger.info('Tuning primitives: %s', ', '.join(tunable_primitives.values()))

    if sample_dataset_uri:
        dataset = Dataset.load(sample_dataset_uri)
    else:
        dataset = Dataset.load(dataset_uri)

    task_keywords = problem['problem']['task_keywords']
    scoring_config = {'shuffle': 'true',
                      'stratified': 'true' if TaskKeyword.CLASSIFICATION in task_keywords else 'false',
                      'method': 'K_FOLD',
                      'number_of_folds': '2'}

    metrics_to_use = deepcopy(metrics)
    if metrics[0]['metric'] == PerformanceMetric.F1 and TaskKeyword.SEMISUPERVISED in problem['problem']['task_keywords']:
        metrics_to_use = [{'metric': PerformanceMetric.F1_MACRO}]

    def evaluate_tune(hyperparameter_configuration):
        new_hyperparams = []
        for primitive_id, primitive_name in tunable_primitives.items():
            hy = get_new_hyperparameters(primitive_name, hyperparameter_configuration)
            db_hyperparams = database.PipelineParameter(
                pipeline=pipeline,
                module_id=primitive_id,
                name='hyperparams',
                value=pickle.dumps(hy),
            )
            new_hyperparams.append(db_hyperparams)

        pipeline.parameters += new_hyperparams
        scores = evaluate(pipeline, kfold_tabular_split, dataset, metrics_to_use, problem, scoring_config, dataset_uri,
                          timeout_run)
        first_metric = metrics_to_use[0]['metric'].name
        score_values = []
        for fold_scores in scores.values():
            for metric, score_value in fold_scores.items():
                if metric == first_metric:
                    score_values.append(score_value)

        avg_score = sum(score_values) / len(score_values)
        cost = 1.0 - metrics_to_use[0]['metric'].normalize(avg_score)
        logger.info('Tuning results:\n%s, cost=%s', scores, cost)

        return cost

    # Run tuning, gets best configuration
    tuning = HyperparameterTuning(tunable_primitives.values())
    create_outputfolders(join(os.environ.get('D3MOUTPUTDIR'), 'temp', 'tuning'))
    best_configuration = tuning.tune(evaluate_tune, wallclock=timeout_tuning,
                                     output_dir=join(os.environ.get('D3MOUTPUTDIR'),
                                                     'temp', 'tuning', str(pipeline_id)))

    # Duplicate pipeline in database
    new_pipeline = database.duplicate_pipeline(db, pipeline, 'HyperparameterTuning from pipeline %s' % pipeline_id)

    for primitive in new_pipeline.modules:
        if is_tunable(primitive.name):
            best_hyperparameters = get_new_hyperparameters(primitive.name, best_configuration)
            query = db.query(database.PipelineParameter).filter(database.PipelineParameter.module_id == primitive.id)\
                .filter(database.PipelineParameter.pipeline_id == new_pipeline.id)\
                .filter(database.PipelineParameter.name == 'hyperparams')
            if query.first():
                original_parameters = pickle.loads(query.first().value)
                original_parameters.update(best_hyperparameters)
                query.update({database.PipelineParameter.value: pickle.dumps(original_parameters)})
            else:
                db.add(database.PipelineParameter(
                    pipeline=new_pipeline,
                    module_id=primitive.id,
                    name='hyperparams',
                    value=pickle.dumps(best_hyperparameters),
                ))
    db.commit()

    logger.info('Tuning done, generated new pipeline %s', new_pipeline.id)

    shutil.rmtree(join(os.environ.get('D3MOUTPUTDIR'), 'temp', 'tuning', str(pipeline_id)))

    score(new_pipeline.id, dataset_uri, sample_dataset_uri, metrics, problem, scoring_config, timeout_run, report_rank, None,
          db_filename=join(os.environ.get('D3MOUTPUTDIR'), 'temp', 'db.sqlite3'))
    # TODO: Change this static string path

    msg_queue.send(('tuned_pipeline_id', new_pipeline.id))
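
Inside evaluate_tune() the tuner minimizes one minus the normalized metric averaged over folds; a minimal sketch of that conversion with hypothetical fold scores:

# Minimal sketch of the cost computed in evaluate_tune(). The fold scores are
# hypothetical; normalize() maps a metric value onto [0, 1] with 1 being best,
# so the tuner minimizes 1 minus the normalized average.
from d3m.metadata.problem import PerformanceMetric

fold_scores = [0.81, 0.78, 0.84]
avg_score = sum(fold_scores) / len(fold_scores)
cost = 1.0 - PerformanceMetric.F1_MACRO.normalize(avg_score)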
Code Example #10
def score(pipeline_id, dataset_uri, sample_dataset_uri, metrics, problem,
          scoring_config, timeout_run, report_rank, msg_queue, db):
    dataset_uri_touse = dataset_uri

    if sample_dataset_uri:
        dataset_uri_touse = sample_dataset_uri
    if TaskKeyword.FORECASTING in problem['problem']['task_keywords']:
        check_timeindicator(dataset_uri_touse[7:])

    dataset = Dataset.load(dataset_uri_touse)
    # Get pipeline from database
    pipeline = (
        db.query(database.Pipeline)
        .filter(database.Pipeline.id == pipeline_id)
        .options(joinedload(database.Pipeline.modules),
                 joinedload(database.Pipeline.connections))
    ).one()

    logger.info('About to score pipeline, id=%s, metrics=%s, dataset=%r',
                pipeline_id, metrics, dataset_uri)

    scores = {}
    scores_db = []
    pipeline_split = None

    if TaskKeyword.FORECASTING in problem['problem']['task_keywords']:
        pipeline_split = kfold_timeseries_split

    elif scoring_config['method'] == 'K_FOLD':
        pipeline_split = kfold_tabular_split

    elif scoring_config['method'] == 'HOLDOUT':
        pipeline_split = train_test_tabular_split

    elif scoring_config['method'] == 'RANKING':  # For TA2 only evaluation
        scoring_config['number_of_folds'] = '4'
        report_rank = True
        pipeline_split = kfold_tabular_split
    else:
        logger.warning('Unknown evaluation method, using K_FOLD')
        pipeline_split = kfold_tabular_split

    if (metrics[0]['metric'] == PerformanceMetric.F1
            and TaskKeyword.SEMISUPERVISED in problem['problem']['task_keywords']):
        new_metrics = [{'metric': PerformanceMetric.F1_MACRO}]
        scores = evaluate(pipeline, kfold_tabular_split, dataset, new_metrics,
                          problem, scoring_config, dataset_uri, timeout_run)
        scores = change_name_metric(scores,
                                    new_metrics,
                                    new_metric=metrics[0]['metric'].name)
    else:
        scores = evaluate(pipeline, pipeline_split, dataset, metrics, problem,
                          scoring_config, dataset_uri, timeout_run)

    logger.info("Evaluation results:\n%s", scores)

    if len(scores) > 0:  # It's a valid pipeline
        scores_db = add_scores_db(scores, scores_db)
        if report_rank:  # For TA2 only evaluation
            scores = create_rank_metric(scores, metrics)
            scores_db = add_scores_db(scores, scores_db)
            logger.info("Evaluation results for RANK metric: \n%s", scores)

    # TODO Should we rename CrossValidation table?
    record_db = database.CrossValidation(pipeline_id=pipeline_id,
                                         scores=scores_db)  # Store scores
    db.add(record_db)
    db.commit()
Code Example #11
    def __init__(self, dataset, targets=None, features=None, DBSession=None):
        self.dataset = Dataset.load(dataset)
        self.dataset_uri = dataset
        self.DBSession = DBSession
        self.targets = targets
        self.features = features