Example #1
    def do_score(self, solution_id, dataset_path, problem_path, ta2_id):
        try:
            problem = Problem.load(problem_uri=problem_path)
        except Exception:
            logger.exception('Error parsing problem')
            return None  # Without this, the code below would fail on an undefined `problem`

        # Encode metric
        metrics = []
        for metric in problem['problem']['performance_metrics']:
            metrics.append(encode_performance_metric(metric))

        # Showing only the first metric
        target_metric = problem['problem']['performance_metrics'][0]['metric']
        logger.info('target_metric %s !', target_metric)

        response = self.core.ScoreSolution(
            pb_core.ScoreSolutionRequest(
                solution_id=solution_id,
                inputs=[
                    pb_value.Value(dataset_uri='file://%s' % dataset_path, )
                ],
                performance_metrics=metrics,
                users=[],
                configuration=pb_core.ScoringConfiguration(
                    method='HOLDOUT',
                    train_test_ratio=0.75,
                    shuffle=True,
                    random_seed=0),
            ))
        logger.info('ScoreSolution response %s !', response)

        # Get Results
        results = self.core.GetScoreSolutionResults(
            pb_core.GetScoreSolutionResultsRequest(
                request_id=response.request_id, ))
        for result in results:
            logger.info('result %s !', result)
            if result.progress.state == pb_core.COMPLETED:
                scores = []
                for metric_score in result.scores:
                    metric = decode_performance_metric(
                        metric_score.metric)['metric']
                    if metric == target_metric:
                        score = decode_value(metric_score.value)['value']
                        scores.append(score)
                if len(scores) > 0:
                    avg_score = round(sum(scores) / len(scores), 5)
                    normalized_score = PerformanceMetric[
                        target_metric.name].normalize(avg_score)

                    return {
                        'score': avg_score,
                        'normalized_score': normalized_score,
                        'metric': target_metric.name.lower()
                    }
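
A hedged sketch of how the dictionary returned above might be consumed; `client` stands for an instance of the surrounding TA3 client class, and every id and path below is a placeholder rather than a value taken from the original code.

# All ids/paths are placeholders; a real solution_id comes from a prior SearchSolutions call.
result = client.do_score(
    solution_id='32b24d72-44c6-4956-bc21-835cb42f0f2e',
    dataset_path='/datasets/38_sick/TRAIN/dataset_TRAIN/datasetDoc.json',
    problem_path='file:///datasets/38_sick/TRAIN/problem_TRAIN/problemDoc.json',
    ta2_id='ta2')
if result is not None:
    print('%s: %s (normalized %s)'
          % (result['metric'], result['score'], result['normalized_score']))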
Example #2
def generate_problem_description(dataset, task=None, *, task_keywords=None, performance_metrics=None):
    """
    A function that simplifies the generation of a problem description.

    Parameters
    ----------
    dataset : Dataset
        Dataset to be used for the pipeline search.
    task : str
        A string that represents the problem type; currently only ``binary_classification`` and
        ``regression`` are supported.
    task_keywords : List[TaskKeyword]
        A list of TaskKeyword.
    performance_metrics : List[PerformanceMetric]
        A list of PerformanceMetric.

    Returns
    -------
    Problem
        A problem description built from the dataset and the task information.
    """
    dataset_id = dataset.metadata.query(())['id']
    problem_id = dataset_id + '_problem'
    schema = 'https://metadata.datadrivendiscovery.org/schemas/v0/problem.json'
    version = '4.0.0'

    target_column_index = None

    for i in range(dataset.metadata.query(('learningData', metadata_base.ALL_ELEMENTS,))['dimension']['length']):
        if 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in \
                dataset.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, i,))['semantic_types']:
            target_column_index = i
            break

    if target_column_index is None:
        raise ValueError('Input dataframe does not contain targets')

    inputs = {
        'dataset_id': dataset_id,
        'targets': [{
            'column_index': target_column_index,
            'column_name': dataset.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, target_column_index,))['name'],
            'resource_id': 'learningData',
            'target_index': 0
        }]
    }

    problem = None
    if task is None:
        if performance_metrics is not None and task_keywords is not None:
            problem = {
                'performance_metrics': performance_metrics,
                'task_keywords': task_keywords
            }
    else:
        if task in PROBLEM_DEFINITION:
            problem = PROBLEM_DEFINITION[task]
        else:
            raise ValueError(task + """ task is not supported in the default definitions.
            You can define your own task by specifying task_keywords and performance_metrics.""")

    problem_description = {
        'id': problem_id,
        'schema': schema,
        'version': version,
        'inputs': [inputs],
        'problem': problem
    }

    return Problem(problem_description)
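
A minimal usage sketch of the helper above; the dataset path is a placeholder, and the explicit keyword/metric structure is an assumption based on the d3m problem schema (compare the entries in PROBLEM_DEFINITION before relying on it).

from d3m import container
from d3m.metadata.problem import TaskKeyword, PerformanceMetric

# Placeholder path; point this at a real datasetDoc.json.
dataset = container.Dataset.load('file:///datasets/38_sick/38_sick_dataset/datasetDoc.json')

# Variant 1: rely on a built-in task definition.
problem = generate_problem_description(dataset, task='binary_classification')

# Variant 2: spell out the task explicitly (keyword/metric structure assumed, see note above).
problem = generate_problem_description(
    dataset,
    task_keywords=[TaskKeyword.CLASSIFICATION, TaskKeyword.BINARY],
    performance_metrics=[{'metric': PerformanceMetric.F1_MACRO, 'params': {}}])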
Example #3
    def _build_problem(self, dataset):
        problem = Problem.load(problem_uri=self._get_problem_doc_path(dataset))
        return encode_problem_description(problem)
Example #4
    def _build_problem(self, problem_path):
        problem = Problem.load(problem_uri=problem_path)
        return encode_problem_description(problem)
Example #5
def search_pipelines(datasets, time_bound=10, use_template=False):
    search_results_path = join(D3MOUTPUTDIR, 'temp', 'search_results.json')
    search_results = load_search_results(search_results_path)
    channel = grpc.insecure_channel('localhost:45042')
    core = LoggingStub(pb_core_grpc.CoreStub(channel), logger)
    size = len(datasets)
    pipeline_template = None

    if use_template:
        pipeline_template = load_template()

    for i, dataset in enumerate(datasets):
        logger.info('Processing dataset "%s" (%d/%d)' % (dataset, i + 1, size))
        start_time = datetime.now()

        dataset_train_path = join(D3MINPUTDIR, dataset,
                                  'TRAIN/dataset_TRAIN/datasetDoc.json')
        problem_path = join(D3MINPUTDIR, dataset,
                            'TRAIN/problem_TRAIN/problemDoc.json')

        if not os.path.isfile(problem_path):
            logger.error("Problem file (%s) doesn't exist", problem_path)
            continue

        try:
            problem = Problem.load(problem_uri=fix_uri(problem_path))
        except Exception:
            logger.exception('Error parsing problem')
            continue

        task_keywords = '_'.join(
            [x.name for x in problem['problem']['task_keywords']])
        search_id, pipelines = do_search(core,
                                         problem,
                                         dataset_train_path,
                                         time_bound=time_bound,
                                         pipelines_limit=0,
                                         pipeline_template=pipeline_template)
        #print(dataset, problem['problem']['performance_metrics'][0]['metric'].name, task_keywords)
        number_pipelines = len(pipelines)
        result = {
            'search_id': search_id,
            'task': task_keywords,
            'search_time': str(datetime.now() - start_time),
            'pipelines': number_pipelines,
            'best_time': 'None',
            'best_score': 'None',
            'all_scores': []
        }

        if number_pipelines > 0:
            best_time = sorted(pipelines.values(), key=lambda x: x[2])[0][2]
            sorted_pipelines = sorted(pipelines.items(),
                                      key=lambda x: x[1][0],
                                      reverse=True)
            all_scores = []

            for pipeline_id, (_, pipeline, _) in sorted_pipelines:
                if use_template:  # FIXME: Pipeline score is not calculated when working with a fully defined pipeline
                    pipeline_score = 1.0
                else:
                    pipeline_score = decode_value(
                        pipeline[0].scores[0].value)['value']
                all_scores.append({'id': pipeline_id, 'score': pipeline_score})
                #do_score(core, problem, [pipeline_id], dataset_train_path)
                #fitted_pipeline = do_train(core, [pipeline_id], dataset_train_path)
                #do_save_fitted_solution(core, fitted_pipeline)
                #do_test(core, fitted_pipeline, dataset_train_path.replace('TRAIN', 'TEST'))
                #do_export(core, fitted_pipeline)
                #do_describe(core, [pipeline_id])

            result['pipelines'] = number_pipelines
            result['best_time'] = best_time
            result['best_score'] = all_scores[0]['score']
            result['all_scores'] = all_scores

        search_results[dataset] = result

        with open(search_results_path, 'w') as fout:
            json.dump(search_results, fout, indent=4)
Example #6
def load_problem(root_path, phase):
    path = os.path.join(root_path, phase, 'problem_' + phase, 'problemDoc.json')
    return Problem.load(problem_uri='file://' + os.path.abspath(path))
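
A short usage sketch for load_problem, assuming the usual D3M seed-dataset layout (<root>/<phase>/problem_<phase>/problemDoc.json); the root directory is a placeholder.

# Placeholder root directory of a seed dataset.
problem = load_problem('/datasets/seed_datasets_current/38_sick', 'TRAIN')
print(problem['problem']['task_keywords'])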
Example #7
    # pp.fit_and_produce()
    # pp.save()
    data_dir = "/Users/muxin/Desktop/ISI/dsbox-env/output/seed/38_sick/"
    log_dir = '/Users/muxin/Desktop/studies/master/2018Summer/data/log'
    pids = [
        '32b24d72-44c6-4956-bc21-835cb42f0f2e',
        'a8f4001a-64f4-4ff1-a89d-3548f4dfeb88',
        '5e1d9723-ec02-46d2-abdf-46389fba8e52'
    ]
    dataset = container.Dataset.load(
        'file:///Users/muxin/Desktop/ISI/dsbox-env/data/datasets/seed_datasets_current/38_sick/38_sick_dataset/datasetDoc.json'
    )
    set_target_column(dataset)
    problem_doc_path = os.path.abspath(
        "/Users/muxin/Desktop/ISI/dsbox-env/data/datasets/seed_datasets_current/38_sick/38_sick_problem/problemDoc.json"
    )
    problem = Problem.load('file://' + problem_doc_path)
    with open(problem_doc_path) as file:
        problem_doc = json.load(file)
    qq = HorizontalTuningPipeline(pipeline_files_dir=data_dir,
                                  pids=None,
                                  problem=problem,
                                  train_dataset=dataset,
                                  test_dataset=dataset)
    qq.generate_candidate_pids()
    print(qq.pids)
    qq.generate_ensemble_pipeline()
    qq.fit_and_produce()
    print(qq.fitted_pipeline.get_produce_step_output(0))
    qq.save()
Example #8
    def _load_problem(self):
        if self.problem_schema == '':
            return
        self.problem = Problem.load('file://' +
                                    os.path.abspath(self.problem_schema))
        self._load_problem_rest()
Example #9
def load_problem(root_path, phase):
    path = os.path.join(root_path, phase, 'problem_' + phase,
                        'problemDoc.json')
    return Problem.load(problem_uri=path)