Example 1
    def SplitData(self, request, context):
        input_data = [load_data(utils.decode_value(x)) for x in request.inputs]
        scoring_configuration = decode_scoring_configuration(
            request.scoring_configuration)
        problem_description = utils.decode_problem_description(request.problem)
        data_pipeline = schemas_utils.get_splitting_pipeline(
            scoring_configuration['method'])

        data_random_seed = 0
        outputs, data_result = runtime_module.prepare_data(
            data_pipeline=data_pipeline,
            problem_description=problem_description,
            inputs=input_data,
            data_params=scoring_configuration,
            context=Context.TESTING,
            random_seed=data_random_seed,
            volumes_dir=EnvVars.D3MSTATICDIR,
            scratch_dir=Path.TEMP_STORAGE_ROOT,
            runtime_environment=None,
        )

        if data_result.has_error():
            logger.info('method=SplitData, error=%s', data_result.error)
            response = core_pb2.SplitDataResponse()
            yield response
            return
        else:
            for i, (train_output, test_output,
                    score_output) in enumerate(zip(*outputs)):
                uri_list = []
                for output, tag in (
                    (train_output, 'train'),
                    (test_output, 'test'),
                    (score_output, 'score'),
                ):
                    path = os.path.join(Path.TEMP_STORAGE_ROOT,
                                        '{}_output_{}'.format(tag, i),
                                        'datasetDoc.json')
                    uri = get_uri(path)
                    output.save(uri)
                    uri_list.append(uri)
                # response
                response = core_pb2.SplitDataResponse(
                    train_output=value_pb2.Value(dataset_uri=uri_list[0]),
                    test_output=value_pb2.Value(dataset_uri=uri_list[1]),
                    score_output=value_pb2.Value(dataset_uri=uri_list[2]),
                )
                yield response
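
The handler above is the server side of a streaming SplitData RPC: it runs the data-preparation pipeline for the requested splitting method, saves each train/test/score fold under TEMP_STORAGE_ROOT, and yields one SplitDataResponse per fold (or a single empty response when preparation fails). A client consuming that stream might look like the sketch below; SplitData is not part of the standard TA3-TA2 Core service, so the generated stub module, the service class, and the request message name are assumptions rather than facts taken from the example.

# Hypothetical client for the streaming SplitData RPC above; the stub and
# message names (core_pb2_grpc.CoreStub, SplitDataRequest) are assumed.
import grpc

channel = grpc.insecure_channel('localhost:45042')
stub = core_pb2_grpc.CoreStub(channel)

split_request = core_pb2.SplitDataRequest(  # the handler also reads a 'problem' field, omitted here
    inputs=[value_pb2.Value(dataset_uri='file:///path/to/datasetDoc.json')],
    scoring_configuration=core_pb2.ScoringConfiguration(method='K_FOLD', folds=2),
)
for response in stub.SplitData(split_request):
    print(response.train_output.dataset_uri,
          response.test_output.dataset_uri,
          response.score_output.dataset_uri)
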
Example 2
    def do_test(self, fitted_solution_id, dataset_path):
        tested = None
        try:
            response = self.core.ProduceSolution(
                pb_core.ProduceSolutionRequest(
                    fitted_solution_id=fitted_solution_id,
                    inputs=[
                        pb_value.Value(dataset_uri='file://%s' %
                                       dataset_path, )
                    ],
                    expose_outputs=['outputs.0'],
                    expose_value_types=['CSV_URI'],
                    users=[],
                ))
            # Results
            results = self.core.GetProduceSolutionResults(
                pb_core.GetProduceSolutionResultsRequest(
                    request_id=response.request_id, ))
            for result in results:
                if result.progress.state == pb_core.COMPLETED:
                    tested = result.exposed_outputs['outputs.0'].csv_uri
        except Exception:
            logger.exception("Exception testing %r", fitted_solution_id)

        return tested
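
do_test above wraps the ProduceSolution / GetProduceSolutionResults pair: it asks the TA2 to run an already fitted solution on a new dataset, exposes the final step as outputs.0, and returns the CSV URI of the predictions (or None on failure). The self.core attribute is a gRPC Core stub; one plausible way to construct it is sketched below, where the generated module name (pb_core_grpc) and the TA2 endpoint are assumptions.

# Hypothetical constructor for the client class that owns do_test; the
# stub module (pb_core_grpc) and the address are assumptions.
import grpc

class TA2Client:
    def __init__(self, address='localhost:45042'):
        channel = grpc.insecure_channel(address)
        self.core = pb_core_grpc.CoreStub(channel)  # used by do_test above
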
Example 3
def score_solution_request(solution_id, test_paths):
    inputs = [value_pb2.Value(dataset_uri=test_paths['SCORE']['dataset'])]

    problem = problem_module.Problem.load(test_paths['SCORE']['problem'])
    performance_metrics = []
    for performance_metric in problem['problem'].get('performance_metrics',
                                                     []):
        performance_metrics.append(
            utils.encode_performance_metric(performance_metric))

    # TODO add support for more evaluation methods
    users = []
    evaluation_method = 'K_FOLD'
    configuration = core_pb2.ScoringConfiguration(
        method=evaluation_method,
        folds=2,
        # train_test_ratio
        shuffle=True,
        random_seed=42,
        stratified=True,
    )
    request = core_pb2.ScoreSolutionRequest(
        solution_id=solution_id,
        inputs=inputs,
        performance_metrics=performance_metrics,
        users=users,
        configuration=configuration)
    return request
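
score_solution_request only builds the message; it still has to be sent through the Core service, the way Example 4 does inline. A minimal sketch, assuming core_stub is an already constructed Core stub:

# Hypothetical use of the builder above; core_stub is assumed to exist
# (see Example 4 for the equivalent inline call).
request = score_solution_request(solution_id, test_paths)
response = core_stub.ScoreSolution(request)
results = core_stub.GetScoreSolutionResults(
    core_pb2.GetScoreSolutionResultsRequest(request_id=response.request_id))
for result in results:
    if result.progress.state == core_pb2.COMPLETED:
        for metric_score in result.scores:
            print(metric_score.metric, metric_score.value)
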
Example 4
    def do_score(self, solution_id, dataset_path, problem_path, ta2_id):
        try:
            problem = Problem.load(problem_uri=problem_path)
        except Exception:
            logger.exception('Error parsing problem')
            return None  # problem is undefined beyond this point

        # Encode metric
        metrics = []
        for metric in problem['problem']['performance_metrics']:
            metrics.append(encode_performance_metric(metric))

        # Showing only the first metric
        target_metric = problem['problem']['performance_metrics'][0]['metric']
        logger.info('target_metric %s !', target_metric)

        response = self.core.ScoreSolution(
            pb_core.ScoreSolutionRequest(
                solution_id=solution_id,
                inputs=[
                    pb_value.Value(dataset_uri='file://%s' % dataset_path, )
                ],
                performance_metrics=metrics,
                users=[],
                configuration=pb_core.ScoringConfiguration(
                    method='HOLDOUT',
                    train_test_ratio=0.75,
                    shuffle=True,
                    random_seed=0),
            ))
        logger.info('ScoreSolution response %s !', response)

        # Get Results
        results = self.core.GetScoreSolutionResults(
            pb_core.GetScoreSolutionResultsRequest(
                request_id=response.request_id, ))
        for result in results:
            logger.info('result %s !', result)
            if result.progress.state == pb_core.COMPLETED:
                scores = []
                for metric_score in result.scores:
                    metric = decode_performance_metric(
                        metric_score.metric)['metric']
                    if metric == target_metric:
                        score = decode_value(metric_score.value)['value']
                        scores.append(score)
                if len(scores) > 0:
                    avg_score = round(sum(scores) / len(scores), 5)
                    normalized_score = PerformanceMetric[
                        target_metric.name].normalize(avg_score)

                    return {
                        'score': avg_score,
                        'normalized_score': normalized_score,
                        'metric': target_metric.name.lower()
                    }
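
A hedged sketch of how do_score might be called; the instance name, paths, and ta2_id below are placeholders, and the dictionary printed at the end is the one assembled in the COMPLETED branch above.

# Hypothetical call; every path and id here is a placeholder.
result = client.do_score(
    solution_id='<solution-id>',
    dataset_path='/path/to/SCORE/dataset_SCORE/datasetDoc.json',
    problem_path='/path/to/SCORE/problem_SCORE/problemDoc.json',
    ta2_id='example-ta2',
)
if result is not None:
    print(result['metric'], result['score'], result['normalized_score'])
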
Example 5
def fit_solution_request(solution_id, test_paths):
    inputs = [value_pb2.Value(dataset_uri=test_paths['TRAIN']['dataset'])]
    expose_outputs = ['outputs.0']
    expose_value_types = ['CSV_URI']
    users = [
        core_pb2.SolutionRunUser(id='test_user',
                                 chosen=True,
                                 reason='just because')
    ]
    request = core_pb2.FitSolutionRequest(
        solution_id=solution_id,
        inputs=inputs,
        expose_outputs=expose_outputs,
        expose_value_types=expose_value_types,
        users=users)
    return request
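
As with Example 3, fit_solution_request only constructs the request; sending it and collecting the fitted solution id follows the same pattern Example 6 uses. A sketch, again assuming core_stub is an existing Core stub:

# Hypothetical use of the builder above; core_stub is assumed to exist.
request = fit_solution_request(solution_id, test_paths)
response = core_stub.FitSolution(request)
results = core_stub.GetFitSolutionResults(
    core_pb2.GetFitSolutionResultsRequest(request_id=response.request_id))
fitted_solution_id = None
for result in results:
    if result.progress.state == core_pb2.COMPLETED:
        fitted_solution_id = result.fitted_solution_id
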
Example 6
    def do_train(self, solution_id, dataset_path):
        fitted_solution = None
        try:
            response = self.core.FitSolution(
                pb_core.FitSolutionRequest(
                    solution_id=solution_id,
                    inputs=[pb_value.Value(dataset_uri=dataset_path, )],
                    expose_outputs=[],
                    expose_value_types=['CSV_URI'],
                    users=[self.user],
                ))
            # Results
            results = self.core.GetFitSolutionResults(
                pb_core.GetFitSolutionResultsRequest(
                    request_id=response.request_id, ))
            for result in results:
                if result.progress.state == pb_core.COMPLETED:
                    fitted_solution = result.fitted_solution_id
        except Exception:
            logger.exception("Exception training %r", solution_id)

        return fitted_solution
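
do_train and do_test (Example 2) are meant to be chained: the fitted_solution_id returned here is the handle that ProduceSolution expects. A hedged sketch of that flow, with placeholder dataset locations; note that do_train passes dataset_path straight through as a URI, while do_test prepends file:// itself.

# Hypothetical train-then-test flow combining Examples 6 and 2; paths are placeholders.
fitted_id = client.do_train(solution_id, 'file:///path/to/TRAIN/dataset_TRAIN/datasetDoc.json')
if fitted_id is not None:
    predictions_uri = client.do_test(fitted_id, '/path/to/TEST/dataset_TEST/datasetDoc.json')
    print(predictions_uri)
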
Example 7
def search_solutions_request(test_paths, specified_template=None):
    user_agent = "test_agent"
    version = core_pb2.DESCRIPTOR.GetOptions().Extensions[
        core_pb2.protocol_version]

    time_bound = 0.5
    priority = 10
    # allowed_value_types = [value_pb2.ValueType.Value(value) for value in ALLOWED_VALUE_TYPES]

    problem_description = utils.encode_problem_description(
        problem_module.Problem.load(test_paths['TRAIN']['problem']))

    template = None
    if specified_template == 'FULL':
        with d3m_utils.silence():
            pipeline = pipeline_utils.load_pipeline(
                FULL_SPECIFIED_PIPELINE_PATH)
        template = utils.encode_pipeline_description(
            pipeline, ALLOWED_VALUE_TYPES, constants.Path.TEMP_STORAGE_ROOT)
    elif specified_template == 'PRE':  # PRE for PREPROCESSING
        pipeline = runtime_module.get_pipeline(PRE_SPECIFIED_PIPELINE_PATH,
                                               load_all_primitives=False)
        template = utils.encode_pipeline_description(
            pipeline, ALLOWED_VALUE_TYPES, constants.Path.TEMP_STORAGE_ROOT)

    inputs = [value_pb2.Value(dataset_uri=test_paths['TRAIN']['dataset'])]

    request = core_pb2.SearchSolutionsRequest(
        user_agent=user_agent,
        version=version,
        time_bound_search=time_bound,
        priority=priority,
        allowed_value_types=ALLOWED_VALUE_TYPES,
        problem=problem_description,
        template=template,
        inputs=inputs)
    return request
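
The request built here starts a solution search; candidate solutions come back through the matching results stream. A minimal sketch, assuming core_stub is a Core stub and that the service follows the standard SearchSolutions / GetSearchSolutionsResults pair of the TA3-TA2 API:

# Hypothetical use of the builder above; the stub and the
# GetSearchSolutionsResults message names are assumed from the TA3-TA2 API.
request = search_solutions_request(test_paths)
response = core_stub.SearchSolutions(request)
solution_ids = []
for result in core_stub.GetSearchSolutionsResults(
        core_pb2.GetSearchSolutionsResultsRequest(search_id=response.search_id)):
    if result.solution_id:
        solution_ids.append(result.solution_id)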