def SplitData(self, request, context):
    """Split the input dataset into train/test/score folds.

    Runs the data-preparation (splitting) pipeline selected by the
    requested scoring method, saves each fold to disk, and streams one
    SplitDataResponse per fold. On a data-preparation error a single
    empty response is yielded instead.
    """
    input_data = [load_data(utils.decode_value(x)) for x in request.inputs]
    scoring_configuration = decode_scoring_configuration(
        request.scoring_configuration)
    problem_description = utils.decode_problem_description(request.problem)
    # The splitting pipeline is chosen by the evaluation method
    # (e.g. HOLDOUT, K_FOLD) carried in the scoring configuration.
    data_pipeline = schemas_utils.get_splitting_pipeline(
        scoring_configuration['method'])
    data_random_seed = 0
    outputs, data_result = runtime_module.prepare_data(
        data_pipeline=data_pipeline,
        problem_description=problem_description,
        inputs=input_data,
        data_params=scoring_configuration,
        context=Context.TESTING,
        random_seed=data_random_seed,
        volumes_dir=EnvVars.D3MSTATICDIR,
        scratch_dir=Path.TEMP_STORAGE_ROOT,
        runtime_environment=None,
    )
    if data_result.has_error():
        # BUG FIX: stdlib logging uses lazy %-style formatting for extra
        # args; the original '{}' placeholder made the logging machinery
        # raise a formatting error instead of rendering the message.
        logger.info('method=SplitData, error=%s', data_result.error)
        yield core_pb2.SplitDataResponse()
        return

    # One response per fold: persist each split dataset and report its URI.
    for i, (train_output, test_output, score_output) in enumerate(
            zip(*outputs)):
        uri_list = []
        for output, tag in (
            (train_output, 'train'),
            (test_output, 'test'),
            (score_output, 'score'),
        ):
            path = os.path.join(Path.TEMP_STORAGE_ROOT,
                                '{}_output_{}'.format(tag, i),
                                'datasetDoc.json')
            uri = get_uri(path)
            output.save(uri)
            uri_list.append(uri)
        # response
        yield core_pb2.SplitDataResponse(
            train_output=value_pb2.Value(dataset_uri=uri_list[0]),
            test_output=value_pb2.Value(dataset_uri=uri_list[1]),
            score_output=value_pb2.Value(dataset_uri=uri_list[2]),
        )
def do_test(self, fitted_solution_id, dataset_path):
    """Produce predictions with a fitted solution on *dataset_path*.

    Returns the CSV URI of the exposed 'outputs.0' predictions, or None
    when the produce call fails or never reports COMPLETED.
    """
    tested = None
    try:
        response = self.core.ProduceSolution(
            pb_core.ProduceSolutionRequest(
                fitted_solution_id=fitted_solution_id,
                inputs=[
                    pb_value.Value(dataset_uri='file://%s' % dataset_path, )
                ],
                expose_outputs=['outputs.0'],
                expose_value_types=['CSV_URI'],
                users=[],
            ))
        # Results: stream progress updates until the run completes.
        results = self.core.GetProduceSolutionResults(
            pb_core.GetProduceSolutionResultsRequest(
                request_id=response.request_id,
            ))
        for result in results:
            if result.progress.state == pb_core.COMPLETED:
                tested = result.exposed_outputs['outputs.0'].csv_uri
    except Exception:
        # BUG FIX: was a bare 'except:', which also swallowed
        # SystemExit/KeyboardInterrupt; keep the best-effort behavior but
        # only for real errors.
        logger.exception("Exception testing %r", fitted_solution_id)
    return tested
def score_solution_request(solution_id, test_paths):
    """Build a ScoreSolutionRequest over the SCORE split of *test_paths*.

    Encodes every performance metric declared by the SCORE problem and
    requests a stratified, shuffled 2-fold cross-validation.
    """
    score_paths = test_paths['SCORE']
    problem = problem_module.Problem.load(score_paths['problem'])
    performance_metrics = [
        utils.encode_performance_metric(metric)
        for metric in problem['problem'].get('performance_metrics', [])
    ]
    # TODO add support for more evaluation methods
    configuration = core_pb2.ScoringConfiguration(
        method='K_FOLD',
        folds=2,
        # train_test_ratio
        shuffle=True,
        random_seed=42,
        stratified=True,
    )
    return core_pb2.ScoreSolutionRequest(
        solution_id=solution_id,
        inputs=[value_pb2.Value(dataset_uri=score_paths['dataset'])],
        performance_metrics=performance_metrics,
        users=[],
        configuration=configuration,
    )
def do_score(self, solution_id, dataset_path, problem_path, ta2_id):
    """Score a solution with the TA2's ScoreSolution call.

    Uses a HOLDOUT (75/25, shuffled) evaluation and averages the scores
    reported for the problem's first performance metric.

    Returns a dict with 'score', 'normalized_score' and 'metric', or
    None when the problem cannot be parsed or no completed result with a
    matching score arrives.
    """
    try:
        problem = Problem.load(problem_uri=problem_path)
    except Exception:
        # BUG FIX: the original bare 'except:' logged and then fell
        # through, crashing with NameError because 'problem' was unbound.
        logger.exception('Error parsing problem')
        return None

    # Encode metric
    metrics = [encode_performance_metric(metric)
               for metric in problem['problem']['performance_metrics']]

    # Showing only the first metric
    target_metric = problem['problem']['performance_metrics'][0]['metric']
    logger.info('target_metric %s !', target_metric)

    response = self.core.ScoreSolution(
        pb_core.ScoreSolutionRequest(
            solution_id=solution_id,
            inputs=[
                pb_value.Value(dataset_uri='file://%s' % dataset_path, )
            ],
            performance_metrics=metrics,
            users=[],
            configuration=pb_core.ScoringConfiguration(
                method='HOLDOUT',
                train_test_ratio=0.75,
                shuffle=True,
                random_seed=0),
        ))
    logger.info('ScoreSolution response %s !', response)

    # Get Results
    results = self.core.GetScoreSolutionResults(
        pb_core.GetScoreSolutionResultsRequest(
            request_id=response.request_id,
        ))
    for result in results:
        logger.info('result %s !', result)
        if result.progress.state == pb_core.COMPLETED:
            # Collect only the scores for the target metric.
            scores = []
            for metric_score in result.scores:
                metric = decode_performance_metric(
                    metric_score.metric)['metric']
                if metric == target_metric:
                    score = decode_value(metric_score.value)['value']
                    scores.append(score)
            if len(scores) > 0:
                avg_score = round(sum(scores) / len(scores), 5)
                normalized_score = PerformanceMetric[
                    target_metric.name].normalize(avg_score)
                return {
                    'score': avg_score,
                    'normalized_score': normalized_score,
                    'metric': target_metric.name.lower(),
                }
    # No completed result carried a usable score.
    return None
def fit_solution_request(solution_id, test_paths):
    """Build a FitSolutionRequest over the TRAIN split of *test_paths*.

    Exposes the 'outputs.0' predictions as a CSV URI and attaches a
    single test user to the run.
    """
    train_uri = test_paths['TRAIN']['dataset']
    run_user = core_pb2.SolutionRunUser(
        id='test_user',
        chosen=True,
        reason='just because',
    )
    return core_pb2.FitSolutionRequest(
        solution_id=solution_id,
        inputs=[value_pb2.Value(dataset_uri=train_uri)],
        expose_outputs=['outputs.0'],
        expose_value_types=['CSV_URI'],
        users=[run_user],
    )
def do_train(self, solution_id, dataset_path):
    """Fit a solution on *dataset_path* via the TA2's FitSolution call.

    Returns the fitted solution id, or None when the fit fails or never
    reports COMPLETED.
    """
    fitted_solution = None
    try:
        response = self.core.FitSolution(
            pb_core.FitSolutionRequest(
                solution_id=solution_id,
                inputs=[pb_value.Value(dataset_uri=dataset_path, )],
                expose_outputs=[],
                expose_value_types=['CSV_URI'],
                users=[self.user],
            ))
        # Results: stream progress updates until the run completes.
        results = self.core.GetFitSolutionResults(
            pb_core.GetFitSolutionResultsRequest(
                request_id=response.request_id,
            ))
        for result in results:
            if result.progress.state == pb_core.COMPLETED:
                fitted_solution = result.fitted_solution_id
    except Exception:
        # BUG FIX: was a bare 'except:', which also swallowed
        # SystemExit/KeyboardInterrupt; keep the best-effort behavior but
        # only for real errors.
        logger.exception("Exception training %r", solution_id)
    return fitted_solution
def search_solutions_request(test_paths, specified_template=None):
    """Build a SearchSolutionsRequest over the TRAIN split of *test_paths*.

    *specified_template* selects an optional pipeline template: 'FULL'
    loads the fully-specified pipeline, 'PRE' loads the preprocessing
    pipeline, anything else sends no template.
    """
    version = core_pb2.DESCRIPTOR.GetOptions().Extensions[
        core_pb2.protocol_version]
    # allowed_value_types = [value_pb2.ValueType.Value(value) for value in ALLOWED_VALUE_TYPES]
    problem_description = utils.encode_problem_description(
        problem_module.Problem.load(test_paths['TRAIN']['problem']))

    template = None
    if specified_template == 'FULL':
        with d3m_utils.silence():
            pipeline = pipeline_utils.load_pipeline(
                FULL_SPECIFIED_PIPELINE_PATH)
            template = utils.encode_pipeline_description(
                pipeline, ALLOWED_VALUE_TYPES, constants.Path.TEMP_STORAGE_ROOT)
    elif specified_template == 'PRE':  # PRE for PREPROCESSING
        pipeline = runtime_module.get_pipeline(
            PRE_SPECIFIED_PIPELINE_PATH, load_all_primitives=False)
        template = utils.encode_pipeline_description(
            pipeline, ALLOWED_VALUE_TYPES, constants.Path.TEMP_STORAGE_ROOT)

    return core_pb2.SearchSolutionsRequest(
        user_agent='test_agent',
        version=version,
        time_bound_search=0.5,
        priority=10,
        allowed_value_types=ALLOWED_VALUE_TYPES,
        problem=problem_description,
        template=template,
        inputs=[value_pb2.Value(dataset_uri=test_paths['TRAIN']['dataset'])],
    )