def do_score(self, solution_id, dataset_path, problem_path, ta2_id):
    try:
        problem = Problem.load(problem_uri=problem_path)
    except Exception:
        logger.exception('Error parsing problem')
        return

    # Encode metrics
    metrics = []
    for metric in problem['problem']['performance_metrics']:
        metrics.append(encode_performance_metric(metric))

    # Showing only the first metric
    target_metric = problem['problem']['performance_metrics'][0]['metric']
    logger.info('target_metric %s !', target_metric)

    response = self.core.ScoreSolution(pb_core.ScoreSolutionRequest(
        solution_id=solution_id,
        inputs=[pb_value.Value(dataset_uri='file://%s' % dataset_path)],
        performance_metrics=metrics,
        users=[],
        configuration=pb_core.ScoringConfiguration(
            method='HOLDOUT',
            train_test_ratio=0.75,
            shuffle=True,
            random_seed=0
        ),
    ))
    logger.info('ScoreSolution response %s !', response)

    # Get results
    results = self.core.GetScoreSolutionResults(pb_core.GetScoreSolutionResultsRequest(
        request_id=response.request_id,
    ))
    for result in results:
        logger.info('result %s !', result)
        if result.progress.state == pb_core.COMPLETED:
            scores = []
            for metric_score in result.scores:
                metric = decode_performance_metric(metric_score.metric)['metric']
                if metric == target_metric:
                    score = decode_value(metric_score.value)['value']
                    scores.append(score)

            if len(scores) > 0:
                avg_score = round(sum(scores) / len(scores), 5)
                normalized_score = PerformanceMetric[target_metric.name].normalize(avg_score)

                return {
                    'score': avg_score,
                    'normalized_score': normalized_score,
                    'metric': target_metric.name.lower()
                }
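# Hedged usage sketch (not from the original source): 'client' stands in for whatever
# object exposes do_score(); the argument values below are hypothetical placeholders.
result = client.do_score(solution_id='<solution-id>',
                         dataset_path='/path/to/TEST/dataset_TEST/datasetDoc.json',
                         problem_path='file:///path/to/TEST/problem_TEST/problemDoc.json',
                         ta2_id='ta2')
if result is not None:
    logger.info('score=%s normalized=%s metric=%s',
                result['score'], result['normalized_score'], result['metric'])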
def _build_problem(self, dataset):
    problem = Problem.load(problem_uri=self._get_problem_doc_path(dataset))
    return encode_problem_description(problem)
def _build_problem(self, problem_path):
    problem = Problem.load(problem_uri=problem_path)
    return encode_problem_description(problem)
def search_pipelines(datasets, time_bound=10, use_template=False):
    search_results_path = join(D3MOUTPUTDIR, 'temp', 'search_results.json')
    search_results = load_search_results(search_results_path)
    channel = grpc.insecure_channel('localhost:45042')
    core = LoggingStub(pb_core_grpc.CoreStub(channel), logger)
    size = len(datasets)

    pipeline_template = None
    if use_template:
        pipeline_template = load_template()

    for i, dataset in enumerate(datasets):
        logger.info('Processing dataset "%s" (%d/%d)', dataset, i + 1, size)
        start_time = datetime.now()
        dataset_train_path = join(D3MINPUTDIR, dataset, 'TRAIN/dataset_TRAIN/datasetDoc.json')
        problem_path = join(D3MINPUTDIR, dataset, 'TRAIN/problem_TRAIN/problemDoc.json')

        if not os.path.isfile(problem_path):
            logger.error('Problem file (%s) does not exist', problem_path)
            continue

        try:
            problem = Problem.load(problem_uri=fix_uri(problem_path))
        except Exception:
            logger.exception('Error parsing problem')
            continue

        task_keywords = '_'.join([x.name for x in problem['problem']['task_keywords']])
        search_id, pipelines = do_search(core, problem, dataset_train_path,
                                         time_bound=time_bound, pipelines_limit=0,
                                         pipeline_template=pipeline_template)
        #print(dataset, problem['problem']['performance_metrics'][0]['metric'].name, task_keywords)

        number_pipelines = len(pipelines)
        result = {
            'search_id': search_id,
            'task': task_keywords,
            'search_time': str(datetime.now() - start_time),
            'pipelines': number_pipelines,
            'best_time': 'None',
            'best_score': 'None',
            'all_scores': []
        }

        if number_pipelines > 0:
            best_time = sorted(pipelines.values(), key=lambda x: x[2])[0][2]
            sorted_pipelines = sorted(pipelines.items(), key=lambda x: x[1][0], reverse=True)
            all_scores = []

            for pipeline_id, (_, pipeline, _) in sorted_pipelines:
                if use_template:
                    # FIXME: Pipeline score is not calculated when working with a fully defined pipeline
                    pipeline_score = 1.0
                else:
                    pipeline_score = decode_value(pipeline[0].scores[0].value)['value']
                all_scores.append({'id': pipeline_id, 'score': pipeline_score})
                #do_score(core, problem, [pipeline_id], dataset_train_path)
                #fitted_pipeline = do_train(core, [pipeline_id], dataset_train_path)
                #do_save_fitted_solution(core, fitted_pipeline)
                #do_test(core, fitted_pipeline, dataset_train_path.replace('TRAIN', 'TEST'))
                #do_export(core, fitted_pipeline)
                #do_describe(core, [pipeline_id])

            result['pipelines'] = number_pipelines
            result['best_time'] = best_time
            result['best_score'] = all_scores[0]['score']
            result['all_scores'] = all_scores

        search_results[dataset] = result
        with open(search_results_path, 'w') as fout:
            json.dump(search_results, fout, indent=4)
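# Hedged usage sketch (not part of the original snippet): the dataset directory names
# are placeholders and assume the standard D3M layout under D3MINPUTDIR.
if __name__ == '__main__':
    search_pipelines(['185_baseball', '38_sick'], time_bound=10, use_template=False)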
def load_problem(root_path, phase):
    path = os.path.join(root_path, phase, 'problem_' + phase, 'problemDoc.json')
    return Problem.load(problem_uri='file://' + os.path.abspath(path))
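# Hedged usage sketch (not from the original source): the root path is a placeholder and
# assumes the usual <root>/TRAIN/problem_TRAIN/problemDoc.json layout.
problem = load_problem('/path/to/185_baseball', 'TRAIN')
print(problem['problem']['task_keywords'])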
# pp.fit_and_produce()
# pp.save()

data_dir = "/Users/muxin/Desktop/ISI/dsbox-env/output/seed/38_sick/"
log_dir = '/Users/muxin/Desktop/studies/master/2018Summer/data/log'
pids = [
    '32b24d72-44c6-4956-bc21-835cb42f0f2e',
    'a8f4001a-64f4-4ff1-a89d-3548f4dfeb88',
    '5e1d9723-ec02-46d2-abdf-46389fba8e52'
]
dataset = container.Dataset.load(
    'file:///Users/muxin/Desktop/ISI/dsbox-env/data/datasets/seed_datasets_current/38_sick/38_sick_dataset/datasetDoc.json'
)
set_target_column(dataset)

problem_doc_path = os.path.abspath(
    "/Users/muxin/Desktop/ISI/dsbox-env/data/datasets/seed_datasets_current/38_sick/38_sick_problem/problemDoc.json"
)
problem = Problem.load('file://' + problem_doc_path)
with open(problem_doc_path) as file:
    problem_doc = json.load(file)

qq = HorizontalTuningPipeline(pipeline_files_dir=data_dir,
                              pids=None,
                              problem=problem,
                              train_dataset=dataset,
                              test_dataset=dataset)
qq.generate_candidate_pids()
print(qq.pids)
qq.generate_ensemble_pipeline()
qq.fit_and_produce()
print(qq.fitted_pipeline.get_produce_step_output(0))
qq.save()
def _load_problem(self):
    if self.problem_schema == '':
        return
    self.problem = Problem.load('file://' + os.path.abspath(self.problem_schema))
    self._load_problem_rest()
def load_problem(root_path, phase):
    path = os.path.join(root_path, phase, 'problem_' + phase, 'problemDoc.json')
    return Problem.load(problem_uri=path)