def _split_row(self, input_dataset):
    """Sample a random subset of the rows of the dataset's main resource.

    Adapted from d3m's common-primitives. The kept row indices are also
    stored on ``self._row_remained`` (sorted, keyed by resource id) for
    later reuse.

    Parameters
    ----------
    input_dataset : Dataset
        Dataset whose main resource (``self._main_resource_id``) is sampled.

    Returns
    -------
    Dataset
        The sampled Dataset.
    """
    row_length = input_dataset[self._main_resource_id].shape[0]
    # NOTE(review): row index 0 is excluded from the sampling population —
    # presumably deliberate, but confirm the first row must never be sampled.
    all_indexes_list = range(1, row_length)
    # Guard against requesting more rows than exist: random.sample raises
    # ValueError when the sample size exceeds the population size.
    sample_size = min(self._threshold_row_length, len(all_indexes_list))
    sample_indices = random.sample(all_indexes_list, sample_size)

    # We store rows as sets, but later on we sort them when we cut DataFrames.
    row_indices_to_keep_sets: typing.Dict[
        str, typing.Set[int]] = collections.defaultdict(set)
    row_indices_to_keep_sets[self._main_resource_id] = set(sample_indices)

    # We sort indices to get deterministic outputs from sets (which do not
    # have deterministic order).
    self._row_remained = {
        resource_id: sorted(indices)
        for resource_id, indices in row_indices_to_keep_sets.items()
    }
    output_dataset = Dataset.select_rows(input_dataset, self._row_remained)

    return output_dataset
def _get_sample_uri(self, dataset_uri, problem):
    """Produce a sampled copy of the dataset and return its URI.

    Returns None whenever sampling does not apply: object-detection or
    forecasting tasks, collection datasets, or when ``get_dataset_sample``
    did not actually sample (it returned a Dataset instead of a URI string).
    """
    logger.info('About to sample dataset %s', dataset_uri)
    task_keywords = problem['problem']['task_keywords']

    if any(tk in [TaskKeyword.OBJECT_DETECTION, TaskKeyword.FORECASTING] for tk in task_keywords):
        logger.info('Not doing sampling for task %s', '_'.join([x.name for x in task_keywords]))
        return None

    # PERF FIX: check for collections BEFORE paying the cost of Dataset.load —
    # is_collection only needs the path ([7:] strips the 'file://' prefix).
    if is_collection(dataset_uri[7:]):
        logger.info('Not doing sampling for collections')
        return None

    dataset = Dataset.load(dataset_uri)
    dataset_sample_folder = 'file://%s/temp/dataset_sample/' % os.environ.get('D3MOUTPUTDIR')
    dataset_sample_uri = None

    # Start from a clean sample folder ([6:] strips 'file:/', leaving the path).
    if os.path.exists(dataset_sample_folder[6:]):
        shutil.rmtree(dataset_sample_folder[6:])

    dataset_sample = get_dataset_sample(dataset, problem, dataset_sample_folder)

    if isinstance(dataset_sample, str):  # Was the dataset sampled?
        dataset_sample_uri = dataset_sample

    return dataset_sample_uri
def _find_timeseries_metadata( cls, dataset: container.Dataset ) -> typing.Optional[metadata_base.DataMetadata]: # loop over the dataset to find the resource that contains the timeseries file col info for resource_id, resource in dataset.items(): metadata = dataset.metadata.query((resource_id, "ALL_ELEMENTS", 0)) if "file_columns" in metadata: return metadata return None
def execute(pipeline_id, dataset, problem, results_path, msg_queue, db):
    """Load a pipeline from the database and fit it on ``dataset`` inside a
    subprocess with a 3-minute hard limit, optionally saving predictions.

    Parameters
    ----------
    pipeline_id : id of the database.Pipeline row to execute.
    dataset : dataset URI; replaced by the loaded Dataset object below.
    problem : D3M problem description.
    results_path : CSV path for the 'outputs.0' predictions, or None.

    Returns
    -------
    The fit result values mapping.

    Raises
    ------
    TimeoutError if the worker exceeds 180 seconds; RuntimeError if the
    worker died without reporting results.
    """
    # Get pipeline from database
    pipeline = (
        db.query(database.Pipeline)
        .filter(database.Pipeline.id == pipeline_id)
        .options(joinedload(database.Pipeline.modules),
                 joinedload(database.Pipeline.connections))
    ).one()

    logger.info('About to execute pipeline, id=%s, dataset=%r', pipeline_id, dataset)

    # Load data
    dataset = Dataset.load(dataset)
    logger.info('Loaded dataset')

    json_pipeline = convert.to_d3m_json(pipeline)
    logger.info(
        'Pipeline to be executed:\n%s',
        '\n'.join([x['primitive']['python_path'] for x in json_pipeline['steps']]))

    d3m_pipeline = d3m.metadata.pipeline.Pipeline.from_json_structure(
        json_pipeline,
    )
    runtime = d3m.runtime.Runtime(pipeline=d3m_pipeline,
                                  problem_description=problem,
                                  context=metadata_base.Context.TESTING)

    manager = Manager()
    return_dict = manager.dict()
    p = Process(target=worker, args=(runtime, dataset, return_dict))
    p.start()
    p.join(180)  # Maximum 3 minutes

    # BUG FIX: Process.join(timeout) returns even when the worker is still
    # running; the original code then crashed with an unrelated KeyError on
    # return_dict and leaked the subprocess. Terminate and fail explicitly.
    if p.is_alive():
        p.terminate()
        p.join()
        raise TimeoutError(
            'Pipeline execution timed out after 180 seconds, id=%s' % pipeline_id)
    if 'fit_results' not in return_dict:
        raise RuntimeError(
            'Pipeline worker exited without producing results, id=%s' % pipeline_id)

    fit_results = return_dict['fit_results']
    fit_results.check_success()

    if results_path is not None:
        logger.info('Storing fit results at %s', results_path)
        fit_results.values['outputs.0'].to_csv(results_path)
    else:
        logger.info('NOT storing fit results')

    return fit_results.values
def test(pipeline_id, dataset, storage_dir, steps_to_expose, msg_queue, db):
    """Load a fitted pipeline from disk, run produce on ``dataset``, and save
    each exposed DataFrame step as a CSV under ``storage_dir``."""
    dataset = Dataset.load(dataset)
    logger.info('Loaded dataset')

    # Dead `runtime = None` initializer removed: the name is always bound
    # inside the `with` block before use.
    with open(
            os.path.join(storage_dir, 'fitted_solution_%s.pkl' % pipeline_id),
            'rb') as fin:
        runtime = pickle.load(fin)

    results = runtime.produce(inputs=[dataset], outputs_to_expose=steps_to_expose)
    results.check_success()

    logger.info('Storing produce results at %s', storage_dir)
    for step_id in results.values:
        if step_id in steps_to_expose and isinstance(results.values[step_id], DataFrame):
            # CONSISTENCY: use os.path.join throughout (the original mixed
            # os.path.join with a bare join in the same function).
            results.values[step_id].to_csv(
                os.path.join(storage_dir, 'produce_%s_%s.csv' % (pipeline_id, step_id)))
def train(pipeline_id, dataset, problem, storage_dir, steps_to_expose, msg_queue, db):
    """Fit a database-stored pipeline on ``dataset`` and persist the result.

    Exposed intermediate steps are written as ``fit_<pipeline>_<step>.csv``
    and the fitted runtime is pickled to ``fitted_solution_<pipeline>.pkl``
    under ``storage_dir``.
    """
    # Get pipeline from database
    pipeline = (
        db.query(database.Pipeline)
        .filter(database.Pipeline.id == pipeline_id)
        .options(joinedload(database.Pipeline.modules),
                 joinedload(database.Pipeline.connections))
    ).one()

    logger.info('About to train pipeline, id=%s, dataset=%r', pipeline_id, dataset)

    # Load data
    dataset = Dataset.load(dataset)
    logger.info('Loaded dataset')

    # Training step - fit pipeline on training data
    logger.info('Running training')
    d3m_pipeline = d3m.metadata.pipeline.Pipeline.from_json_structure(
        convert.to_d3m_json(pipeline),
    )
    # IDIOM FIX: `True if len(x) > 0 else False` is just truthiness.
    expose_outputs = bool(steps_to_expose)
    fitted_pipeline, predictions, results = d3m.runtime.fit(
        d3m_pipeline, [dataset],
        problem_description=problem,
        context=metadata_base.Context.TESTING,
        volumes_dir=os.environ.get('D3MSTATICDIR', None),
        random_seed=0,
        expose_produced_outputs=expose_outputs)
    results.check_success()

    logger.info('Storing fit results at %s', storage_dir)
    for step_id in results.values:
        if step_id in steps_to_expose and isinstance(results.values[step_id], DataFrame):
            results.values[step_id].to_csv(
                join(storage_dir, 'fit_%s_%s.csv' % (pipeline_id, step_id)))

    with open(join(storage_dir, 'fitted_solution_%s.pkl' % pipeline_id), 'wb') as fout:
        pickle.dump(fitted_pipeline, fout)
def sample_rows(
    dataset: container.Dataset,
    main_resource_id: str,
    main_resource_indices_to_keep: typing.Set[int],
    relations_graph: typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]],
    *,
    delete_recursive: bool = False,
) -> container.Dataset:
    """Select the given rows of the main resource from ``dataset``.

    When ``delete_recursive`` is True, BFS over ``relations_graph`` also
    restricts every related resource to the rows referenced (directly or
    transitively) by the kept main-resource rows; otherwise other resources
    are kept whole.

    Returns the Dataset produced by ``dataset.select_rows``.
    """
    # We store rows as sets, but later on we sort them when we select rows.
    row_indices_to_keep_sets: typing.Dict[
        str, typing.Set[int]] = collections.defaultdict(set)
    row_indices_to_keep_sets[main_resource_id] = main_resource_indices_to_keep

    # If "delete_recursive" is set to "False", we do not populate "row_indices_to_keep_sets"
    # with other resources, making "select_rows" simply keep them.
    if delete_recursive:
        # We sort to be deterministic.
        for main_resource_row_index in sorted(
                row_indices_to_keep_sets[main_resource_id]):
            # PERF FIX: list.pop(0) is O(n) per pop; deque.popleft() is the
            # O(1) FIFO equivalent, preserving the exact traversal order.
            queue: collections.deque = collections.deque()
            queue.append((main_resource_id, [main_resource_row_index]))
            while queue:
                current_resource_id, current_row_indices = queue.popleft()
                current_resource = dataset[current_resource_id]

                for edge_resource_id, edge_direction, edge_from_index, edge_to_index, custom_state in relations_graph[
                        current_resource_id]:
                    # All rows from the main resource we want are already there.
                    # TODO: What to do if we get a reference to the row in the main resource which is not part of this sample?
                    #       This means that probably the sample is invalid. We should not be generating such samples which do not
                    #       preserve reference loops and their consistency. Otherwise it is not really possible to denormalize
                    #       such Dataset properly: a reference is referencing a row in the main resource which does not exist.
                    if edge_resource_id == main_resource_id:
                        continue

                    edge_resource = dataset[edge_resource_id]
                    to_column_values = edge_resource.iloc[:, edge_to_index]
                    for from_column_value in current_resource.iloc[
                            current_row_indices, edge_from_index]:
                        # We assume here that "index" corresponds to the default index with row indices.
                        rows_with_value = edge_resource.index[
                            to_column_values == from_column_value]
                        # We sort to be deterministic.
                        new_rows_list = sorted(
                            set(rows_with_value) -
                            row_indices_to_keep_sets[edge_resource_id])
                        row_indices_to_keep_sets[edge_resource_id].update(new_rows_list)
                        queue.append((edge_resource_id, new_rows_list))

    # We sort indices to get deterministic outputs from sets (which do not have deterministic order).
    # We also do not want to change the row order but keep the original row order.
    # Sorting by row indices values assure that.
    row_indices_to_keep = {
        resource_id: sorted(indices)
        for resource_id, indices in row_indices_to_keep_sets.items()
    }

    return dataset.select_rows(row_indices_to_keep)
def get_tabular_resource(
    dataset: container.Dataset,
    resource_id: typing.Optional[str],
    *,
    pick_entry_point: bool = True,
    pick_one: bool = True,
    has_hyperparameter: bool = True,
) -> typing.Tuple[str, container.DataFrame]:
    """Resolve which tabular resource of ``dataset`` to use and return it.

    When ``resource_id`` is None, first tries the resource tagged as the
    dataset entry point (if ``pick_entry_point``), then falls back to a
    unique DataFrame resource (if ``pick_one``). Raises ValueError when no
    resource can be resolved, TypeError when the resolved resource is not a
    DataFrame. Returns ``(resource_id, resource)``.
    """
    entry_point_type = 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'

    # 1) Prefer the resource tagged as the dataset entry point.
    if resource_id is None and pick_entry_point:
        for candidate_id in dataset.keys():
            if dataset.metadata.has_semantic_type((candidate_id,), entry_point_type):
                resource_id = candidate_id
                break

    # 2) Otherwise accept a DataFrame resource, but only if it is unique.
    if resource_id is None and pick_one:
        frame_ids = [
            candidate_id
            for candidate_id, candidate in dataset.items()
            if isinstance(candidate, container.DataFrame)
        ]
        if len(frame_ids) == 1:
            resource_id = frame_ids[0]

    # 3) Still unresolved: report exactly which fallbacks were tried.
    if resource_id is None:
        if has_hyperparameter:
            if pick_entry_point and pick_one:
                message = "A Dataset with multiple tabular resources without an entry point and no resource specified as a hyper-parameter."
            elif pick_entry_point:
                message = "A Dataset without an entry point and no resource specified as a hyper-parameter."
            elif pick_one:
                message = "A Dataset with multiple tabular resources and no resource specified as a hyper-parameter."
            else:
                message = "No resource specified as a hyper-parameter."
        else:
            if pick_entry_point and pick_one:
                message = "A Dataset with multiple tabular resources without an entry point."
            elif pick_entry_point:
                message = "A Dataset without an entry point."
            elif pick_one:
                message = "A Dataset with multiple tabular resources."
            else:
                message = "No resource specified."
        raise ValueError(message)

    resource = dataset[resource_id]
    if not isinstance(resource, container.DataFrame):
        raise TypeError(
            "The Dataset resource '{resource_id}' is not a DataFrame, but '{type}'."
            .format(
                resource_id=resource_id,
                type=type(resource),
            ))

    return resource_id, resource
def tune(pipeline_id, metrics, problem, dataset_uri, sample_dataset_uri, report_rank, timeout_tuning, timeout_run, msg_queue, db):
    """Hyperparameter-tune a stored pipeline and register the tuned copy.

    Loads the pipeline from the database, runs a wallclock-bounded
    hyperparameter search over its tunable primitives, duplicates the
    pipeline with the best configuration, scores the duplicate, and sends
    the new pipeline id back over ``msg_queue``.
    Exits the process (sys.exit(1)) when the pipeline has no tunable primitive.
    """
    timeout_tuning = timeout_tuning * 0.9  # FIXME: Save 10% of timeout to score the best config
    # Load pipeline from database
    pipeline = (
        db.query(database.Pipeline)
        .filter(database.Pipeline.id == pipeline_id)
        .options(joinedload(database.Pipeline.modules),
                 joinedload(database.Pipeline.connections))
    ).one()

    logger.info('About to tune pipeline, id=%s, dataset=%r, timeout=%d secs',
                pipeline_id, dataset_uri, timeout_tuning)

    # Collect the primitives that expose tunable hyperparameters.
    tunable_primitives = {}
    for primitive in pipeline.modules:
        if is_tunable(primitive.name):
            tunable_primitives[primitive.id] = primitive.name

    if len(tunable_primitives) == 0:
        logger.info('No primitives to be tuned for pipeline %s', pipeline_id)
        sys.exit(1)  # nothing to tune: abort this worker process

    logger.info('Tuning primitives: %s', ', '.join(tunable_primitives.values()))

    # Tune on the sampled dataset when one exists (faster evaluations).
    if sample_dataset_uri:
        dataset = Dataset.load(sample_dataset_uri)
    else:
        dataset = Dataset.load(dataset_uri)

    task_keywords = problem['problem']['task_keywords']
    # 2-fold CV; stratify only for classification tasks.
    scoring_config = {'shuffle': 'true',
                      'stratified': 'true' if TaskKeyword.CLASSIFICATION in task_keywords else 'false',
                      'method': 'K_FOLD',
                      'number_of_folds': '2'}

    metrics_to_use = deepcopy(metrics)
    # Semi-supervised F1 is scored internally as F1_MACRO.
    if metrics[0]['metric'] == PerformanceMetric.F1 and TaskKeyword.SEMISUPERVISED in problem['problem']['task_keywords']:
        metrics_to_use = [{'metric': PerformanceMetric.F1_MACRO}]

    def evaluate_tune(hyperparameter_configuration):
        # Objective for the tuner: returns a cost in [0, 1] (lower is better),
        # computed from the normalized average score of the first metric.
        new_hyperparams = []
        for primitive_id, primitive_name in tunable_primitives.items():
            hy = get_new_hyperparameters(primitive_name, hyperparameter_configuration)
            db_hyperparams = database.PipelineParameter(
                pipeline=pipeline,
                module_id=primitive_id,
                name='hyperparams',
                value=pickle.dumps(hy),
            )
            new_hyperparams.append(db_hyperparams)

        # NOTE(review): parameters are appended on every evaluation, so they
        # accumulate on `pipeline` across tuning iterations — presumably the
        # later entries win when the pipeline is materialized; confirm.
        pipeline.parameters += new_hyperparams
        scores = evaluate(pipeline, kfold_tabular_split, dataset, metrics_to_use,
                          problem, scoring_config, dataset_uri, timeout_run)

        # Average the first metric across folds.
        first_metric = metrics_to_use[0]['metric'].name
        score_values = []
        for fold_scores in scores.values():
            for metric, score_value in fold_scores.items():
                if metric == first_metric:
                    score_values.append(score_value)
        avg_score = sum(score_values) / len(score_values)
        cost = 1.0 - metrics_to_use[0]['metric'].normalize(avg_score)
        logger.info('Tuning results:\n%s, cost=%s', scores, cost)

        return cost

    # Run tuning, gets best configuration
    tuning = HyperparameterTuning(tunable_primitives.values())
    create_outputfolders(join(os.environ.get('D3MOUTPUTDIR'), 'temp', 'tuning'))
    best_configuration = tuning.tune(
        evaluate_tune,
        wallclock=timeout_tuning,
        output_dir=join(os.environ.get('D3MOUTPUTDIR'), 'temp', 'tuning', str(pipeline_id)))

    # Duplicate pipeline in database
    new_pipeline = database.duplicate_pipeline(
        db, pipeline, 'HyperparameterTuning from pipeline %s' % pipeline_id)

    # Write the best hyperparameters onto the duplicated pipeline, merging
    # into any existing 'hyperparams' parameter row per primitive.
    for primitive in new_pipeline.modules:
        if is_tunable(primitive.name):
            best_hyperparameters = get_new_hyperparameters(primitive.name, best_configuration)
            query = db.query(database.PipelineParameter).filter(database.PipelineParameter.module_id == primitive.id)\
                .filter(database.PipelineParameter.pipeline_id == new_pipeline.id)\
                .filter(database.PipelineParameter.name == 'hyperparams')
            if query.first():
                original_parameters = pickle.loads(query.first().value)
                original_parameters.update(best_hyperparameters)
                query.update({database.PipelineParameter.value: pickle.dumps(original_parameters)})
            else:
                db.add(database.PipelineParameter(
                    pipeline=new_pipeline,
                    module_id=primitive.id,
                    name='hyperparams',
                    value=pickle.dumps(best_hyperparameters),
                ))
    db.commit()

    logger.info('Tuning done, generated new pipeline %s', new_pipeline.id)
    # Clean up the tuner's scratch directory.
    shutil.rmtree(join(os.environ.get('D3MOUTPUTDIR'), 'temp', 'tuning', str(pipeline_id)))

    # Score the tuned pipeline (uses the reserved 10% of the timeout).
    score(new_pipeline.id, dataset_uri, sample_dataset_uri, metrics, problem,
          scoring_config, timeout_run, report_rank, None,
          db_filename=join(os.environ.get('D3MOUTPUTDIR'), 'temp', 'db.sqlite3'))  # TODO: Change this static string path
    msg_queue.send(('tuned_pipeline_id', new_pipeline.id))
def score(pipeline_id, dataset_uri, sample_dataset_uri, metrics, problem,
          scoring_config, timeout_run, report_rank, msg_queue, db):
    """Evaluate a database-stored pipeline on a dataset and persist scores.

    Picks a split strategy from the task keywords and ``scoring_config``,
    runs ``evaluate``, optionally adds a RANK metric (TA2-only evaluation),
    and stores all fold scores in a ``database.CrossValidation`` record.
    """
    # Prefer the (smaller) sampled dataset when one is available.
    dataset_uri_touse = dataset_uri
    if sample_dataset_uri:
        dataset_uri_touse = sample_dataset_uri
    if TaskKeyword.FORECASTING in problem['problem']['task_keywords']:
        # [7:] presumably strips the 'file://' scheme prefix — TODO confirm.
        check_timeindicator(dataset_uri_touse[7:])

    dataset = Dataset.load(dataset_uri_touse)
    # Get pipeline from database
    pipeline = (db.query(
        database.Pipeline).filter(database.Pipeline.id == pipeline_id).options(
            joinedload(database.Pipeline.modules),
            joinedload(database.Pipeline.connections))).one()

    logger.info('About to score pipeline, id=%s, metrics=%s, dataset=%r',
                pipeline_id, metrics, dataset_uri)

    scores = {}
    scores_db = []
    pipeline_split = None

    # Choose the evaluation split strategy; forecasting overrides the
    # configured method because it needs a time-aware split.
    if TaskKeyword.FORECASTING in problem['problem']['task_keywords']:
        pipeline_split = kfold_timeseries_split
    elif scoring_config['method'] == 'K_FOLD':
        pipeline_split = kfold_tabular_split
    elif scoring_config['method'] == 'HOLDOUT':
        pipeline_split = train_test_tabular_split
    elif scoring_config['method'] == 'RANKING':  # For TA2 only evaluation
        scoring_config['number_of_folds'] = '4'
        report_rank = True
        pipeline_split = kfold_tabular_split
    else:
        logger.warning('Unknown evaluation method, using K_FOLD')
        pipeline_split = kfold_tabular_split

    if metrics[0][
            'metric'] == PerformanceMetric.F1 and TaskKeyword.SEMISUPERVISED in problem[
                'problem']['task_keywords']:
        # Semi-supervised F1: score with F1_MACRO internally, then rename the
        # metric back to what the caller requested.
        new_metrics = [{'metric': PerformanceMetric.F1_MACRO}]
        scores = evaluate(pipeline, kfold_tabular_split, dataset, new_metrics,
                          problem, scoring_config, dataset_uri, timeout_run)
        scores = change_name_metric(scores, new_metrics,
                                    new_metric=metrics[0]['metric'].name)
    else:
        scores = evaluate(pipeline, pipeline_split, dataset, metrics, problem,
                          scoring_config, dataset_uri, timeout_run)

    logger.info("Evaluation results:\n%s", scores)

    if len(scores) > 0:  # It's a valid pipeline
        scores_db = add_scores_db(scores, scores_db)
        if report_rank:  # For TA2 only evaluation
            scores = create_rank_metric(scores, metrics)
            scores_db = add_scores_db(scores, scores_db)
            logger.info("Evaluation results for RANK metric: \n%s", scores)

    # TODO Should we rename CrossValidation table?
    record_db = database.CrossValidation(pipeline_id=pipeline_id, scores=scores_db)  # Store scores
    db.add(record_db)
    db.commit()
def __init__(self, dataset, targets=None, features=None, DBSession=None):
    """Remember the dataset URI, eagerly load the Dataset, and keep the
    target/feature selections plus the database session factory."""
    self.dataset_uri = dataset
    self.dataset = Dataset.load(dataset)
    self.targets = targets
    self.features = features
    self.DBSession = DBSession