# Imports this experiment runner relies on. DataLoader, train, SAMPLE_SETS and
# SAVE_DIR are assumed to come from the project's own modules; their exact
# import paths are not shown in this section.
import itertools
import os
from datetime import datetime

import pandas as pd


def main(experiment_type):
    hyper_parameters = {
        'n_epochs': [300],
        # Discarding zeros works better than keeping them; still to test:
        # discarding values below a threshold x.
        'discard_below_or_equal_to_value': [0],
        'correct_rainfall': [True],
    }
    assert experiment_type in (
        'learning_rate', 'base_test', 'compare_pos_weight', 'compare_rainfall',
        'export_normal', 'batch_size'
    ) or experiment_type.startswith(('exclude-', 'evaluate-'))

    VERBOSE = False
    if experiment_type in ('compare_pos_weight', 'compare_rainfall',
                           'learning_rate', 'batch_size', 'export_normal') \
            or experiment_type.startswith(('evaluate-', 'exclude-')):
        ITERATIONS = 10
        N_FOLDS = 5
    else:
        # Note: 'base_test' passes the assert above but is not handled here,
        # so it currently raises.
        raise ValueError(f'iterations and n_folds not defined for {experiment_type}')

    if experiment_type == 'compare_pos_weight':
        POS_WEIGHTS = [0.50, 1.00, 2.00, 3.00, 5.00]
    else:
        POS_WEIGHTS = [2.00]

    if experiment_type == 'compare_rainfall':
        hyper_parameters['rainfall_dataset'] = ['GSMaP', 'PERSIANN']
    else:
        hyper_parameters['rainfall_dataset'] = ['GSMaP']

    if experiment_type.startswith(('exclude-', 'evaluate-')):
        language = experiment_type[-2:]
        if experiment_type.startswith('exclude-'):
            hyper_parameters['split'] = [f'exclude-{language}']
        else:
            hyper_parameters['split'] = [f'evaluate-{language}']
    else:
        hyper_parameters['split'] = ['random']

    if experiment_type == 'learning_rate':
        hyper_parameters['learning_rate'] = [0.0005, 0.0001, 0.00005]
    else:
        hyper_parameters['learning_rate'] = [0.0005]

    if experiment_type == 'batch_size':
        hyper_parameters['batch_size'] = [32, 64, 128, 256, 512]
    else:
        hyper_parameters['batch_size'] = [128]

    save_model = experiment_type == 'export_normal'

    # All hyperparameter values must be lists so itertools.product can expand them.
    assert all(isinstance(value, list) for value in hyper_parameters.values())

    output_folder = 'results'
    os.makedirs(output_folder, exist_ok=True)
    output_fn = os.path.join(output_folder, f'{experiment_type}.xlsx')
    if os.path.exists(output_fn):
        print('already finished')
        return None

    df_columns = [
        'iteration_n', 'fold_n', 'n_folds', '% pos val', 'test_model',
        'use_hydrology', 'pos_weight'
    ] + list(hyper_parameters.keys()) + ['precision', 'recall', 'loss']
    output_df = pd.DataFrame(columns=df_columns)

    keys, values = zip(*hyper_parameters.items())
    experiments = list(itertools.product(*values))
    n_experiments = len(experiments)
    for experiment_n, v in enumerate(experiments, start=1):
        settings = dict(zip(keys, v))
        t0 = datetime.now()
        fps = [
            os.path.join(
                'data', 'input',
                f"data_{sample_set}"
                f"_correct_rainfall_{settings['correct_rainfall']}"
                f"_discard_below_or_equal_to_value_{settings['discard_below_or_equal_to_value']}"
                f"_{settings['rainfall_dataset']}.pickle")
            for sample_set in SAMPLE_SETS
        ]
        data_loader = DataLoader(fps, includes_context=True, includes_labels=True)
        data_loader.set_data(split=settings['split'], n_folds=N_FOLDS,
                             iterations=ITERATIONS)
        experiment_iteration = 0
        for percent_positive_validation, run_data, iteration_n, fold_n in data_loader.data:
            experiment_iteration += 1
            for test_model in (False,):
                for pos_weight in POS_WEIGHTS:
                    for use_hydrology in (True, False):
                        run_name = (f'{pos_weight}_{use_hydrology}_{test_model}'
                                    f'_{iteration_n}_{fold_n}')
                        if save_model:
                            save_model_path = os.path.join(
                                SAVE_DIR, f'best_model_{run_name}.ckpt')
                        else:
                            save_model_path = None
                        # The hydrological context features are the last two
                        # items of run_data; drop them when context is unused.
                        if use_hydrology:
                            run_data_sel = run_data
                        else:
                            run_data_sel = run_data[:-2]
                        best_val_score, best_val_loss = train(
                            run_data_sel,
                            run_name=run_name,
                            log=False,
                            verbose=VERBOSE,
                            pos_weight=pos_weight,
                            n_epochs=settings['n_epochs'],
                            learning_rate=settings['learning_rate'],
                            batch_size=settings['batch_size'],
                            test_model=test_model,
                            use_context=use_hydrology,
                            context_labels=data_loader.context_labels,
                            save_model_path=save_model_path)
                        row = pd.Series(
                            [iteration_n, fold_n, N_FOLDS,
                             percent_positive_validation, test_model,
                             use_hydrology, pos_weight]
                            + list(settings.values())
                            + [best_val_score.loc['precision', 'flood'],
                               best_val_score.loc['recall', 'flood'],
                               best_val_loss],
                            index=df_columns)
                        # DataFrame.append was removed in pandas 2.0; build the
                        # row as a one-row frame and concatenate instead.
                        output_df = pd.concat([output_df, row.to_frame().T],
                                              ignore_index=True)
                        print(output_df)
                        print(f'Experiment {experiment_n}/{n_experiments} finished '
                              f'{experiment_iteration}/{len(data_loader)} iterations',
                              end='\r')
        t1 = datetime.now()
        print(f'Experiment {experiment_n}/{n_experiments} finished '
              f'{len(data_loader)}/{len(data_loader)} iterations in {t1 - t0}')
        # Write results after every experiment; retry if the file is locked
        # because it is open in Excel.
        while True:
            try:
                output_df.to_excel(output_fn, index=False)
                break
            except PermissionError:
                print()
                input(f'Please close {output_fn} and press ENTER')
                print('OK')
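
# Hedged usage sketch (an addition, not part of the original file): a minimal
# command-line entry point for main(). The module name run_experiments.py and
# the single-argument interface are assumptions for illustration only; the
# example experiment types come from the assert at the top of main().
if __name__ == '__main__':
    import sys

    if len(sys.argv) != 2:
        sys.exit('usage: python run_experiments.py <experiment_type> '
                 '(e.g. compare_pos_weight, batch_size, exclude-en)')
    main(sys.argv[1])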
# Imports this pipeline relies on. pipeline_config, logger, CSVLoader,
# DataLoader, SpanEvaluator, ClassificationEvaluator and read_labels_from_file
# are assumed to come from the project's own modules; their exact import paths
# are not shown in this section.
import os
import sys


class Pipeline(object):

    def __init__(self):
        self.config = pipeline_config.config

    def get_data_loader(self):
        data_phase = self.pipeline_config['data']
        data_module = __import__('load_data')
        if 'csv' in data_phase:
            # For reading 'manually' filtered data from a saved dataframe.
            csv_file = self.config.get_output_data_dir() + data_phase['csv']
            self.dataloader = CSVLoader()
            self.dataloader.set_csv_file(csv_file)
        else:
            data_func = getattr(data_module, data_phase['dataloader'])
            data = data_func(data_phase['file'])
            self.dataloader = DataLoader(label_identifier=data_phase['labels'])
            self.dataloader.set_data(data)
        if 'filters' in data_phase:
            # Every filter name maps to a function 'filter_<name>' in load_data
            # that takes a data loader and returns it filtered.
            for filter_function_name in data_phase['filters']:
                filter_func = getattr(data_module,
                                      'filter_' + filter_function_name)
                self.dataloader = filter_func(self.dataloader)
        return self.dataloader

    def execute(self):

        def capitalize(s):
            return s[0].upper() + s[1:]

        print(f"******* starting pipeline "
              f"{self.config['pipeline']['cur_scenario']} ******")
        for current_step in self.config.get_scenario():
            print(f' =================== Current step: {current_step} '
                  f'=============================')
            self.config.set_current_pipeline_step(current_step)
            self.pipeline_config = self.config.pipeline_config

            logger.info('******** LOADING DATA *******')
            self.get_data_loader()

            # Training phase. The converter class is resolved even when
            # training is disabled, because the evaluation phase reuses it.
            logger.info('******* TRAINING *******')
            logger.debug('task type: %s', self.config.get_task_type())
            train_phase = self.pipeline_config['train']
            train_module = __import__('train')
            converter_module = __import__('converter')
            converter_class = getattr(
                converter_module,
                capitalize(train_phase['converter']) + 'Converter')
            if train_phase.get('disable') is not True:
                if self.config.get_task_type() == 'sequence':
                    trainer_class_name = 'SpanTrainer'
                else:
                    trainer_class_name = 'ClassificationTrainer'
                trainer_class = getattr(train_module, trainer_class_name)
                trainer = trainer_class(self.dataloader, converter_class())
                trainer.train()
            else:
                logger.info('>>> training disabled')

            # Evaluation phase
            logger.info('******* EVALUATION *******')
            if self.pipeline_config['eval'].get('disable') is not True:
                if self.config.get_task_type() == 'sequence':
                    evaluator = SpanEvaluator(self.dataloader)
                else:
                    evaluator = ClassificationEvaluator(self.dataloader)
                evaluator.evaluate(converter_class())

                # Create a score file for evaluation.
                if self.config.get_task_type() != 'sequence':
                    from score import score_task1
                    pred_file = self.config.get_output_data_dir() + 'predict1.txt'
                    gold_file = self.config.get_output_data_dir() + 'true1.txt'
                    score_task1(predict_file=pred_file, true_file=gold_file)

                    # Run the SemEval scorer from the task's corpus repository.
                    sys.path.append(os.path.realpath('SEMEVAL-2021-task6-corpus'))
                    from scorer.task1_3 import evaluate, validate_files
                    CLASSES = read_labels_from_file(
                        self.pipeline_config['data']['labels'])
                    if validate_files(pred_file, gold_file, CLASSES):
                        logger.info('Prediction file format is correct')
                        macro_f1, micro_f1 = evaluate(pred_file, gold_file,
                                                      CLASSES)
                        logger.info('macro-F1={:.5f}\tmicro-F1={:.5f}'.format(
                            macro_f1, micro_f1))
                    else:
                        print('Failed to validate prediction & gold files')
                else:
                    print('No scoring for sequence type')
            else:
                print('Evaluation is disabled')

            # Post-processing phase
            postprocess_phase = self.pipeline_config.get('postprocess')
            if postprocess_phase is not None and \
                    postprocess_phase.get('disable') is not True:
                logger.info('******* POST-PROCESSING *******')
                postprocess_module = __import__('postprocess')
                postprocess_class = getattr(
                    postprocess_module, postprocess_phase['processor']['class'])
                postprocessor = postprocess_class(
                    self.config.get_model_type(),
                    self.config.get_output_dir(),
                    # The 'processor' mapping supplies the remaining keyword
                    # arguments to the post-processor.
                    **postprocess_phase['processor'])
                postprocessor.execute()
            else:
                print('No post-processing defined')
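
# Hedged illustration (inferred from the accesses above, not a documented
# schema): the configuration shape that Pipeline appears to expect.
# config['pipeline']['cur_scenario']    -> name of the scenario being run
# config.get_scenario()                 -> iterable of pipeline step names
# config.pipeline_config['data']        -> {'dataloader' or 'csv', 'file',
#                                           'labels', optional 'filters': [...]}
# config.pipeline_config['train']       -> {'converter': ..., optional 'disable': bool}
# config.pipeline_config['eval']        -> {optional 'disable': bool}
# config.pipeline_config['postprocess'] -> {'processor': {'class': ...},
#                                           optional 'disable': bool}

# Minimal usage sketch, assuming pipeline_config.config is importable and
# fully populated before the pipeline is constructed:
if __name__ == '__main__':
    Pipeline().execute()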