Example #1
def main(experiment_type):
    hyper_parameters = {
        'n_epochs': [300],
        'discard_below_or_equal_to_value': [
            0
        ],  # Discarding zeros works better than keeping them. Still to test: discarding values below x.
        'correct_rainfall': [True]
    }

    assert experiment_type in (
        'learning_rate', 'base_test', 'compare_pos_weight', 'compare_rainfall',
        'export_normal', 'batch_size') or experiment_type.startswith(
            'exclude-') or experiment_type.startswith('evaluate-')

    VERBOSE = False

    if experiment_type in ('compare_pos_weight', 'compare_rainfall',
                           'learning_rate', 'batch_size', 'export_normal'):
        ITERATIONS = 10
        N_FOLDS = 5
    elif experiment_type.startswith('evaluate-') or experiment_type.startswith(
            'exclude-'):
        ITERATIONS = 10
        N_FOLDS = 5
    else:
        raise ValueError(
            f'iterations and n_folds not defined for {experiment_type}')

    if experiment_type == 'compare_pos_weight':
        POS_WEIGHTS = [0.50, 1.00, 2.00, 3.00, 5.00]
    else:
        POS_WEIGHTS = [2.00]

    if experiment_type == 'compare_rainfall':
        hyper_parameters['rainfall_dataset'] = ['GSMaP', 'PERSIANN']
    else:
        hyper_parameters['rainfall_dataset'] = ['GSMaP']

    if experiment_type.startswith('exclude-') or experiment_type.startswith(
            'evaluate-'):
        language = experiment_type[-2:]
        if experiment_type.startswith('exclude-'):
            hyper_parameters['split'] = [f'exclude-{language}']
        else:
            hyper_parameters['split'] = [f'evaluate-{language}']
    else:
        hyper_parameters['split'] = ['random']

    if experiment_type == 'learning_rate':
        hyper_parameters['learning_rate'] = [0.0005, 0.0001, 0.00005]
    else:
        hyper_parameters['learning_rate'] = [0.0005]

    if experiment_type == 'batch_size':
        hyper_parameters['batch_size'] = [32, 64, 128, 256, 512]
    else:
        hyper_parameters['batch_size'] = [128]

    if experiment_type == 'export_normal':
        save_model = True
    else:
        save_model = False

    # Assert that every hyperparameter value is given as a list for the grid expansion below.
    assert all(isinstance(value, list) for value in hyper_parameters.values())

    output_folder = 'results'
    os.makedirs(output_folder, exist_ok=True)

    output_fn = os.path.join(output_folder, f'{experiment_type}.xlsx')
    if os.path.exists(output_fn):
        print('already finished')
        return None

    df_columns = [
        'iteration_n', 'fold_n', 'n_folds', '% pos val', 'test_model',
        'use_hydrology', 'pos_weight'
    ] + list(hyper_parameters.keys()) + ['precision', 'recall', 'loss']
    output_df = pd.DataFrame(columns=df_columns)

    keys, values = zip(*hyper_parameters.items())
    experiments = list(itertools.product(*values))
    n_experiments = len(experiments)

    for experiment_n, v in enumerate(experiments, start=1):
        settings = dict(zip(keys, v))
        t0 = datetime.now()
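        # Build the input pickle path for every sample set under the current rainfall and discard settings.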
        fps = [
            os.path.join(
                'data', 'input',
                f"data_{sample_set}_correct_rainfall_{settings['correct_rainfall']}_discard_below_or_equal_to_value_{settings['discard_below_or_equal_to_value']}_{settings['rainfall_dataset']}.pickle"
            ) for sample_set in SAMPLE_SETS
        ]

        data_loader = DataLoader(fps,
                                 includes_context=True,
                                 includes_labels=True)
        data_loader.set_data(split=settings['split'],
                             n_folds=N_FOLDS,
                             iterations=ITERATIONS)

        experiment_iteration = 0
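        # data_loader.data yields one (percent positive validation, run data, iteration, fold) tuple per fold of every iteration.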
        for percent_positive_validation, run_data, iteration_n, fold_n in data_loader.data:
            experiment_iteration += 1
            for test_model in (False, ):
                for pos_weight in POS_WEIGHTS:
                    for use_hydrology in (True, False):
                        run_name = f'{pos_weight}_{use_hydrology}_{test_model}_{iteration_n}_{fold_n}'
                        if save_model:
                            save_model_path = os.path.join(
                                SAVE_DIR, f'best_model_{run_name}.ckpt')
                        else:
                            save_model_path = None
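                        # run_data's last two entries appear to be the hydrological context arrays; drop them when hydrology is excluded.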
                        if use_hydrology:
                            run_data_sel = run_data
                        else:
                            run_data_sel = run_data[:-2]
                        best_val_score, best_val_loss = train(
                            run_data_sel,
                            run_name=run_name,
                            log=False,
                            verbose=VERBOSE,
                            pos_weight=pos_weight,
                            n_epochs=settings['n_epochs'],
                            learning_rate=settings['learning_rate'],
                            batch_size=settings['batch_size'],
                            test_model=test_model,
                            use_context=use_hydrology,
                            context_labels=data_loader.context_labels,
                            save_model_path=save_model_path)
                        # DataFrame.append was removed in pandas 2.0; build the row as a Series and concatenate it instead.
                        result_row = pd.Series(
                            [iteration_n, fold_n, N_FOLDS, percent_positive_validation, test_model, use_hydrology, pos_weight] +
                            list(settings.values()) +
                            [
                                best_val_score.loc['precision', 'flood'],
                                best_val_score.loc['recall', 'flood'],
                                best_val_loss
                            ],
                            index=df_columns)
                        output_df = pd.concat([output_df, result_row.to_frame().T],
                                              ignore_index=True)
                        print(output_df)
            print(
                f'Experiment {experiment_n}/{n_experiments} finished {experiment_iteration}/{len(data_loader)} iterations',
                end='\r')
        t1 = datetime.now()
        print(
            f'Experiment {experiment_n}/{n_experiments} finished {len(data_loader)}/{len(data_loader)} iterations in {t1 - t0}'
        )

        while True:
            try:
                output_df.to_excel(output_fn, index=False)
                break
            except PermissionError:
                print()
                input(f"Please close {output_fn} and press ENTER")
                print('OK')
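
The grid expansion in main() relies on a common pattern: keep every hyperparameter value in a list and take the Cartesian product of those lists. A self-contained sketch of that pattern, using made-up values rather than the grids above, is:

import itertools

hyper_parameters = {'learning_rate': [0.0005, 0.0001], 'batch_size': [128, 256]}
keys, values = zip(*hyper_parameters.items())
for combination in itertools.product(*values):
    settings = dict(zip(keys, combination))  # one experiment configuration
    print(settings)  # e.g. {'learning_rate': 0.0005, 'batch_size': 128}

Each settings dict corresponds to one entry of the experiments list iterated over above.
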
Example #2
class Pipeline(object):
    def __init__(self):
        self.config = pipeline_config.config
        #self.pipeline_config = self.config.pipeline_config

    def get_data_loader(self):
        data_phase = self.pipeline_config['data']
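        # load_data is imported dynamically so dataloader and filter functions can be looked up by name from the config.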
        data_module = __import__('load_data')

        if 'csv' in data_phase.keys():
            # For reading 'manually' filtered data from a saved dataframe
            csv_filename = data_phase['csv']
            csv_file = self.config.get_output_data_dir() + csv_filename
            self.dataloader = CSVLoader()
            self.dataloader.set_csv_file(csv_file)
        else:
            data_func = getattr(data_module, data_phase['dataloader'])
            data_file = data_phase['file']
            data = data_func(data_file)
            self.dataloader = DataLoader(label_identifier=data_phase['labels'])
            self.dataloader.set_data(data)
            if 'filters' in data_phase.keys():
                # Every filter is a function name in load_data taking a data_loader and returning it filtered
                data_filters = data_phase['filters']
                for filter_function_name in data_filters:
                    filter_function_name = 'filter_' + filter_function_name
                    filter_func = getattr(data_module, filter_function_name)
                    self.dataloader = filter_func(self.dataloader)

        return self.dataloader

    def execute(self):
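        # Uppercase only the first character; str.capitalize() would also lowercase the rest of the name.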
        def capitalize(s):
            return s[0].upper() + s[1:]

        scenario = self.config.get_scenario()
        print(
            f"******* starting pipeline {self.config['pipeline']['cur_scenario']} ******"
        )
        for current_step in scenario:
            print(
                f" =================== Current step: {current_step} ============================="
            )
            self.config.set_current_pipeline_step(current_step)
            self.pipeline_config = self.config.pipeline_config

            logger.info('******** LOADING DATA *******')
            self.get_data_loader()

            # Training phase
            logger.info('******* TRAINING *******')
            print("get task type =========== ", self.config.get_task_type())
            if self.pipeline_config['train'].get('disable') is not True:
                train_phase = self.pipeline_config['train']
                train_module = __import__('train')
                converter_module = __import__('converter')
                converter_class = getattr(
                    converter_module,
                    capitalize(train_phase['converter']) + 'Converter')
                if self.config.get_task_type() == 'sequence':
                    trainer_class_name = 'SpanTrainer'
                else:
                    trainer_class_name = 'ClassificationTrainer'
                trainer_class = getattr(train_module, trainer_class_name)
                trainer = trainer_class(self.dataloader, converter_class())
                trainer.train()
            else:
                logger.info(">>> training disabled")
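                # Still resolve the converter class; the evaluation phase below needs it.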
                train_phase = self.pipeline_config['train']
                train_module = __import__('train')
                converter_module = __import__('converter')
                converter_class = getattr(
                    converter_module,
                    capitalize(train_phase['converter']) + 'Converter')

            # Evaluation phase
            logger.info('******* EVALUATION *******')
            if self.pipeline_config['eval'].get('disable') is not True:

                if self.config.get_task_type() == 'sequence':
                    #evaluator = SequenceEvaluator(self.dataloader)
                    evaluator = SpanEvaluator(self.dataloader)
                else:
                    evaluator = ClassificationEvaluator(self.dataloader)

                evaluator.evaluate(converter_class())

                # Create a score file for evaluation
                if self.config.get_task_type() != 'sequence':
                    from score import score_task1
                    pred_file = self.config.get_output_data_dir(
                    ) + 'predict1.txt'
                    gold_file = self.config.get_output_data_dir() + 'true1.txt'
                    score_task1(predict_file=pred_file, true_file=gold_file)

                    # Run the scorer
                    sys.path.append(
                        os.path.realpath('SEMEVAL-2021-task6-corpus'))

                    from scorer.task1_3 import evaluate, validate_files  # evaluate(pred_fpath, gold_fpath, CLASSES)
                    # from format_checker.task1_3 import validate_files
                    CLASSES = read_labels_from_file(
                        self.pipeline_config['data']['labels'])

                    if validate_files(pred_file, gold_file, CLASSES):
                        logger.info('Prediction file format is correct')
                        macro_f1, micro_f1 = evaluate(pred_file, gold_file,
                                                      CLASSES)
                        logger.info("macro-F1={:.5f}\tmicro-F1={:.5f}".format(
                            macro_f1, micro_f1))
                    else:
                        print("Failed to validate prediction & gold files")

                else:
                    print("No scoring for sequence type")

            else:
                print("Evaluation is disabled")

            # Post-processing phase
            if 'postprocess' in self.pipeline_config and \
                    self.pipeline_config['postprocess'].get('disable') is not True:
                logger.info('******* POST-PROCESSING *******')
                postprocess_phase = self.pipeline_config['postprocess']
                postprocess_module = __import__('postprocess')
                postprocess_class = getattr(
                    postprocess_module,
                    postprocess_phase['processor']['class'])
                postprocessor = postprocess_class(
                    self.config.get_model_type(),
                    self.config.get_output_dir(),
                    **postprocess_phase[
                        'processor']  # the processor entry is a dict whose keys become keyword arguments
                )
                postprocessor.execute()

            else:
                print('No post-processing defined')
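
Example #2 wires each pipeline step together by looking classes up by name with __import__ and getattr, driven entirely by the configuration. A stripped-down, self-contained sketch of that lookup pattern, using the standard-library collections module purely as a stand-in for the project's own train/converter/postprocess modules, is:

# Illustrative config entry; in the pipeline this would come from pipeline_config.
config = {'trainer': 'OrderedDict'}

module = __import__('collections')         # the pipeline imports 'train', 'converter', etc. the same way
trainer_class = getattr(module, config['trainer'])
trainer = trainer_class()                  # instantiate the class named in the config
print(type(trainer).__name__)              # -> OrderedDict

In the original code something like Pipeline().execute() presumably drives a whole scenario, resolving the dataloader, trainer, converter, evaluator, and post-processor for each step in exactly this way.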