def cleanup_outdated_predictions(results_folder: str = None,
                                 dry_run: bool = True,
                                 ignore_filter: str = 'dummy'):
    '''Delete prediction files whose git commit no longer matches the one stored
    in the corresponding result file (only report them when dry_run=True).'''
    folder = 'data/results/{}'.format(
        results_folder) if results_folder else get_result_folders()[-1]
    result_files = get_result_filenames_from_folder(folder)

    for result_file in result_files:
        if ignore_filter and ignore_filter in result_file: continue

        prediction_file = '{}/predictions/{}'.format(
            folder, filename_utils.get_filename_only(result_file))
        predictions_exist = os.path.exists(prediction_file)

        if not predictions_exist:
            LOGGER.warning(
                'Did not find prediction file for: {}'.format(result_file))
            continue

        with open(result_file, 'rb') as f:
            result_data = pickle.load(f)
        with open(prediction_file, 'rb') as f:
            r = pickle.load(f)

        result_git_commit = result_data['meta_data']['git_commit']
        git_commit = r['meta_data']['git_commit']
        if result_git_commit != git_commit:
            if dry_run:
                LOGGER.info(
                    'Outdated prediction: {}. dry_run=True, so it will not get deleted'
                    .format(prediction_file))
                continue
            LOGGER.info(
                'Outdated prediction: {}. Deleting'.format(prediction_file))
            os.remove(prediction_file)
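
A minimal usage sketch (the results folder name below is made up; LOGGER, filename_utils and the get_result_* helpers are assumed to come from the surrounding module):

# Hypothetical call pattern -- '2019-01-01_run' is a placeholder folder name.
# First pass: only report outdated prediction files.
cleanup_outdated_predictions(results_folder='2019-01-01_run', dry_run=True)

# Second pass: actually delete them, still skipping anything matching 'dummy'.
cleanup_outdated_predictions(results_folder='2019-01-01_run',
                             dry_run=False,
                             ignore_filter='dummy')
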
Example #2
    def preprocess_args(config):
        config['device'] = get_device()
        config['n_classes'] = 2 if config['loss_func'] == 'ce' else 1

        # Check all provided paths:
        if not os.path.exists(config['data_path']):
            raise ValueError("[!] ERROR: Dataset path does not exist")
        else:
            LOGGER.info("Data path checked..")
        if not os.path.exists(config['model_path']):
            LOGGER.warning(
                "Creating checkpoint path for saved models at:  {}\n".format(
                    config['model_path']))
            os.makedirs(config['model_path'])
        else:
            LOGGER.info("Model save path checked..")
        if 'config' in config:
            if not os.path.isfile(config['config']):
                raise ValueError("[!] ERROR: config JSON path does not exist")
            else:
                LOGGER.info("config JSON path checked..")
        if not os.path.exists(config['vis_path']):
            LOGGER.warning(
                "Creating checkpoint path for Tensorboard visualizations at:  {}\n"
                .format(config['vis_path']))
            os.makedirs(config['vis_path'])
        else:
            LOGGER.info("Tensorboard Visualization path checked..")
            LOGGER.info(
                "Cleaning Visualization path of older tensorboard files...\n")
            # shutil.rmtree(config['vis_path'])

        # Print args
        print("\n" + "x" * 50 +
              "\n\nRunning training with the following parameters: \n")
        for key, value in config.items():
            if not key.endswith('transf'):
                print(key + ' : ' + str(value))
        print("\n" + "x" * 50)

        # config['vis_path'] = os.path.join(config['vis_path'], '{}_conf{}'.format(config['pretrained_model_file'], config['confounder_repeat']))
        config['writer'] = SummaryWriter(config['vis_path'])

        set_seed(config['seed'])
        return config
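
A hedged sketch of how preprocess_args might be called with a minimal config; the paths and values are placeholders, and get_device, set_seed and SummaryWriter are assumed to be provided by the surrounding module (in the later examples it is invoked as TrainerTemplate.preprocess_args):

# Illustrative only -- all paths and values are placeholders.
config = {
    'loss_func': 'ce',               # -> n_classes = 2
    'data_path': 'data/',            # must already exist
    'model_path': 'checkpoints/',    # created if missing
    'vis_path': 'runs/experiment_1', # created if missing
    'seed': 42,
}
config = preprocess_args(config)
# config now additionally holds 'device', 'n_classes' and a SummaryWriter under 'writer'.
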
Example #3
def add_word(word, weights):
    word_id = len(result.vocab)
    if word in result.vocab:
        logger.warning(
            "duplicate word '%s' in %s, ignoring all but first", word, fname)
        return
    if counts is None:
        # most common scenario: no vocab file given. just make up some bogus counts, in descending order
        result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
    elif word in counts:
        # use count from the vocab file
        result.vocab[word] = Vocab(index=word_id, count=counts[word])
    else:
        # vocab file given, but word is missing -- set count to None (TODO: or raise?)
        logger.warning("vocabulary file is incomplete: '%s' is missing", word)
        result.vocab[word] = Vocab(index=word_id, count=None)
    result.syn0[word_id] = weights
    result.index2word.append(word)
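
add_word closes over result, counts, vocab_size and fname from its enclosing loader; the code looks like it comes from a word2vec-style vector loader (e.g. gensim). A rough, self-contained sketch of that environment using plain stand-ins instead of the real library classes:

import logging
from collections import namedtuple
from types import SimpleNamespace

import numpy as np

logger = logging.getLogger(__name__)

# Stand-ins for the enclosing loader's state; the real code presumably uses the
# library's own KeyedVectors/Vocab classes instead of these.
Vocab = namedtuple('Vocab', ['index', 'count'])
vocab_size, vector_size = 3, 5
fname = 'vectors.txt'   # placeholder path, only used in log messages
counts = None           # no external vocab/count file given

result = SimpleNamespace(vocab={},
                         syn0=np.zeros((vocab_size, vector_size)),
                         index2word=[])

# ...define add_word as above, then call it once per vector read from the file:
# add_word('apple', np.random.rand(vector_size))
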
Example #4
                        help='Hidden size for race and gender')

    args, unparsed = parser.parse_known_args()
    config = args.__dict__
    wandb.config.update(config)
    config['device'] = get_device()
    config['n_classes'] = 2 if config['loss_func'] == 'ce' else 1

    # Check all provided paths:
    if not os.path.exists(config['data_path']):
        raise ValueError("[!] ERROR: Dataset path does not exist")
    else:
        LOGGER.info("Data path checked..")
    if not os.path.exists(config['model_path']):
        LOGGER.warning(
            "Creating checkpoint path for saved models at:  {}\n".format(
                config['model_path']))
        os.makedirs(config['model_path'])
    else:
        LOGGER.info("Model save path checked..")
    if 'config' in config:
        if not os.path.isfile(config['config']):
            raise ValueError("[!] ERROR: config JSON path does not exist")
        else:
            LOGGER.info("config JSON path checked..")
    if not os.path.exists(config['vis_path']):
        LOGGER.warning(
            "Creating checkpoint path for Tensorboard visualizations at:  {}\n"
            .format(config['vis_path']))
        os.makedirs(config['vis_path'])
    else:
Example #5
def run_classification_task(task: ExperimentTask, cfo: ClassificationOptions,
                            experiment_config: dict):
    helper.set_random_seed()

    args = cfo
    result_filename_tmpl = filename_utils.get_result_filename_for_task(
        task, experiment_config=experiment_config, cfo=cfo)

    result_file = '{}/{}'.format(cfo.results_folder, result_filename_tmpl)
    predictions_file = '{}/{}'.format(cfo.predictions_folder,
                                      result_filename_tmpl)
    classifier_file = '{}/{}'.format(cfo.classifier_folder,
                                     result_filename_tmpl)

    if not cfo.force and os.path.exists(result_file):
        return

    time_checkpoints = {}

    def add_time_checkpoint(name):
        time_checkpoints[name] = time()

    add_time_checkpoint('start')
    X, Y, estimator, param_grid = task.fn()
    add_time_checkpoint('retrieved_data')

    # A good heuristic of whether it's a gram matrix is whether the dimensions are the same
    is_precomputed = isinstance(X, np.ndarray) and X.shape[0] == X.shape[1]

    # This is also a heuristic
    is_dummy = 'classifier__strategy' in param_grid

    # Add classifiers, instantiate transformer classes and merge with experiment config
    param_grid = experiment_helper.prepare_param_grid(task, param_grid,
                                                      experiment_config)

    LOGGER.info('ParamGrid: {}\n\n'.format(
        pipeline_helper.remove_complex_types(param_grid)))

    # Default: train on everything; the held-out prediction set starts empty
    X_train, Y_train = X, Y
    X_test, Y_test = [], []
    train_i, test_i = range(len(X)), []

    if not is_dummy:  # and cfo.create_predictions:
        # Hold out validation set for predictions
        try:
            X_train, X_test, Y_train, Y_test, train_i, test_i = train_test_split(
                X,
                Y,
                test_size=cfo.prediction_test_size,
                is_precomputed=is_precomputed,
            )
        except Exception as e:
            LOGGER.warning('Could not split dataset for predictions')
            LOGGER.exception(e)

    def get_cv(splits):
        if splits == -1:
            _, _, _, _, X_train_i, X_test_i = train_test_split(
                X_train,
                Y_train,
                test_size=0.33,
                is_precomputed=is_precomputed)
            cv = [(X_train_i, X_test_i)]
        else:
            cv = sklearn.model_selection.StratifiedKFold(
                n_splits=splits,
                shuffle=True,
                random_state=constants.RANDOM_SEED)
        return cv

    add_time_checkpoint('split_data')
    cv = get_cv(cfo.n_splits)

    should_refit = np.all([
        #not cfo.use_nested_cross_validation,
        not is_dummy,
        #cfo.create_predictions or cfo.save_best_clf
    ])

    gscv = GridSearchCV(estimator=estimator,
                        param_grid=param_grid,
                        cv=cv,
                        scoring=cfo.scoring,
                        n_jobs=cfo.n_jobs,
                        verbose=cfo.verbose,
                        refit=cfo.refit if should_refit else False)

    if cfo.use_nested_cross_validation and not is_dummy:
        cv_nested = get_cv(cfo.n_splits_nested)

        LOGGER.info('Using nested cross-validation')

        scores = sklearn.model_selection.cross_validate(
            gscv,
            X,
            Y,
            scoring=cfo.scoring,
            cv=cv_nested,
            n_jobs=cfo.n_jobs_outer,
            verbose=cfo.verbose,
            return_train_score=True)
        result = dict(scores, **param_grid)
        add_time_checkpoint('fitted_nested')
        results_helper.save_results(result,
                                    result_file,
                                    args,
                                    time_checkpoints=time_checkpoints)
        return

    gscv_result = gscv.fit(X_train, Y_train)
    add_time_checkpoint('fitted_gridsearch')

    if not is_dummy and cfo.create_predictions:
        if not len(X_test):
            LOGGER.warning('Validation set for prediction has no items')
        else:
            try:
                # Retrain the best classifier and get prediction on validation set
                Y_test_pred = gscv_result.best_estimator_.predict(X_test)
                add_time_checkpoint('predicted')
                results_helper.save_results(
                    {
                        'gscv_result':
                        remove_coefs_from_results(gscv_result.cv_results_),
                        'all_params':
                        remove_coefs_from_results(param_grid),
                        'best_params':
                        remove_coefs_from_results(gscv_result.best_params_),
                        'Y_real':
                        Y_test,
                        'Y_pred':
                        Y_test_pred,
                        'X_test':
                        X_test,
                    },
                    predictions_file,
                    args,
                    time_checkpoints=time_checkpoints)
            except Exception as e:
                LOGGER.warning('Error while trying to retrain best classifier')
                LOGGER.exception(e)

    if cfo.save_best_clf:
        best_estimator = gscv_result.best_estimator_
        try:
            results_helper.save_results(
                {
                    'params': gscv_result.best_params_,
                    'classifier': best_estimator
                },
                classifier_file,
                args,
                time_checkpoints=time_checkpoints)
        except Exception as e:
            LOGGER.warning('Error while saving best estimator: {}'.format(e))
            LOGGER.exception(e)

    add_time_checkpoint('finished')
    results_helper.save_results(gscv_result.cv_results_,
                                result_file,
                                args,
                                time_checkpoints=time_checkpoints)
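
The square-shape check above is only a heuristic for detecting a precomputed gram matrix. As a small, self-contained illustration of the situation it tries to detect, here is plain scikit-learn grid search on a precomputed kernel with toy data (none of the project's helpers are involved):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

X_raw, Y = make_classification(n_samples=60, n_features=10, random_state=0)

# Precomputed linear-kernel gram matrix: shape (n_samples, n_samples),
# which is exactly what the X.shape[0] == X.shape[1] heuristic detects.
X_gram = X_raw @ X_raw.T
assert X_gram.shape[0] == X_gram.shape[1]

gscv = GridSearchCV(estimator=SVC(kernel='precomputed'),
                    param_grid={'C': [0.1, 1, 10]},
                    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=0),
                    scoring='accuracy')
gscv.fit(X_gram, Y)
print(gscv.best_params_, gscv.best_score_)
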
Example #6
        '--model',
        type=str,
        default="BERT",
        help='Name of the model to use (BERT, RoBERTa, ELECTRA, ALBERT)')
    parser.add_argument('--lr_head',
                        type=float,
                        default=1e-4,
                        help='Learning rate for the MLP head')
    parser.add_argument('--num_layers_freeze',
                        type=int,
                        default=0,
                        help='Number of layers to freeze in BERT')

    args, unparsed = parser.parse_known_args()
    if len(unparsed) > 0:
        LOGGER.warning("There have been unprocessed parser arguments: " +
                       str(unparsed))
    config = args.__dict__
    config = TrainerTemplate.preprocess_args(config)
    # config['no_model_checkpoints'] = (config['no_model_checkpoints'] or config['debug'])
    config['model'] = config['model'].lower()

    assert config['model'] in MODEL_DICT, (
        "Given model is not known. Please choose one of the following: " +
        str(list(MODEL_DICT.keys())))
    config['model'] = MODEL_DICT[config['model']]
    # Tokenize
    tokenizer = config['model']["tokenizer"].from_pretrained(
        config['model']['pretrain'])
    tokenizer_func = partial(tokenizer,
                             max_length=config['max_txt_len'],
                             padding='longest',
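
The snippet assumes a project-level MODEL_DICT registry mapping model names to their Hugging Face tokenizer/model classes. A hedged sketch of what a single entry and the resulting tokenizer_func could look like (the class names come from the transformers library; the registry keys and the 'pretrain' value are guesses):

from functools import partial

from transformers import BertModel, BertTokenizer

# Hypothetical registry entry; the real MODEL_DICT may carry additional fields.
MODEL_DICT = {
    'bert': {
        'tokenizer': BertTokenizer,
        'model': BertModel,
        'pretrain': 'bert-base-uncased',
    },
}

entry = MODEL_DICT['bert']
tokenizer = entry['tokenizer'].from_pretrained(entry['pretrain'])
tokenizer_func = partial(tokenizer,
                         max_length=128,
                         padding='longest',
                         truncation=True,
                         return_tensors='pt')

batch = tokenizer_func(['a short example sentence'])
print(batch['input_ids'].shape)
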
Example #7
def get_results(folder=None,
                results_directory=constants.RESULTS_FOLDER,
                log_progress=tqdm.tqdm_notebook,
                exclude_filter=None,
                include_filter=None,
                remove_split_cols=True,
                remove_rank_cols=True,
                remove_fit_time_cols=True,
                filter_out_experiment=None,
                ignore_experiments=True,
                only_load_dataset=None,
                fetch_predictions=False):
    '''
    Retrieves results from result folder.

    Note: This function _seriously_ has to be refactored!

    Args:
        folder: the results folder to read from. Defaults to the most recent results folder
        results_directory: the base directory that contains the result folders
        log_progress: function to log the progress. Takes an iterable and yields its items
        exclude_filter: substring; result files containing it are skipped
        include_filter: substring; only result files containing it are loaded
        remove_split_cols: whether to drop the per-split columns of the CV results
        remove_rank_cols: whether to drop the rank columns of the CV results
        remove_fit_time_cols: whether to drop the fit/score time columns
        filter_out_experiment: experiment name; only result files of that experiment are kept
        ignore_experiments: whether to skip experiment result files (unless filter_out_experiment is given)
        only_load_dataset: only load results for the given dataset(s)
        fetch_predictions: whether to also retrieve the predictions and calculate scores on them

    Returns:
        pd.DataFrame: the results
    '''
    result_folders = get_result_folders(results_directory)

    folder = 'data/results/{}'.format(folder) if folder else result_folders[-1]

    result_files = get_result_filenames_from_folder(folder)

    if filter_out_experiment:
        result_files = [
            x for x in result_files
            if _get_experiment_name_from_filename(x) == filter_out_experiment
        ]

    if ignore_experiments and not filter_out_experiment:
        result_files = [x for x in result_files if 'experiment_' not in x]

    if only_load_dataset is not None:
        result_files = [
            x for x in result_files
            if filename_utils.get_dataset_from_filename(x) in only_load_dataset
        ]

    data_ = []
    for result_file in log_progress(
            result_files) if log_progress else result_files:
        if include_filter and include_filter not in result_file: continue
        if exclude_filter and exclude_filter in result_file: continue

        if '_nested_' in result_file:
            LOGGER.warning(
                'Encountered nested CV result file. Currently not implemented. File: {}'
                .format(result_file))
            continue

        dataset_name = filename_utils.get_dataset_from_filename(result_file)

        with open(result_file, 'rb') as f:
            result_data = pickle.load(f)

        remove_transformer_classes(result_data)

        result_file = filename_utils.get_filename_only(result_file)
        result = result_data if 'params' in result_data else result_data[
            'results']
        assert 'params' in result

        result = clean_result_keys(result)
        for idx, el in enumerate(result['params']):
            result['params'][idx] = clean_result_keys(el)

        prediction_file = '{}/predictions/{}'.format(
            folder, filename_utils.get_filename_only(result_file))
        predictions_exist = os.path.exists(prediction_file)

        num_results = len(result['params'])
        result['prediction_file_exists'] = [predictions_exist] * num_results

        if fetch_predictions and not predictions_exist:
            LOGGER.warning(
                'fetch_predictions=True but could not find prediction: {}'.
                format(prediction_file))

        # Fetch predictions and check whether the git commits are the same.
        # Also, calculate the prediction scores
        if fetch_predictions and predictions_exist:
            with open(prediction_file, 'rb') as f:
                r = pickle.load(f)
            result_git_commit = result_data['meta_data']['git_commit']
            git_commit = r['meta_data']['git_commit']
            if git_commit != result_git_commit:
                LOGGER.warning(
                    'Mismatched git commits for prediction/result file! Prediction: {}, Result: {}'
                    .format(git_commit, result_git_commit))
            else:
                prediction = r['results']
                Y_real, Y_pred, X_test = prediction['Y_real'], prediction[
                    'Y_pred'], prediction['X_test']
                scores = calculate_scores(Y_real, Y_pred)
                for name, val in scores.items():
                    result['prediction_score_{}'.format(
                        name)] = [val] * num_results
                result['prediction_file'] = [prediction_file] * num_results

        def is_graph_dataset():
            graph_file_types = [
                constants.TYPE_CONCEPT_MAP, constants.TYPE_COOCCURRENCE,
                'graph_extra'
            ]
            is_graph_dataset_ = False
            for x in graph_file_types:
                if '_{}_'.format(x) in result_file:
                    is_graph_dataset_ = True
                    break
            return is_graph_dataset_

        result['combined'] = np.any([
            'graph_combined__dataset_' in result_file,
            'graph_text_combined__dataset_' in result_file
        ])

        # TEXT
        if is_graph_dataset():
            is_cooccurrence_dataset = constants.TYPE_COOCCURRENCE in result_file
            result[
                'type'] = constants.TYPE_COOCCURRENCE if is_cooccurrence_dataset else constants.TYPE_CONCEPT_MAP

            result['lemmatized'] = '_lemmatized_' in result_file
            result['kernel'] = get_kernel_from_filename(result_file)

            # Co-Occurrence
            if is_cooccurrence_dataset:
                parts = re.findall(r'cooccurrence_(.+?)_(.+?)_',
                                   result_file)[0]
                assert len(parts) == 2
                result['window_size'], result['words'] = parts
            # Concept Maps
            else:
                result['words'] = 'concepts'
        # DUMMY
        elif 'dummy' in result_file:
            result['type'] = 'dummy'
            result['words'] = 'dummy'
        # TEXT
        else:
            result['type'] = 'text'
            result['words'] = ['all'] * num_results

        if 'time_checkpoints' in result_data:
            timestamps = result_data['time_checkpoints']
            timestamps = sorted(timestamps.items(), key=lambda x: x[1])

            start = timestamps[0][1]
            end = timestamps[-1][1]

            result['timestamps'] = [timestamps] * num_results
            result['time'] = [end - start] * num_results

        result['filename'] = result_file
        result['dataset'] = dataset_name

        # Add meta data
        info = {}
        if 'results' in result_data:
            info = {
                'info__' + k: v
                for k, v in result_data.get('meta_data', result_data).items()
                if k != 'results'
            }
        result = dict(result,
                      **{k: [v] * num_results
                         for k, v in info.items()})

        data_.append(result)

    df_all = None
    for d in data_:
        result_df = pd.DataFrame(d)
        # pandas removed DataFrame.append in 2.0; use pd.concat instead
        df_all = result_df if df_all is None else pd.concat([df_all, result_df])

    if df_all is None or not len(df_all):
        LOGGER.warning('Did not retrieve results! Aborting')
        return None

    # Remove cols
    df_all = df_all[[
        x for x in df_all.columns.tolist()
        if (not remove_split_cols or not re.match(r'^split\d', x)) and (
            not remove_fit_time_cols or not re.search(r'_time$', x)) and (
                not remove_rank_cols or not re.match(r'rank_', x))
    ]]

    # Change the column order
    prio_columns = ['dataset', 'type', 'combined']
    low_prio_columns = ['params', 'filename'] + [
        c for c in df_all.columns
        if c.startswith('std_') or c.startswith('mean_')
    ]
    columns = df_all.columns.tolist()
    for c in prio_columns + low_prio_columns:
        columns.remove(c)

    return df_all.reset_index(drop=True)[prio_columns + columns +
                                         low_prio_columns]
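
A hedged usage sketch of get_results; the folder name and filter string are placeholders, and the helper modules (constants, filename_utils, the scoring helpers) are assumed to be importable:

# Illustrative only -- folder and filter are made up.
df = get_results(folder='2019-01-01_run',
                 include_filter='concept_map',
                 fetch_predictions=True,
                 log_progress=None)   # pass None to skip the tqdm progress bar

if df is not None:
    # Column names such as 'mean_test_score' depend on the scoring used in the grid search.
    print(df.groupby(['dataset', 'type'])['mean_test_score'].max())
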
Example #8
        '--model',
        type=str,
        default="BERT",
        help='Name of the model to use (BERT, RoBERTa, ELECTRA, ALBERT, ...)')
    parser.add_argument('--lr_head',
                        type=float,
                        default=1e-4,
                        help='Learning rate for the MLP head')
    parser.add_argument('--num_layers_freeze',
                        type=int,
                        default=0,
                        help='Number of layers to freeze in BERT')

    args, unparsed = parser.parse_known_args()
    if len(unparsed) > 0:
        LOGGER.warning("There have been unprocessed parser arguments: " +
                       str(unparsed))
    config = args.__dict__
    config = TrainerTemplate.preprocess_args(config)
    config['model'] = config['model'].lower()

    assert config['model'] in MODEL_DICT, (
        "Given model is not known. Please choose one of the following: " +
        str(list(MODEL_DICT.keys())))
    config['model'] = MODEL_DICT[config['model']]
    # Tokenize
    tokenizer = config['model']["tokenizer"].from_pretrained(
        config['model']['pretrain'])
    tokenizer_func = partial(tokenizer,
                             max_length=config['max_txt_len'],
                             padding='longest',
                             truncation=True,