Code Example #1
def load_train_dataset(dataset, embeddings):
    ling_feat_spmatrix, docids = load_ling_features(dataset,
                                                    training_data_path)

    print('Number of documents with linguistic features: %i' % len(docids))

    data_root_dir = os.path.expanduser(training_data_path)
    csvdirname = os.path.join(data_root_dir,
                              'argument_data/%s-new-CSV/' % dataset)

    print('Loading train/test data from %s...' % csvdirname)

    person_train = []
    a1_train = []
    a2_train = []
    ids_train = []
    prefs_train = []
    X_a1 = []
    X_a2 = []

    for file_name in listdir(csvdirname):
        if file_name.split('.')[-1] != 'csv':
            print("Skipping files without .csv suffix: %s" % csvdirname + '/' +
                  file_name)
            continue

        Xa1, Xa2, labels, ids, turker_ids, a1, a2 = load_single_file_separate_args(
            csvdirname, file_name, word_to_indices_map, None)

        X_a1.extend(Xa1)
        X_a2.extend(Xa2)

        a1_train.extend(a1)
        a2_train.extend(a2)

        person_train.extend(turker_ids)
        prefs_train.extend(labels)
        ids_train.extend(ids)

    train_ids = np.array([ids_pair.split('_') for ids_pair in ids_train])

    print('No. documents in training set: %i' %
          len(np.unique([train_ids[:, 0], train_ids[:, 1]])))

    a1_train = get_docidxs_from_ids(docids, train_ids[:, 0])
    a2_train = get_docidxs_from_ids(docids, train_ids[:, 1])

    items_feat, uids = concat_feature_sets((a1_train, a2_train), [X_a1, X_a2],
                                           ling_feat_spmatrix, embeddings)

    ndims = items_feat.shape[1]

    return items_feat, ling_feat_spmatrix.shape[1], word_to_indices_map, a1_train, \
           a2_train, prefs_train, ndims
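
For context, a minimal sketch of a call site for this loader. The dataset name and the embeddings object are placeholders, and the function also depends on module-level globals (training_data_path, word_to_indices_map) defined elsewhere in the source module.

# Hypothetical usage -- 'UKPConvArgStrict' and `embeddings` are placeholders.
items_feat, n_ling_feats, word_to_indices_map, a1_train, a2_train, prefs_train, ndims = \
    load_train_dataset('UKPConvArgStrict', embeddings)
print('%i items with %i feature dimensions (%i linguistic)' %
      (items_feat.shape[0], ndims, n_ling_feats))
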
Code Example #2
def get_text_from_fold_regression(fold, dataset):
    X_test, _, ids_test, _, test_a = fold

    # Identify the arguments in the false pairs.
    X_test = np.array(X_test)

    _, docids = load_ling_features(dataset)
    testids_a = get_docidxs_from_ids(docids, ids_test)
    _, _, utexts = get_doc_token_seqs((testids_a), [X_test], [test_a])

    return testids_a, utexts
Code Example #3
def load_test_dataset(output, embeddings):
    # Load the linguistic features
    print(("Loading linguistic features from %s" % output))
    ling_feat_spmatrix, docids = load_ling_features(
        'new_test_data', output, '', output,
        model.features.shape[1] - len(embeddings[0]))

    print('Loaded libSVM data')

    X = []
    test_ids = []
    a = []

    for file_name in listdir(input_dir):
        if file_name.split('.')[-1] != 'csv':
            print("Skipping files without .csv suffix: %s" % input_dir + '/' +
                  file_name)
            continue

        data = pd.read_csv(os.path.join(input_dir, file_name),
                           delimiter='\t',
                           na_values=[])
        data = data.fillna('N/A')

        ids = data['#id'].values
        a1 = data['argument'].values

        a1_tokens = [
            vocabulary_embeddings_extractor.tokenize(a1_line) for a1_line in a1
        ]
        a1_indices = [[
            word_to_indices_map.get(word, 2) for word in a1_tokens_line
        ] for a1_tokens_line in a1_tokens]
        Xa1 = np.array([[1] + a1_indices_line
                        for a1_indices_line in a1_indices])

        valid_args = np.in1d(ids, docids)
        a1 = a1[valid_args]
        Xa1 = Xa1[valid_args]
        ids = ids[valid_args]

        a.extend(a1)
        X.extend(Xa1)
        test_ids.extend(ids)

    # Build an index map by sorting the document IDs, then combine the
    # linguistic features with the embeddings for the test items.
    docid_to_idx_map = np.argsort(docids).flatten()
    test_items_feat, uids = concat_feature_sets(
        (test_ids), [X], ling_feat_spmatrix, embeddings, docid_to_idx_map)

    return test_items_feat, uids
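
The per-argument encoding used above (tokenise, look each word up in word_to_indices_map with index 2 for unknown words, and prepend a 1) can be shown in isolation; the toy vocabulary below is hypothetical.

# Hypothetical toy vocabulary standing in for word_to_indices_map;
# .get(word, 2) maps out-of-vocabulary words to index 2, as above.
word_to_indices_map = {'guns': 4, 'should': 5, 'be': 6, 'banned': 7}

tokens = ['guns', 'should', 'be', 'banned', 'everywhere']
indices = [word_to_indices_map.get(word, 2) for word in tokens]
encoded = [1] + indices   # leading 1 prepended, as for Xa1 above
print(encoded)            # [1, 4, 5, 6, 7, 2]
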
Code Example #4
def get_text_from_fold(fold, dataset):
    X_test_a1, X_test_a2, _, ids_test, _, test_a1, test_a2 = fold
    test_a1 = np.array(test_a1).flatten()
    test_a2 = np.array(test_a2).flatten()

    # Identify the arguments in the false pairs.
    X_test_a1 = np.array(X_test_a1)
    X_test_a2 = np.array(X_test_a2)

    testids = np.array([ids_pair.split('_') for ids_pair in ids_test])

    _, docids = load_ling_features(dataset)
    testids_a1 = get_docidxs_from_ids(docids, testids[:, 0])
    testids_a2 = get_docidxs_from_ids(docids, testids[:, 1])
    _, _, utexts = get_doc_token_seqs(
        (testids_a1, testids_a2), [X_test_a1, X_test_a2], (test_a1, test_a2))

    return testids_a1, testids_a2, utexts
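
A sketch of how this helper might be applied to a single fold's test split; it assumes a folds dictionary and expt_settings of the form used in compute_max_train_similarity below.

# Assumes `folds` maps fold names to {"training": ..., "test": ...} tuples and
# that expt_settings['dataset'] names the dataset, as in the next example.
fold = list(folds.keys())[0]
testids_a1, testids_a2, utexts = get_text_from_fold(folds.get(fold)["test"],
                                                    expt_settings['dataset'])
print('%i argument texts recovered for fold %s' % (len(utexts), fold))
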
Code Example #5
def compute_max_train_similarity(expt_settings,
                                 method,
                                 ls,
                                 docids,
                                 items_feat,
                                 similarities_all=None):
    '''
    Find the maximum cosine similarity for arguments in the dataset.

    Compute the mean/variance of the max similarity for correct/incorrect pairs.
    '''
    # Load the results for GPPL with Ling.
    expt_settings_1 = expt_settings.copy()
    expt_settings_1['method'] = method
    expt_settings_1['feature_type'] = 'ling'
    expt_settings_1['embeddings_type'] = ''

    data_root_dir = os.path.expanduser("~/data/personalised_argumentation/")
    resultsfile_template = 'habernal_%s_%s_%s_%s_acc%.2f_di%.2f'

    resultsdir_1 = get_results_dir(data_root_dir, resultsfile_template,
                                   expt_settings_1)

    # Load the results for GPPL with Glove.

    nFolds = len(list(folds.keys()))

    mean_false = 0
    mean_true = 0
    var_false = 0
    var_true = 0

    if similarities_all is None:
        feats = items_feat / ls[None, :]
        similarities_all = cosine_similarity(
            feats, feats, dense_output=True
        )  #matern_3_2_from_raw_vals(items_feat, ls, items_feat)

    total_count_true = 0
    total_count_false = 0

    for f in range(nFolds):
        fold = list(folds.keys())[f]

        if 'fold_order' in expt_settings_1 and expt_settings_1[
                'fold_order'] is not None:
            f1 = np.argwhere(
                np.array(expt_settings_1['fold_order']) == fold)[0][0]
        else:
            f1 = f

        foldfile = resultsdir_1 + '/fold%i.pkl' % f1
        if not os.path.isfile(foldfile):
            # Avoid reusing data from a previous fold if this fold's file is missing.
            print('Skipping missing results file: %s' % foldfile)
            continue
        with open(foldfile, 'rb') as fh:
            data_1 = pickle.load(fh, encoding='latin1')

        # Load the ground truth classifications
        gold_disc_1, pred_disc_1, _, _, _, _, _, _, _ = get_fold_data(
            data_1, f1, expt_settings_1)

        # Identify the falsely classified pairs with Ling
        #gold_disc_1 = gold_disc_1[:, None]
        #gold_disc_2 = gold_disc_2[:, None]
        if expt_settings_1['method'] == 'SVM':
            pred_disc_1 = pred_disc_1[:, 1]
        pred_disc_1 = pred_disc_1.flatten()
        false_pairs_1 = pred_disc_1 != gold_disc_1
        true_pairs_1 = pred_disc_1 == gold_disc_1

        # Get the argument IDs for this fold
        X_test_a1, X_test_a2, _, ids_test, _, test_a1, test_a2 = folds.get(
            fold)["test"]
        test_a1 = np.array(test_a1)[:, None]
        test_a2 = np.array(test_a2)[:, None]

        testids = np.array([ids_pair.split('_') for ids_pair in ids_test])
        X_test_a1 = np.array(X_test_a1)
        X_test_a2 = np.array(X_test_a2)

        testids_a1 = get_docidxs_from_ids(docids, testids[:, 0])
        testids_a2 = get_docidxs_from_ids(docids, testids[:, 1])

        X_tr_a1, X_tr_a2, _, ids_tr, _, tr_a1, tr_a2 = folds.get(
            fold)["training"]
        tr_a1 = np.array(tr_a1)[:, None]
        tr_a2 = np.array(tr_a2)[:, None]

        trids = np.array([ids_pair.split('_') for ids_pair in ids_tr])
        X_tr_a1 = np.array(X_tr_a1)
        X_tr_a2 = np.array(X_tr_a2)

        _, docids = load_ling_features(expt_settings_1['dataset'])
        trids_a1 = get_docidxs_from_ids(docids, trids[:, 0])
        trids_a2 = get_docidxs_from_ids(docids, trids[:, 1])

        true_similarities = similarities_all[np.concatenate((testids_a1[true_pairs_1], testids_a2[true_pairs_1])), :]\
                                [:, np.concatenate((trids_a1, trids_a2))]
        true_similarities = np.max(true_similarities, axis=1)

        false_similarities = similarities_all[np.concatenate((testids_a1[false_pairs_1], testids_a2[false_pairs_1])), :]\
                                [:, np.concatenate((trids_a1, trids_a2))]
        false_similarities = np.max(false_similarities, axis=1)

        total_count_true += np.sum(true_pairs_1) * 2.0
        total_count_false += np.sum(false_pairs_1) * 2.0
        mean_total_sims_true = np.sum(true_similarities)
        mean_total_sims_false = np.sum(false_similarities)
        var_total_sims_true = np.var(true_similarities)
        var_total_sims_false = np.var(false_similarities)

        mean_false += mean_total_sims_false
        mean_true += mean_total_sims_true
        var_false += var_total_sims_false
        var_true += var_total_sims_true

        #print "mean total_similarity for correctly classified pairs: %f (STD %f)" % (mean_total_sims_true,
        #                                                                           np.sqrt(var_total_sims_true))
        #print "mean total_similarity for incorrectly classified pairs: %f (STD %f)" % (mean_total_sims_false,
        #                                                                            np.sqrt(var_total_sims_false))
        sys.stdout.write('.')
        sys.stdout.flush()

    mean_false /= total_count_false
    mean_true /= total_count_true
    # Average the per-fold variances; take the square root only once, when
    # reporting the standard deviation below.
    var_false /= nFolds
    var_true /= nFolds

    print("For all folds: mean total_sim for correctly classified pairs: %f (STD %f)"
          % (mean_true, np.sqrt(var_true)))
    print("For all folds: mean total_sim for incorrectly classified pairs: %f (STD %f)"
          % (mean_false, np.sqrt(var_false)))

    return similarities_all
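
The similarity step above first rescales each feature dimension by its lengthscale and then takes cosine similarities between all items; the maximum similarity of a test item to any training item is what gets averaged. A self-contained sketch with random stand-ins for items_feat and ls:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.RandomState(0)
items_feat = rng.rand(6, 4)          # stand-in for the item feature matrix
ls = np.array([0.5, 1.0, 2.0, 4.0])  # stand-in for the per-dimension lengthscales

# Divide each dimension by its lengthscale, then compare every item to every item.
feats = items_feat / ls[None, :]
similarities_all = cosine_similarity(feats, feats, dense_output=True)

# Treat items 0-2 as "training" items: for each item, find its closest training item.
max_sim_to_train = np.max(similarities_all[:, :3], axis=1)
print(max_sim_to_train)
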
Code Example #6
                                    'word_mean')

    # Step 2: Inspect the arguments that 'both' gets right and embeddings or ling alone gets wrong. Expect the results
    # to be similar to those from the previous step.
    print_where_one_right_two_wrong(expt_settings, 'embeddings', 'both',
                                    'word_mean', 'word_mean')
    print_where_one_right_two_wrong(expt_settings, 'ling', 'both', '',
                                    'word_mean')

    print_where_one_right_two_wrong(expt_settings, 'both', 'embeddings',
                                    'word_mean', 'word_mean')
    print_where_one_right_two_wrong(expt_settings, 'both', 'ling', 'word_mean',
                                    '')

    # Step 3: Compare GPPL to SVM to see which handles outliers better given the same features
    ling_feat_spmatrix, docids = load_ling_features(expt_settings['dataset'])

    if 'ls' not in globals():
        ls = compute_lengthscale_heuristic('ling',
                                           '',
                                           None,
                                           ling_feat_spmatrix,
                                           docids,
                                           folds,
                                           None,
                                           multiply_heuristic_power=0.5)
    items_feat = ling_feat_spmatrix.toarray()

    if 'similarity' not in globals():
        similarity = None
    similarity = compute_max_train_similarity(expt_settings,
Code Example #7
def get_fold_data(data, f, expt_settings, flip_labels=False):
    # discrete labels are 0, 1 or 2
    try:
        if len(data[3][f]):
            gold_disc = np.array(data[3][f])

            pred_disc = np.array(data[1][f]) * 2
            if pred_disc.ndim == 1:
                pred_disc = pred_disc[:, np.newaxis]
            #if expt_settings['method'] == 'SVM':
            if flip_labels:
                pred_disc = 2 - pred_disc

            # probabilities
            gold_prob = gold_disc / 2.0
            pred_prob = np.array(data[0][f])
            if pred_prob.ndim == 1:
                pred_prob = pred_prob[:, np.newaxis]
            #if expt_settings['method'] == 'SVM':
            if flip_labels:
                pred_prob = 1 - pred_prob

            # scores used to rank
            if len(data[4]) > 0:
                gold_rank = np.array(data[4][f])
            else:
                gold_rank = None

            if len(data[2]) > 0:
                pred_rank = np.array(data[2][f])

                if pred_rank.ndim == 1:
                    pred_rank = pred_rank[:, np.newaxis]

                if flip_labels:
                    pred_rank = -pred_rank
            else:
                gold_rank = None
                pred_rank = None

            if len(data) > 8 and data[8] is not None and f in data[8] and data[
                    8][f] is not None:
                pred_tr_disc = np.round(np.array(data[8][f])) * 2
                pred_tr_prob = np.round(np.array(data[8][f]))
                if flip_labels:
                    pred_tr_disc = 2 - pred_tr_disc
                    pred_tr_prob = 1 - pred_tr_prob
            else:
                pred_tr_disc = None
                pred_tr_prob = None

        else:
            raise Exception('Data not found')
    except Exception:  # fall back to the old data format without per-fold nesting
        gold_disc = np.array(data[3])
        pred_disc = np.array(data[1]) * 2
        if pred_disc.ndim == 1:
            pred_disc = pred_disc[:, np.newaxis]
        #if expt_settings['method'] == 'SVM':
        #    pred_disc = 2 - pred_disc

        # probabilities
        gold_prob = gold_disc / 2.0
        pred_prob = np.array(data[0])
        if pred_prob.ndim == 1:
            pred_prob = pred_prob[:, np.newaxis]
        #if expt_settings['method'] == 'SVM':
        #    pred_prob = 1 - pred_prob

        # scores used to rank
        if data[4] is not None and len(data[4]) > 0:
            gold_rank = np.array(data[4])
        else:
            gold_rank = None

        if data[2] is not None and (len(data[2]) > 0
                                    or data[2].item() is not None):
            pred_rank = np.array(data[2])

            if pred_rank.ndim == 1:
                pred_rank = pred_rank[:, np.newaxis]
        else:
            gold_rank = None
            pred_rank = None

        if len(data) > 8 and data[8] is not None:
            pred_tr_disc = np.round(np.array(data[8])) * 2
            pred_tr_prob = np.round(np.array(data[8]))
        else:
            pred_tr_disc = None
            pred_tr_prob = None

        if flip_labels:
            pred_disc = 2 - pred_disc

            pred_prob = 1 - pred_prob
            if pred_rank is not None:
                pred_rank = -pred_rank
            if pred_tr_disc is not None:
                pred_tr_disc = 2 - pred_tr_disc
            if pred_tr_prob is not None:
                pred_tr_prob = 1 - pred_tr_prob

        #any postprocessing e.g. to remove errors when saving data
        postprocced = False

        if pred_rank is not None and gold_rank is not None and pred_rank.shape[
                0] == 1052 and gold_rank.shape[0] != 1052:
            # we predicted the whole dataset instead of the subset
            from tests import get_fold_regression_data
            if expt_settings['fold_order'] is not None:
                fold = expt_settings['fold_order'][f]
            else:
                fold = list(expt_settings['folds'].keys())[f]
            _, docids = load_ling_features(expt_settings['dataset'])
            _, _, _, item_idx_ranktest, _, _ = get_fold_regression_data(
                expt_settings['folds_regression'], fold, docids)
            pred_rank = pred_rank[item_idx_ranktest, :]
            postprocced = True
            print("Postprocessed: %i, %i" %
                  (pred_rank.shape[0], gold_rank.shape[0]))

    # Considering only the labels where a confident prediction has been made... In this case the metrics should be
    # shown alongside coverage.
    #     gold_disc = gold_disc[np.abs(pred_prob.flatten() - 0.5) > 0.3]
    #     pred_disc = pred_disc[np.abs(pred_prob.flatten() - 0.5) > 0.3]
    #
    #     gold_prob = gold_prob[np.abs(pred_prob.flatten() - 0.5) > 0.3]
    #     pred_prob = pred_prob[np.abs(pred_prob.flatten() - 0.5) > 0.3]

    return gold_disc, pred_disc, gold_prob, pred_prob, gold_rank, pred_rank, pred_tr_disc, pred_tr_prob, postprocced
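
A small standalone illustration of the label conventions that get_fold_data relies on: discrete labels in {0, 1, 2} (1 is treated as "don't know" downstream), probabilities obtained by halving, and flip_labels inverting both representations (ranking scores would simply be negated).

import numpy as np

gold_disc = np.array([0, 2, 1, 2])
gold_prob = gold_disc / 2.0       # -> [0. , 1. , 0.5, 1. ]

pred_half = np.array([0.0, 1.0, 1.0, 0.0])
pred_disc = pred_half * 2         # back on the {0, 1, 2} scale

flipped_disc = 2 - pred_disc      # -> [2., 0., 0., 2.]
flipped_prob = 1 - pred_half      # -> [1., 0., 0., 1.]
print(flipped_disc, flipped_prob)
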
Code Example #8
def compute_metrics(expt_settings,
                    methods,
                    datasets,
                    feature_types,
                    embeddings_types,
                    accuracy=1.0,
                    di=0,
                    npairs=0,
                    tag='',
                    remove_seen_from_mean=False,
                    max_no_folds=32,
                    min_folds_desired=0,
                    compute_tr_performance=False,
                    flip_labels=[]):

    expt_settings['acc'] = accuracy
    expt_settings['di'] = di

    row_index = np.zeros(len(methods) * len(datasets), dtype=object)
    columns = np.zeros(len(feature_types) * len(embeddings_types),
                       dtype=object)

    row = 0

    if expt_settings['di'] == 0 or np.ceil(
            float(npairs) / float(expt_settings['di'])) == 0:
        AL_rounds = np.array([0]).astype(int)
    else:
        AL_rounds = np.arange(expt_settings['di'],
                              npairs + expt_settings['di'],
                              expt_settings['di'],
                              dtype=int)
        #np.arange( np.ceil(np.float(npairs) / np.float(expt_settings['di'])), dtype=int)

    if tag == '':
        ts = time.time()
        tag = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d-%H-%M-%S')

    for d, dataset_next in enumerate(datasets):

        docids = None

        if expt_settings['dataset'] != dataset_next or expt_settings[
                'folds'] is None:
            expt_settings['dataset'] = dataset_next
            expt_settings['folds'], expt_settings[
                'folds_regression'], _, _, _ = load_train_test_data(
                    expt_settings['dataset'])

        for m, expt_settings['method'] in enumerate(methods):

            if d == 0 and m == 0:

                if expt_settings['di'] == 0:
                    results_shape = (len(methods) * len(datasets),
                                     len(feature_types) *
                                     len(embeddings_types),
                                     len(expt_settings['folds']) + 1, 1)
                else:
                    results_shape = (len(methods) * len(datasets),
                                     len(feature_types) *
                                     len(embeddings_types),
                                     len(expt_settings['folds']) + 1,
                                     int(npairs / expt_settings['di']))

                results_f1 = np.zeros(results_shape)
                results_acc = np.zeros(results_shape)
                results_logloss = np.zeros(results_shape)
                results_auc = np.zeros(results_shape)

                results_pearson = np.zeros(results_shape)
                results_spearman = np.zeros(results_shape)
                results_kendall = np.zeros(results_shape)

                tr_results_f1 = np.zeros(results_shape)
                tr_results_acc = np.zeros(results_shape)
                tr_results_logloss = np.zeros(results_shape)
                tr_results_auc = np.zeros(results_shape)

            row_index[row] = expt_settings['method'] + ', ' + expt_settings[
                'dataset']
            col = 0

            for expt_settings['feature_type'] in feature_types:
                if expt_settings['feature_type'] == 'ling':
                    embeddings_to_use = ['']
                else:
                    embeddings_to_use = embeddings_types
                for expt_settings['embeddings_type'] in embeddings_to_use:
                    data, nFolds, resultsdir, resultsfile = load_results_data(
                        data_root_dir, resultsfile_template, expt_settings,
                        max_no_folds)

                    min_folds = min_folds_desired

                    for f in range(nFolds):
                        print("Processing fold %i" % f)
                        if expt_settings[
                                'fold_order'] is None:  # fall back to the order on the current machine
                            fold = list(expt_settings['folds'].keys())[f]
                        else:
                            fold = expt_settings['fold_order'][f]
                            if fold[-2] == "'" and fold[0] == "'":
                                fold = fold[1:-2]
                            elif fold[-1] == "'" and fold[0] == "'":
                                fold = fold[1:-1]
                            expt_settings['fold_order'][f] = fold

                        # look for new-style data in separate files for each fold. Prefer new-style if both are found.
                        foldfile = resultsdir + '/fold%i.pkl' % f
                        if os.path.isfile(foldfile):
                            with open(foldfile, 'rb') as fh:
                                data_f = pickle.load(fh, encoding='latin1')
                        else:  # convert the old stuff to new stuff
                            if data is None:
                                min_folds = f + 1
                                print('Skipping fold with no data %i' % f)
                                print("Skipping results for %s, %s, %s, %s" %
                                      (expt_settings['method'],
                                       expt_settings['dataset'],
                                       expt_settings['feature_type'],
                                       expt_settings['embeddings_type']))
                                print(
                                    "Skipped filename was: %s, old-style results file would be %s"
                                    % (foldfile, resultsfile))
                                continue

                            if not os.path.isdir(resultsdir):
                                os.mkdir(resultsdir)
                            data_f = []
                            for thing in data:
                                if f in thing:
                                    data_f.append(thing[f])
                                else:
                                    data_f.append(thing)
                            with open(foldfile, 'wb') as fh:
                                pickle.dump(data_f, fh)

                        gold_disc, pred_disc, gold_prob, pred_prob, gold_rank, pred_rank, pred_tr_disc, \
                                                    pred_tr_prob, postprocced = get_fold_data(data_f, f, expt_settings,
                                                                                          flip_labels=m in flip_labels)
                        if postprocced:  # data was postprocessed and needs saving
                            with open(foldfile, 'wb') as fh:
                                pickle.dump(data_f, fh)
                        if pred_tr_disc is not None:
                            print(
                                str(pred_tr_disc.shape) + ', ' +
                                str(pred_prob.shape) + ', ' +
                                str(pred_tr_disc.shape[0] +
                                    pred_prob.shape[0]))

                        for AL_round, _ in enumerate(AL_rounds):
                            #print "fold %i " % f
                            #print AL_round
                            if AL_round >= pred_disc.shape[1]:
                                continue
                            results_f1[row, col, f, AL_round] = f1_score(
                                gold_disc[gold_disc != 1],
                                pred_disc[gold_disc != 1, AL_round],
                                average='macro')
                            #skip the don't knows
                            results_acc[row, col, f,
                                        AL_round] = accuracy_score(
                                            gold_disc[gold_disc != 1],
                                            pred_disc[gold_disc != 1,
                                                      AL_round])

                            results_logloss[row, col, f, AL_round] = log_loss(
                                gold_prob[gold_disc != 1],
                                pred_prob[gold_disc != 1, AL_round])

                            results_auc[row, col, f, AL_round] = roc_auc_score(
                                gold_prob[gold_disc != 1],
                                pred_prob[gold_disc != 1, AL_round])  # macro

                            if gold_rank is None and expt_settings[
                                    'folds_regression'] is not None:
                                if docids is None:
                                    _, docids = load_ling_features(
                                        expt_settings['dataset'])
                                # ranking data was not saved in original file. Get it from the expt_settings['folds_regression'] here
                                _, rankscores_test, _, _ = expt_settings[
                                    'folds_regression'].get(fold)["test"]
                                gold_rank = np.array(rankscores_test)

                            if gold_rank is not None and pred_rank is not None:
                                results_pearson[row, col, f,
                                                AL_round] = pearsonr(
                                                    gold_rank,
                                                    pred_rank[:, AL_round])[0]
                                results_spearman[row, col, f,
                                                 AL_round] = spearmanr(
                                                     gold_rank,
                                                     pred_rank[:, AL_round])[0]
                                results_kendall[row, col, f,
                                                AL_round] = kendalltau(
                                                    gold_rank,
                                                    pred_rank[:, AL_round])[0]

                            def mean_unseen(result, remove_seen_from_mean):

                                if not remove_seen_from_mean:
                                    return result

                                N = len(gold_tr)
                                Nseen = (AL_round + 1) * expt_settings['di']
                                Nunseen = (N - Nseen)
                                return (result * N - Nseen) / Nunseen

                            if pred_tr_prob is not None and AL_round < pred_tr_disc.shape[
                                    1] and compute_tr_performance:
                                _, _, gold_tr, _, _, _, _ = expt_settings[
                                    'folds'].get(fold)["training"]
                                gold_tr = np.array(gold_tr)

                                if (gold_tr !=
                                        1).shape[0] != pred_tr_disc.shape[0]:
                                    print("Mismatch in fold %s! %i, %i" %
                                          (fold, (gold_tr != 1).shape[0],
                                           pred_tr_disc.shape[0]))

                                gold_tr_prob = gold_tr / 2.0

                                tr_results_f1[
                                    row, col, f, AL_round] = mean_unseen(
                                        f1_score(gold_tr[gold_tr != 1],
                                                 pred_tr_disc[gold_tr != 1,
                                                              AL_round],
                                                 average='macro'),
                                        remove_seen_from_mean)
                                #skip the don't knows
                                tr_results_acc[
                                    row, col, f, AL_round] = mean_unseen(
                                        accuracy_score(
                                            gold_tr[gold_tr != 1],
                                            pred_tr_disc[gold_tr != 1,
                                                         AL_round]),
                                        remove_seen_from_mean)

                                tr_results_logloss[
                                    row, col, f, AL_round] = mean_unseen(
                                        log_loss(
                                            gold_tr_prob[gold_tr != 1],
                                            pred_tr_prob[gold_tr != 1,
                                                         AL_round]),
                                        remove_seen_from_mean)

                                tr_results_auc[
                                    row, col, f, AL_round] = mean_unseen(
                                        roc_auc_score(
                                            gold_tr_prob[gold_tr != 1],
                                            pred_tr_prob[gold_tr != 1,
                                                         AL_round]),
                                        remove_seen_from_mean)
                            elif pred_tr_prob is not None and AL_round >= pred_tr_disc.shape[
                                    1]:
                                tr_results_f1[row, col, f, AL_round] = 1
                                tr_results_acc[row, col, f, AL_round] = 1
                                tr_results_auc[row, col, f, AL_round] = 1
                                tr_results_logloss[row, col, f, AL_round] = 0

                        for AL_round in range(results_f1.shape[3]):
                            foldrange = np.arange(
                                min_folds, max_no_folds
                            )  # skip any rounds that did not complete when taking the mean
                            foldrange = foldrange[results_f1[row, col,
                                                             foldrange,
                                                             AL_round] != 0]

                            results_f1[row, col, -1, AL_round] = np.mean(
                                results_f1[row, col, foldrange, AL_round],
                                axis=0)
                            results_acc[row, col, -1, AL_round] = np.mean(
                                results_acc[row, col, foldrange, AL_round],
                                axis=0)
                            results_logloss[row, col, -1, AL_round] = np.mean(
                                results_logloss[row, col, foldrange, AL_round],
                                axis=0)
                            results_auc[row, col, -1, AL_round] = np.mean(
                                results_auc[row, col, foldrange, AL_round],
                                axis=0)

                            results_pearson[row, col, -1, AL_round] = np.mean(
                                results_pearson[row, col, foldrange, AL_round],
                                axis=0)
                            results_spearman[row, col, -1, AL_round] = np.mean(
                                results_spearman[row, col, foldrange,
                                                 AL_round],
                                axis=0)
                            results_kendall[row, col, -1, AL_round] = np.mean(
                                results_kendall[row, col, foldrange, AL_round],
                                axis=0)

                            tr_results_f1[row, col, -1, AL_round] = np.mean(
                                tr_results_f1[row, col, foldrange, AL_round],
                                axis=0)
                            tr_results_acc[row, col, -1, AL_round] = np.mean(
                                tr_results_acc[row, col, foldrange, AL_round],
                                axis=0)
                            tr_results_logloss[
                                row, col, -1, AL_round] = np.mean(
                                    tr_results_logloss[row, col, foldrange,
                                                       AL_round],
                                    axis=0)
                            tr_results_auc[row, col, -1, AL_round] = np.mean(
                                tr_results_auc[row, col, foldrange, AL_round],
                                axis=0)

                    print('p-values for %s, %s, %s, %s:' %
                          (expt_settings['dataset'], expt_settings['method'],
                           expt_settings['feature_type'],
                           expt_settings['embeddings_type']))

                    print(
                        wilcoxon(results_f1[0, 0, foldrange, AL_round],
                                 results_f1[row, col, foldrange, AL_round])[1])
                    print(
                        wilcoxon(results_acc[0, 0, foldrange, AL_round],
                                 results_acc[row, col, foldrange,
                                             AL_round])[1])
                    print(
                        wilcoxon(
                            results_logloss[0, 0, foldrange, AL_round],
                            results_logloss[row, col, foldrange, AL_round])[1])
                    print(
                        wilcoxon(results_auc[0, 0, foldrange, AL_round],
                                 results_auc[row, col, foldrange,
                                             AL_round])[1])
                    print(
                        wilcoxon(
                            results_pearson[0, 0, foldrange, AL_round],
                            results_pearson[row, col, foldrange, AL_round])[1])
                    print(
                        wilcoxon(
                            results_spearman[0, 0, foldrange, AL_round],
                            results_spearman[row, col, foldrange,
                                             AL_round])[1])
                    print(
                        wilcoxon(
                            results_kendall[0, 0, foldrange, AL_round],
                            results_kendall[row, col, foldrange, AL_round])[1])

                    if row == 0:  # set the column headers
                        columns[col] = expt_settings[
                            'feature_type'] + ', ' + expt_settings[
                                'embeddings_type']

                    col += 1

            row += 1

    combined_labels = []
    for row in row_index:
        for col in columns:
            combined_labels.append(str(row) + '_' + str(col))

    mean_results = []
    mean_results.append(
        collate_AL_results(AL_rounds, results_f1, combined_labels,
                           "Macro-F1 scores for round %i: "))
    mean_results.append(
        collate_AL_results(AL_rounds, results_acc, combined_labels,
                           "Accuracy (excl. don't knows), round %i:")
    )  # for UKPConvArgStrict don't knows are already omitted
    mean_results.append(
        collate_AL_results(AL_rounds, results_auc, combined_labels,
                           "AUC ROC, round %i:"))
    #if AUC is higher than accuracy and F1 score, it suggests that decision boundary is not calibrated or that
    #accuracy may improve if we exclude data points close to the decision boundary
    mean_results.append(
        collate_AL_results(AL_rounds, results_logloss, combined_labels,
                           "Cross Entropy classification error, round %i: "))
    #(quality of the probability labels is taken into account)
    mean_results.append(
        collate_AL_results(AL_rounds, results_pearson, combined_labels,
                           "Pearson's r for round %i: "))
    mean_results.append(
        collate_AL_results(AL_rounds, results_spearman, combined_labels,
                           "Spearman's rho for round %i: "))
    mean_results.append(
        collate_AL_results(AL_rounds, results_kendall, combined_labels,
                           "Kendall's tau for round %i: "))

    if np.any(tr_results_acc):
        mean_results.append(
            collate_AL_results(AL_rounds, tr_results_f1, combined_labels,
                               "(TR) Macro-F1 scores for round %i: "))
        mean_results.append(
            collate_AL_results(AL_rounds, tr_results_acc, combined_labels,
                               "(TR) Accuracy for round %i: "))
        mean_results.append(
            collate_AL_results(AL_rounds, tr_results_auc, combined_labels,
                               "(TR) AUC ROC for round %i: "))
        mean_results.append(
            collate_AL_results(AL_rounds, tr_results_logloss, combined_labels,
                               "(TR) Cross Entropy Error for round %i: "))


#     metricsfile = data_root_dir + 'outputdata/expt_root_dir' + \
#                     'metrics_%s.pkl' % (tag)
#     with open(metricsfile, 'w') as fh:
#         pickle.dump((results_f1, results_acc, results_auc, results_logloss, results_pearson, results_spearman,
#                      results_kendall), fh)

# TODO: Correlations between reasons and features?

# TODO: Correlations between reasons and latent argument features found using preference components?

    return results_f1, results_acc, results_auc, results_logloss, results_pearson, results_spearman, results_kendall,\
            tr_results_f1, tr_results_acc, tr_results_auc, tr_results_logloss, mean_results, combined_labels
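
A minimal, self-contained version of the per-fold scoring above: the "don't know" pairs (gold label 1) are dropped before computing each metric. The toy arrays stand in for a single fold and active-learning round.

import numpy as np
from sklearn.metrics import accuracy_score, f1_score, log_loss, roc_auc_score

gold_disc = np.array([0, 2, 1, 2, 0, 2])            # 1 marks a "don't know" pair
pred_disc = np.array([0, 2, 2, 0, 0, 2])
gold_prob = gold_disc / 2.0
pred_prob = np.array([0.1, 0.9, 0.7, 0.4, 0.2, 0.8])

keep = gold_disc != 1                                # skip the don't knows

print(f1_score(gold_disc[keep], pred_disc[keep], average='macro'))
print(accuracy_score(gold_disc[keep], pred_disc[keep]))
print(log_loss(gold_prob[keep], pred_prob[keep]))
print(roc_auc_score(gold_prob[keep], pred_prob[keep]))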