Example #1
    def evaluate(epoch):
        model.eval()

        def sub_eval(data_loader):
            gts = []
            preds = []
            for data, target in data_loader:
                data = data.to(device)
                pred = model.predict(data)
                preds.append(pred.cpu().numpy())
                gts.append(target.numpy())
            preds = np.concatenate(preds, axis=0)
            gts = np.concatenate(gts)
            return preds, gts

        # eval on train
        preds_train, gts_train = sub_eval(train_loader)
        metrics_train = compute_metrics(preds_train, gts_train, num_class)
        logger.info('Train metrics: acc: {:.3%}, class-avg-acc: {:.3%}'.format(
            metrics_train[0], metrics_train[1]))

        # eval on test

        preds_test, gts_test = sub_eval(test_loader)
        metrics_test = compute_metrics(preds_test, gts_test, num_class)
        logger.info('Test Metrics: acc: {:.3%}, class-avg-acc: {:.3%}'.format(
            metrics_test[0], metrics_test[1]))

        vis_all_metrics(metrics_train, metrics_test, epoch, np.arange(num_class), False)
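For orientation, here is a minimal sketch of the compute_metrics(preds, gts, num_class) contract this snippet assumes: metrics[0] is overall accuracy and metrics[1] is class-averaged accuracy. Treating preds as already-decoded class indices is my assumption; the real helper is not shown here.

import numpy as np

def compute_metrics(preds, gts, num_class):
    # sketch only: preds and gts are assumed to be 1-D arrays of class indices
    preds = np.asarray(preds)
    gts = np.asarray(gts)
    acc = (preds == gts).mean()
    # class-averaged accuracy: mean per-class accuracy over classes present in gts
    per_class = [(preds[gts == c] == c).mean()
                 for c in range(num_class) if (gts == c).any()]
    return acc, float(np.mean(per_class))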
Example #2
    def test_compute_metrics(self):
        """
        Test sort_by_metrics using dummy metrics m1, m2.
        m1 (m2) gives higher rating to the first (second) object.
        """
        m1 = StationMetric(name="m1", weight=0.3, function_name="metrics.tests.dummy_m1")
        m1.save()
        m2 = ModelMetric(name="m2", weight=0.5, function_name="metrics.tests.dummy_m2")
        m2.save()

        o1 = "1st_object"
        o2 = "2nd_object"

        # compute m1, m2
        rating = compute_metrics([o1, o2], [m1, m2], as_dict=True)
        self.assertEqual(rating[o1], 1 * 0.3 + 1 * 0.5)
        self.assertEqual(rating[o2], 1 * 0.3 + 2 * 0.5)

        # m2 raises error, should be skipped (only m1 is computed)
        rating = compute_metrics([o1, o2], [m1, m2], as_dict=True, raise_error=True)
        self.assertEqual(rating[o1], 1 * 0.3 + 0 * 0.5)
        self.assertEqual(rating[o2], 1 * 0.3 + 0 * 0.5)
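Reading the assertions backwards, the dummy metric fixtures would look roughly like this (a sketch; the actual metrics.tests.dummy_m1 and dummy_m2 are not shown in the example):

def dummy_m1(obj):
    # contributes weight * 1 for every object
    return 1

def dummy_m2(obj):
    # rates the second object higher: weight * 1 for o1, weight * 2 for o2
    return 2 if obj == "2nd_object" else 1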
Example #3
import json
import os

import numpy as np

import TfidfLsa

from util import compute_metrics, cmp_gt

data_path = "data/heybox/input_data/data.json"
threshold = 0.6 # 0.6 | 0.8
is_cmp = True
lsa_n_components = 200
gt_path = "data/heybox/ground_truth/"
out_path = "data/heybox/cmp/tfidf_lsa_{}/".format(threshold)

np.random.seed(1)

with open(data_path, "r") as f:
    query_data = json.load(f)

deduplication = TfidfLsa.Deduplication(threshold=threshold)
bag, query_items = deduplication.deduplicate(query_data, lsa_n_components)

# test on ground truth
gt_dirs = [d for d in os.listdir(gt_path) if '.txt' in d]
print("gt classes: {}".format(len(gt_dirs)))
print("pred classes: {}".format(len(bag)))
ave_p, ave_r, f1 = compute_metrics(gt_dirs, gt_path, bag, query_items)
print("recall: %f, precision: %f, f1-score: %f" % (ave_r, ave_p, f1))

# output different pred results from ground truth
if is_cmp:
    cmp_gt(bag, gt_dirs, out_path, gt_path, query_items)
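The reported f1-score is presumably the harmonic mean of the averaged precision and recall; for reference (the helper name f1_from_pr is hypothetical):

def f1_from_pr(ave_p, ave_r):
    # harmonic mean of precision and recall, defined as 0 when both are 0
    return 2 * ave_p * ave_r / (ave_p + ave_r) if (ave_p + ave_r) > 0 else 0.0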
Example #4
def evaluate(Qtest, DTest, **args):
    '''
    @input subset of topics Qtest ⊆ Q, testing document collection DTest,
    judgments RTest, and arguments for the classification and retrieval modules
    @behavior evaluates the IR system in the presence and absence of relevance
    feedback; in the presence of relevance feedback, the training and testing
    functions are called for each topic in Qtest for a more comprehensive
    assessment
    @output performance statistics for the underlying classification system
    and for the behavior of the aided IR system
    '''
    RTrain = args.get('RTrain')
    RTest = args.get('RTest')
    trainX = args.get('trainX')
    testX = args.get('testX')
    classifier_type = args.get('classifier_type')
    hyper_parameters = args.get('hyper_parameters')
    Model = dict()
    #metrics
    aidedRanked = dict()
    aidedNonRanked = dict()
    nonAided = dict()
    classifier_metrics = dict()
    #k used for retrieval
    k = args.get('k')
    ranking_type = args.get('ranking_type')
    for topic in Qtest:
        relevant, nonRelevant = getRelevantNonRelevant(topic)
        '''
        Non-Aided IR
        '''
        ranked_docs_names = [
            name for score, name in sorted(zip(testX[topic], DTest[topic]),
                                           key=lambda pair: RRF(pair[0]),
                                           reverse=True)
        ]
        precision, recall, fscoreVal, precision_recall_curve, bpref, avg_prec = compute_metrics(
            ranked_docs_names, relevant, nonRelevant, k)
        nonAided[topic] = [
            precision, recall, fscoreVal, precision_recall_curve, bpref,
            avg_prec
        ]
        '''
        Train the model
        '''
        try:
            if hyper_parameters:
                Model[topic] = training(
                    topic,
                    trainX,
                    RTrain,
                    classifier_type=classifier_type,
                    hyper_paremeters=hyper_parameters[topic])
            else:
                Model[topic] = training(topic,
                                        trainX,
                                        RTrain,
                                        classifier_type=classifier_type,
                                        hyper_paremeters=None)
            aidedRanked[topic], aidedNonRanked[topic], classifier_metrics[
                topic] = Model[topic].evaluate(topic,
                                               DTest,
                                               RTest,
                                               k=k,
                                               testX=testX,
                                               ranking_type=ranking_type,
                                               relevant=relevant,
                                               nonRelevant=nonRelevant)
        except ValueError:
            print(
                "For topic ", topic,
                "the classifier needs samples of at least 2 classes in the data",
                "but the data contains only one class: 1")
            aidedNonRanked[topic] = nonAided[topic]
            aidedRanked[topic] = nonAided[topic]
            #values from Non Aided
            classifier_metrics[topic] = [
                precision, recall, fscoreVal, avg_prec
            ]
    '''
    Calculate Average values for the metrics
    '''
    nonAided['MAP'] = mean([nonAided[topic][5] for topic in nonAided])
    nonAided['Mean BPREF'] = mean(
        [nonAided[topic][4] for topic in nonAided if topic != 'MAP'])

    aidedRanked['MAP'] = mean([aidedRanked[topic][5] for topic in aidedRanked])
    aidedRanked['Mean BPREF'] = mean(
        [aidedRanked[topic][4] for topic in aidedRanked if topic != 'MAP'])

    aidedNonRanked['MAP'] = mean(
        [aidedNonRanked[topic][5] for topic in aidedNonRanked])
    aidedNonRanked['Mean BPREF'] = mean([
        aidedNonRanked[topic][4] for topic in aidedNonRanked if topic != 'MAP'
    ])

    classifier_metrics['MAP'] = mean(
        [classifier_metrics[topic][3] for topic in classifier_metrics])
    return aidedRanked, aidedNonRanked, nonAided, classifier_metrics
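Examples #4 and #6 both call a compute_metrics that returns a 6-tuple over a ranked list. The sketch below illustrates one plausible reading of that contract, assuming relevant and nonRelevant are sets of document names; bpref is stubbed out because it needs the judged non-relevant pool.

def compute_metrics(ranked_names, relevant, nonRelevant, k):
    # sketch only: precision/recall/F1 at cut-off k over a ranked list
    top_k = ranked_names[:k]
    hits = [name in relevant for name in top_k]
    precision = sum(hits) / k if k else 0.0
    recall = sum(hits) / len(relevant) if relevant else 0.0
    fscore = (2 * precision * recall / (precision + recall)
              if precision + recall else 0.0)
    # one (recall, precision) point after each relevant hit -> PR curve
    pr_curve, found = [], 0
    for rank, hit in enumerate(hits, start=1):
        if hit:
            found += 1
            pr_curve.append((found / len(relevant), found / rank))
    avg_prec = (sum(p for _, p in pr_curve) / len(relevant)
                if relevant else 0.0)
    bpref = 0.0  # placeholder: bpref needs the judged non-relevant set
    return precision, recall, fscore, pr_curve, bpref, avg_prec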
Example #5
#!/usr/bin/env python
import argparse
import codecs
import joblib  # sklearn.externals.joblib was removed from scikit-learn; use the standalone package
from util import build_features, test_svm, compute_metrics, test_words

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train NER on SVM')
    parser.add_argument('-t', '--test', default='../data/test-raw/', help='test file')
    parser.add_argument('-r', '--test_ref', default='../data/test/', help='reference labels test file')
    parser.add_argument('-m', '--model', default='../models/trained_full_sfx_pfx_0506_2040.pkl', help='Model Trained with SVM')
    opts = parser.parse_args()
    labels_test = []
    predict_test = []
    features_test = []
    clf = joblib.load(opts.model)
    print "Building Features with Test Set..."
    build_features(opts.test, opts.test_ref, features_test, labels_test,1)
    print "Predict with Trained Model..."
    test_svm(features_test, predict_test, clf)
    print "Compute Metrics..."
    compute_metrics(predict_test, labels_test)
    with codecs.open('../data/ne_words.hi',encoding="utf8",mode="wb") as o_file:
        for ind in range(len(predict_test[0])):
            if predict_test[0][ind]==1:     #or ind in test_words_in_train:
                o_file.write(test_words[ind])
                o_file.write('\n')
    o_file.close()
Example #6
    def evaluate(self, topic, DTest, RTest, **kwargs):
        k = kwargs.get('k')
        testX = kwargs.get('testX')
        ranking_type = kwargs.get('ranking_type')
        relevant = kwargs.get('relevant')
        nonRelevant = kwargs.get('nonRelevant')
        '''
        Evaluating the classifier
        '''
        docs = np.array(testX[topic])
        feedback = RTest[topic]
        y_scores = self.clf.predict_proba(docs)
        y_pred = self.clf.predict(docs)
        precision, recall, fscore, support = precision_recall_fscore_support(
            feedback, y_pred, average='macro', zero_division=1)
        avg_prec_score = average_precision_score(feedback, y_scores[:, 1])
        classifier_metrics = [precision, recall, fscore, avg_prec_score]
        '''
        Binary retrieval
        '''
        scores_names = zip(y_scores, DTest[topic])
        positive_class_predicted = [
            doc for doc in scores_names if doc[0][1] > 0.5
        ]
        aided_non_ranked_docs_names = [
            doc[1] for doc in positive_class_predicted
        ]
        '''
        Evaluating binary retrieval
        '''
        precision, recall, fscoreVal, precision_recall_curve, bpref, avg_prec = compute_metrics(
            aided_non_ranked_docs_names, relevant, nonRelevant, k)
        aidedNonRanked = [
            precision, recall, fscoreVal, precision_recall_curve, bpref,
            avg_prec
        ]
        '''
        Extension towards ranking
        '''
        if ranking_type == 'proba':  # sort according to probabilities
            aided_ranked_docs_names = [
                x for _, x in sorted(zip(y_scores, DTest[topic]),
                                     key=lambda pair: pair[0][1],
                                     reverse=True)
            ]
        else:  # sort according to the score of the docs classified as positive
            aided_ranked_docs_names = [
                doc[1] for doc in sorted(positive_class_predicted,
                                         key=lambda x: RRF(x[0]),
                                         reverse=True)[:k]
            ]
        '''
        Evaluating Aided IR
        '''
        precision, recall, fscoreVal, precision_recall_curve, bpref, avg_prec = compute_metrics(
            aided_ranked_docs_names, relevant, nonRelevant, k)
        aidedRanked = [
            precision, recall, fscoreVal, precision_recall_curve, bpref,
            avg_prec
        ]

        return aidedRanked, aidedNonRanked, classifier_metrics
Example #7
def evaluate_epoch(data_iter, models, num_personas, gradient_accumulation_steps, device, dataset, epoch, \
    apply_interaction, matching_method, aggregation_method):
    epoch_loss = []
    ok = 0
    total = 0
    recall = []
    MRR = []
    print_every = 1000
    if len(models) == 1:
        if num_personas == 0:
            context_model, response_model = models[0], models[0]
        else:
            context_model, response_model, persona_model = models[0], models[0], models[0]
    elif len(models) == 2:
        context_model, response_model = models
    elif len(models) == 3:
        context_model, response_model, persona_model = models
    
    for batch_idx, batch in enumerate(data_iter):
        batch = tuple(t.to(device) for t in batch)
        batch_y = {"input_ids": batch[3], "attention_mask": batch[4], "token_type_ids": batch[5]}
        has_persona = len(batch) > 6
        
        # get context embeddings in chunks due to memory constraint
        batch_size = batch[0].shape[0]
        chunk_size = 20
        num_chunks = math.ceil(batch_size/chunk_size)

        if apply_interaction:
            # batch_x_mask = batch[0].ne(0).float()
            # batch_y_mask = batch[3].ne(0).float()
            batch_x_mask = batch[1].float()
            batch_y_mask = batch[4].float()
            
            batch_x_emb = []
            batch_x_pooled_emb = []
            with torch.no_grad():
                for i in range(num_chunks):
                    mini_batch_x = {
                        "input_ids": batch[0][i*chunk_size: (i+1)*chunk_size], 
                        "attention_mask": batch[1][i*chunk_size: (i+1)*chunk_size], 
                        "token_type_ids": batch[2][i*chunk_size: (i+1)*chunk_size]
                        }
                    mini_output_x = context_model(**mini_batch_x)
                    batch_x_emb.append(mini_output_x[0]) # [(chunk_size, seq_len, emb_size), ...]
                    batch_x_pooled_emb.append(mini_output_x[1])
                batch_x_emb = torch.cat(batch_x_emb, dim=0) # (batch_size, seq_len, emb_size)
                batch_x_pooled_emb = torch.cat(batch_x_pooled_emb, dim=0)
                emb_size = batch_x_emb.shape[-1]

            if has_persona:
                # batch_persona_mask = batch[6].ne(0).float()
                batch_persona_mask = batch[7].float()
                batch_persona_emb = []
                batch_persona_pooled_emb = []
                with torch.no_grad():
                    for i in range(num_chunks):
                        mini_batch_persona = {
                            "input_ids": batch[6][i*chunk_size: (i+1)*chunk_size], 
                            "attention_mask": batch[7][i*chunk_size: (i+1)*chunk_size], 
                            "token_type_ids": batch[8][i*chunk_size: (i+1)*chunk_size]
                            }
                        mini_output_persona = persona_model(**mini_batch_persona)

                        # [(chunk_size, emb_size), ...]
                        batch_persona_emb.append(mini_output_persona[0])
                        batch_persona_pooled_emb.append(mini_output_persona[1])

                    batch_persona_emb = torch.cat(batch_persona_emb, dim=0)
                    batch_persona_pooled_emb = torch.cat(batch_persona_pooled_emb, dim=0)

            with torch.no_grad():
                output_y = response_model(**batch_y)
                batch_y_emb = output_y[0]
            batch_size, sent_len, emb_size = batch_y_emb.shape

            # interaction
            # context-response attention
            num_candidates = batch_size
            
            with torch.no_grad():
                # evaluate per example
                logits = []
                for i in range(batch_size):
                    x_emb = batch_x_emb[i:i+1].repeat_interleave(num_candidates, dim=0) # (num_candidates, context_len, emb_size)
                    x_mask = batch_x_mask[i:i+1].repeat_interleave(num_candidates, dim=0) # (num_candidates, context_len)
                    persona_emb, persona_mask = None, None
                    if has_persona:
                        persona_emb = batch_persona_emb[i:i+1].repeat_interleave(num_candidates, dim=0)
                        persona_mask = batch_persona_mask[i:i+1].repeat_interleave(num_candidates, dim=0)

                    logits_single = fuse(context_model, matching_method, aggregation_method, \
                        x_emb, batch_y_emb, persona_emb, x_mask, batch_y_mask, persona_mask, 1, num_candidates).reshape(-1)
                    
                    logits.append(logits_single)
                logits = torch.stack(logits, dim=0)
                
                # compute loss
                targets = torch.arange(batch_size, dtype=torch.long, device=batch[0].device)
                loss = F.cross_entropy(logits, targets)

            num_ok = (targets.long() == logits.float().argmax(dim=1)).sum()
            valid_recall, valid_MRR = compute_metrics_from_logits(logits, targets)
        else:
            batch_x_emb = []
            with torch.no_grad():
                for i in range(num_chunks):
                    mini_batch_x = {
                        "input_ids": batch[0][i*chunk_size: (i+1)*chunk_size], 
                        "attention_mask": batch[1][i*chunk_size: (i+1)*chunk_size], 
                        "token_type_ids": batch[2][i*chunk_size: (i+1)*chunk_size]
                        }
                    mini_output_x = context_model(**mini_batch_x)
                    batch_x_emb.append(mini_output_x[0].mean(dim=1)) # [(chunk_size, emb_size), ...]
                batch_x_emb = torch.cat(batch_x_emb, dim=0) # (batch_size, emb_size)
                emb_size = batch_x_emb.shape[-1]

            if has_persona:
                batch_persona_emb = []
                with torch.no_grad():
                    for i in range(num_chunks):
                        mini_batch_persona = {
                            "input_ids": batch[6][i*chunk_size: (i+1)*chunk_size], 
                            "attention_mask": batch[7][i*chunk_size: (i+1)*chunk_size], 
                            "token_type_ids": batch[8][i*chunk_size: (i+1)*chunk_size]
                            }
                        mini_output_persona = persona_model(**mini_batch_persona)

                        # [(chunk_size, emb_size), ...]
                        batch_persona_emb.append(mini_output_persona[0].mean(dim=1))
                       
            with torch.no_grad():
                if has_persona:
                    # persona embeddings only exist when the batch carries personas
                    batch_persona_emb = torch.cat(batch_persona_emb, dim=0)
                    batch_x_emb = (batch_x_emb + batch_persona_emb) / 2

                output_y = response_model(**batch_y)
                batch_y_emb = output_y[0].mean(dim=1)

            # compute loss
            loss, num_ok = dot_product_loss(batch_x_emb, batch_y_emb)
            valid_recall, valid_MRR = compute_metrics(batch_x_emb, batch_y_emb)
        
        ok += num_ok.item()
        total += batch[0].shape[0]

        # compute valid recall
        recall.append(valid_recall)
        MRR.append(valid_MRR)

        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps
        epoch_loss.append(loss.item())

        if batch_idx % print_every == 0:
            cprint("loss: ", np.mean(epoch_loss[-print_every:]))
            cprint("valid recall: ", np.mean(recall[-print_every:], axis=0))
            cprint("valid MRR: ", np.mean(MRR[-print_every:], axis=0))

    acc = ok/total
    # compute recall for validation dataset
    recall = np.mean(recall, axis=0)
    MRR = np.mean(MRR)
    return np.mean(epoch_loss), (acc, recall, MRR)
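In the non-interaction branch, compute_metrics(batch_x_emb, batch_y_emb) scores each context against every in-batch response, where row i's positive is column i. Below is a sketch of that recall/MRR computation; the cut-off list k_list is my assumption:

import torch

def compute_metrics(context_emb, response_emb, k_list=(1, 5)):
    # sketch only: in-batch retrieval with dot-product similarity
    scores = context_emb @ response_emb.t()            # (batch, batch)
    ranks = scores.argsort(dim=1, descending=True)     # candidates by score
    targets = torch.arange(scores.size(0), device=scores.device)
    # rank position of the true response in each row
    pos = (ranks == targets.unsqueeze(1)).float().argmax(dim=1)
    recall = [(pos < k).float().mean().item() for k in k_list]
    mrr = (1.0 / (pos.float() + 1.0)).mean().item()
    return recall, mrr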
Example #8
def run_baseline_multi(input_path: str, setting_keys: Optional[List[str]] = None):
    # Read settings file
    with open(input_path) as file:
        settings = json.load(file)

    for setting_key, settings_data in settings.items():
        # Skip settings whose key is not listed (run everything when no keys are given)
        if setting_keys is not None and setting_key not in setting_keys:
            continue

        # Get name of settings
        settings_name = create_config_key(settings_data)
        # Get the relevant data from the settings
        model = settings_data.get("model")
        vectorization = settings_data.get("vectorization")
        use_description = settings_data.get("use_description")
        train_langs = settings_data.get("train_lang")
        test_langs = settings_data.get("eval_lang")
        category = settings_data.get("category")

        # Create a string of the train languages
        train_langs_str = ", ".join(train_langs)

        # Process the categories separately
        dataset_p = pathlib.Path(input_path).parent.joinpath("datasets")
        train_data_p = dataset_p.joinpath(
            f'multi_class_train_set_{category}.csv')
        test_data_p = dataset_p.joinpath(
            f'multi_class_test_set_{category}.csv')

        # Read the data
        train_data = pd.read_csv(train_data_p)
        test_data = pd.read_csv(test_data_p)

        # Filter the train data:
        train_data = train_data.loc[train_data["lang"].isin(train_langs)]
        # Prepare the train and test data for the experiments and get the mapping of the labels
        train_data, test_data, label_dict_inv = prep_data_multi(
            train_data, test_data, use_description)

        # Compute the feature embedding
        if vectorization == BINARY:
            vectorizer = CountVectorizer(analyzer="word",
                                         encoding='utf-8',
                                         tokenizer=None,
                                         preprocessor=None,
                                         stop_words=None,
                                         ngram_range=(1, 2),
                                         max_features=5000,
                                         binary=True)

            train_data_embeddings = vectorizer.fit_transform(
                train_data['content']).toarray()

        elif vectorization == TFIDF:
            vectorizer = TfidfVectorizer()

            train_data_embeddings = vectorizer.fit_transform(
                train_data['content']).toarray()

        else:
            # Other vectorizations are not implemented
            raise AssertionError

        # Fit the models
        if model == LOGIT:
            est = LogisticRegression()
            # Description needs more time to converge
            parameters = {
                'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'class_weight': ['balanced'],
                'max_iter': [800],
                'n_jobs': [-2]
            }

        elif model == RAFO:
            est = RandomForestClassifier()
            parameters = {
                'n_estimators': [100],
                'max_features': ['sqrt', 'log2', None],
                'max_depth': [2, 4, 7, 10],
                'min_samples_split': [2, 5, 10, 20],
                'min_samples_leaf': [1, 2, 4, 8],
                'class_weight': ['balanced_subsample'],
                'n_jobs': [-2]
            }

        elif model == SVM:
            est = LinearSVC()
            parameters = {
                'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'class_weight': ['balanced']
            }
        else:
            # Other models are not implemented
            raise AssertionError

        print(model)

        # Define randomized search and fit model
        rs = RandomizedSearchCV(estimator=est,
                                param_distributions=parameters,
                                scoring="f1_macro",
                                cv=5,
                                n_jobs=-2,
                                verbose=1,
                                n_iter=100,
                                refit=True)

        rs.fit(train_data_embeddings, train_data["label"].astype(int))

        # Run predictions
        scores_per_lang = {}
        for lang in test_langs:
            # Subset the test data
            test_data_lang = test_data.loc[test_data['lang'] == lang]

            # Retrieve representations & word co-occurrence vectors for test set
            test_data_embeddings_lang = vectorizer.transform(
                test_data_lang['content']).toarray()

            # prediction and computation of metrics to measure performance of model
            pred = rs.best_estimator_.predict(test_data_embeddings_lang)

            # Map the predictions back to cluster ids
            pred_cl_id = np.array([label_dict_inv[x] for x in pred])
            # Map the true labels back to cluster ids
            true_cl_id = test_data_lang["old_label_id"].to_numpy()

            scores_per_lang[lang] = compute_metrics({
                "labels": true_cl_id,
                "predictions": pred_cl_id
            }).get("f1")
            output_and_store_results(settings_data=settings_data,
                                     setting_key=settings_name,
                                     category=category,
                                     train_langs_str=train_langs_str,
                                     lang=lang,
                                     result=scores_per_lang[lang],
                                     all_scores="",
                                     hyperparameters=[rs.best_params_],
                                     input_path=input_path,
                                     predictions=pred_cl_id)
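Examples #8 and #10 pass compute_metrics a dict of labels and predictions and read back "f1". A minimal sketch of that interface; macro averaging is my assumption, chosen to match the f1_macro scoring used in the search:

from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    # sketch only: expects {"labels": ..., "predictions": ...}
    labels = eval_pred["labels"]
    predictions = eval_pred["predictions"]
    return {"f1": f1_score(labels, predictions, average="macro")}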
Example #9
def run_baseline_pair(input_path: str, setting_keys: Optional[List[str]] = None):
    # Read settings file
    with open(input_path) as file:
        settings = json.load(file)

    for setting_key, setting_data in settings.items():
        # Skip settings whose key is not listed (run everything when no keys are given)
        if setting_keys is not None and setting_key not in setting_keys:
            continue

        # Get name of settings
        settings_name = create_config_key(setting_data)
        # Get the relevant data from the settings
        model = setting_data.get("model")
        vectorization = setting_data.get("vectorization")
        dataset_size = setting_data.get("dataset_size")
        use_description = setting_data.get("use_description")
        train_langs = setting_data.get("train_lang")
        test_langs = setting_data.get("eval_lang")
        category = setting_data.get("category")

        # Create a string of the train languages
        train_langs_str = ", ".join(train_langs)

        # Process the categories separately
        dataset_p = pathlib.Path(input_path).parent.joinpath("datasets")
        if dataset_size in (SMALL, MEDIUM, LARGE, XLARGE):
            train_data_p = dataset_p.joinpath(
                f'pairwise_train_set_{category}_{dataset_size}.csv')
            test_data_p = dataset_p.joinpath(
                f'pairwise_test_set_{category}.csv')
        else:
            # Other dataset sizes are not implemented
            raise AssertionError

        # Read the data
        train_data = pd.read_csv(train_data_p)
        test_data = pd.read_csv(test_data_p)

        # Filter the train data:
        train_data = train_data.loc[train_data["lang_1"].isin(train_langs)]
        # Prepare the train and test data for the experiments and get the mapping of the labels
        train_data, test_data = prep_data_pair(train_data, test_data,
                                               use_description)

        ## Generate features
        if vectorization == COOC:
            # Generate CooC feature
            contents = pd.concat([train_data['content_1'],
                                  train_data['content_2']]).drop_duplicates()

            # Initialize Vectorizer
            cv = CountVectorizer(binary=True,
                                 analyzer='word',
                                 encoding='utf-8',
                                 max_features=5000)

            # Fit Vectorizer
            cv.fit(contents)

            # Retrieve representations & word co-occurrence vectors for train set
            cv_content1_train = cv.transform(train_data['content_1']).toarray()
            cv_content2_train = cv.transform(train_data['content_2']).toarray()
            train_data_embeddings = np.multiply(cv_content1_train,
                                                cv_content2_train)

        elif vectorization == MAGELLAN:
            # Retrieve tables A,B,G
            A, B, G = prep_data_pair_mallegan(train_data, use_description)

            # Generate features automatically
            feature_table = em.get_features_for_matching(
                A, B, validate_inferred_attr_types=False)

            # Select the attrs. to be included in the feature vector table
            # Title refers to either the title or the concatenated title and description
            attrs_from_table = ['title_left', 'title_right']

            # Convert the labeled data to feature vectors using the feature table
            H = em.extract_feature_vecs(G,
                                        feature_table=feature_table,
                                        attrs_before=attrs_from_table,
                                        attrs_after='label',
                                        show_progress=False)

            # Replace NA values
            H.fillna(-1, inplace=True)

            # Select attributes which should not be used by the classifier
            attrs_to_be_excluded = []
            attrs_to_be_excluded.extend(['id', 'l_id', 'r_id',
                                         'label'])  # label
            attrs_to_be_excluded.extend(attrs_from_table)

            # Retrieve training data
            train_data_embeddings = H.drop(columns=attrs_to_be_excluded)

            # Normalize features
            normalizer = preprocessing.Normalizer().fit(train_data_embeddings)
            train_data_embeddings = normalizer.transform(train_data_embeddings)

        else:
            # Other vectorizations are not implemented
            raise AssertionError

        # Fit the models
        if model == LOGIT:
            est = LogisticRegression()
            parameters = {
                'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'class_weight': ['balanced'],
                'max_iter': [5000],
                'n_jobs': [-2]
            }

        elif model == RAFO:
            est = RandomForestClassifier()
            parameters = {
                'n_estimators': [100],
                'max_features': ['sqrt', 'log2', None],
                'max_depth': [2, 4, 7, 10],
                'min_samples_split': [2, 5, 10, 20],
                'min_samples_leaf': [1, 2, 4, 8],
                'class_weight': ['balanced_subsample'],
                'n_jobs': [-2]
            }

        elif model == SVM:
            est = LinearSVC()
            parameters = {
                'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'class_weight': ['balanced']
            }
        else:
            # Other models are not implemented
            raise AssertionError

        print(model)

        # Define randomized search and fit model
        rs = RandomizedSearchCV(estimator=est,
                                param_distributions=parameters,
                                scoring="f1_macro",
                                cv=5,
                                n_jobs=-2,
                                verbose=1,
                                n_iter=100,
                                refit=True)

        rs.fit(train_data_embeddings, train_data["label"].astype(int))

        # Generate list for scores
        scores_per_lang = {}

        ## Run predictions
        # Run predictions for cooc feature
        if vectorization == COOC:
            for lang in test_langs:
                # Subset the test data
                test_data_lang = test_data.loc[test_data['lang_1'] == lang]

                # Retrieve representations & word co-occurrence vectors for test set
                cv_content1_test = cv.transform(
                    test_data_lang['content_1']).toarray()
                cv_content2_test = cv.transform(
                    test_data_lang['content_2']).toarray()
                test_data_embeddings_lang = np.multiply(
                    cv_content1_test, cv_content2_test)

                # Prediction and computation of metrics to measure performance of model
                pred = rs.best_estimator_.predict(test_data_embeddings_lang)
                scores_per_lang[lang] = compute_metrics({
                    "labels":
                    test_data_lang["label"],
                    "predictions":
                    pred
                }).get("f1")
                output_and_store_results(setting_data, settings_name, category,
                                         train_langs_str, lang,
                                         scores_per_lang[lang], "",
                                         str(rs.best_params_), input_path,
                                         pred)

        # Run predictions for Magellan features
        elif vectorization == MAGELLAN:
            for lang in test_langs:
                # Subset the test data
                test_data_lang = test_data.loc[test_data['lang_1'] == lang]

                # Retrieve tables A,B,G
                A, B, G = prep_data_pair_mallegan(test_data_lang,
                                                  use_description)

                # Generate features
                # feature_table = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
                H = em.extract_feature_vecs(G,
                                            feature_table=feature_table,
                                            attrs_before=attrs_from_table,
                                            attrs_after='label',
                                            show_progress=False)

                # Replace NA values
                H.fillna(-1, inplace=True)

                # Retrieve features
                test_data_embeddings_lang = H.drop(
                    columns=attrs_to_be_excluded)

                # Normalize Features
                test_data_embeddings_lang = normalizer.transform(
                    test_data_embeddings_lang)

                # Prediction and computation of metrics to measure performance of model
                pred = rs.best_estimator_.predict(test_data_embeddings_lang)
                scores_per_lang[lang] = compute_metrics({
                    "labels":
                    test_data_lang["label"],
                    "predictions":
                    pred
                }).get("f1")
                output_and_store_results(setting_data, settings_name, category,
                                         train_langs_str, lang,
                                         scores_per_lang[lang], "",
                                         str(rs.best_params_), input_path,
                                         pred)