def evaluate(epoch):
    model.eval()

    def sub_eval(data_loader):
        gts = []
        preds = []
        for idx, (data, target) in enumerate(data_loader):
            data = data.to(device)
            pred = model.predict(data)
            preds.append(pred.cpu().numpy())
            gts.append(target.numpy())
        preds = np.concatenate(preds, axis=0)
        gts = np.concatenate(gts)
        return preds, gts

    # eval on train
    preds_train, gts_train = sub_eval(train_loader)
    metrics_train = compute_metrics(preds_train, gts_train, num_class)
    logger.info('Train metrics: acc: {:.3%}, class-avg-acc: {:.3%}'.format(
        metrics_train[0], metrics_train[1]))

    # eval on test
    preds_test, gts_test = sub_eval(test_loader)
    metrics_test = compute_metrics(preds_test, gts_test, num_class)
    logger.info('Test metrics: acc: {:.3%}, class-avg-acc: {:.3%}'.format(
        metrics_test[0], metrics_test[1]))

    vis_all_metrics(metrics_train, metrics_test, epoch, np.arange(num_class), False)
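# A minimal sketch of the compute_metrics helper assumed above (an assumption, not
# the project's actual util): index 0 is the overall accuracy, index 1 is the
# class-averaged accuracy, and the per-class accuracies are kept for visualisation.
import numpy as np

def compute_metrics(preds, gts, num_class):
    preds = np.asarray(preds)
    gts = np.asarray(gts)
    acc = float((preds == gts).mean())
    per_class_acc = np.array([
        float((preds[gts == c] == c).mean()) if np.any(gts == c) else 0.0
        for c in range(num_class)
    ])
    return acc, float(per_class_acc.mean()), per_class_acc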
def test_compute_metrics(self):
    """
    Test compute_metrics using dummy metrics m1, m2.
    m1 rates both objects equally; m2 gives a higher rating to the second object.
    """
    m1 = StationMetric(name="m1", weight=0.3, function_name="metrics.tests.dummy_m1")
    m1.save()
    m2 = ModelMetric(name="m2", weight=0.5, function_name="metrics.tests.dummy_m2")
    m2.save()

    o1 = "1st_object"
    o2 = "2nd_object"

    # compute m1, m2
    rating = compute_metrics([o1, o2], [m1, m2], as_dict=True)
    self.assertTrue(rating[o1] == 1 * 0.3 + 1 * 0.5
                    and rating[o2] == 1 * 0.3 + 2 * 0.5)

    # m2 raises an error and is skipped, so only m1 is computed
    rating = compute_metrics([o1, o2], [m1, m2], as_dict=True, raise_error=True)
    self.assertTrue(rating[o1] == rating[o2] == 1 * 0.3 + 0 * 0.5)
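# Hypothetical dummy metric functions (not the real metrics.tests helpers), chosen
# so that a plain weighted sum  rating(o) = sum(weight_i * m_i(o))  reproduces the
# first assertion above: 1*0.3 + 1*0.5 for o1 and 1*0.3 + 2*0.5 for o2.
def dummy_m1(obj):
    # Rates every object the same.
    return 1

def dummy_m2(obj):
    # Rates the second object higher than the first.
    return 2 if obj == "2nd_object" else 1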
import json
import os

import numpy as np

import TfidfLsa
from util import compute_metrics, cmp_gt

data_path = "data/heybox/input_data/data.json"
threshold = 0.6  # 0.6 | 0.8
is_cmp = True
lsa_n_components = 200
gt_path = "data/heybox/ground_truth/"
out_path = "data/heybox/cmp/tfidf_lsa_{}/".format(threshold)

np.random.seed(1)

with open(data_path, "r") as f:
    query_data = json.load(f)

deduplication = TfidfLsa.Deduplication(threshold=threshold)
bag, query_items = deduplication.deduplicate(query_data, lsa_n_components)

# test on ground truth
gt_dirs = [d for d in os.listdir(gt_path) if '.txt' in d]
print("gt classes: {}".format(len(gt_dirs)))
print("pred classes: {}".format(len(bag)))
ave_p, ave_r, f1 = compute_metrics(gt_dirs, gt_path, bag, query_items)
print("recall: %f, precision: %f, f1-score: %f" % (ave_r, ave_p, f1))

# output pred results that differ from the ground truth
if is_cmp:
    cmp_gt(bag, gt_dirs, out_path, gt_path, query_items)
def evaluate(Qtest, DTest, **args):
    '''
    @input  subset of topics Qtest ⊆ Q, testing document collection DTest,
            judgments RTest, and arguments for the classification and retrieval
            modules
    @behavior  evaluates the behavior of the IR system in the presence and absence
               of relevance feedback. In the presence of relevance feedback,
               training and testing functions are called for each topic in Qtest
               for a more comprehensive assessment
    @output  performance statistics regarding the underlying classification system
             and the behavior of the aided IR system
    '''
    RTrain = args.get('RTrain')
    RTest = args.get('RTest')
    trainX = args.get('trainX')
    testX = args.get('testX')
    classifier_type = args.get('classifier_type')
    hyper_parameters = args.get('hyper_parameters')

    Model = dict()

    # metrics
    aidedRanked = dict()
    aidedNonRanked = dict()
    nonAided = dict()
    classifier_metrics = dict()

    # k used for retrieval
    k = args.get('k')
    ranking_type = args.get('ranking_type')

    for topic in Qtest:
        relevant, nonRelevant = getRelevantNonRelevant(topic)

        ''' Non-Aided IR '''
        ranked_docs_names = [
            name for score, name in sorted(zip(testX[topic], DTest[topic]),
                                           key=lambda pair: RRF(pair[0]),
                                           reverse=True)
        ]
        precision, recall, fscoreVal, precision_recall_curve, bpref, avg_prec = compute_metrics(
            ranked_docs_names, relevant, nonRelevant, k)
        nonAided[topic] = [
            precision, recall, fscoreVal, precision_recall_curve, bpref, avg_prec
        ]

        ''' Train the model '''
        try:
            if hyper_parameters:
                Model[topic] = training(topic,
                                        trainX,
                                        RTrain,
                                        classifier_type=classifier_type,
                                        hyper_paremeters=hyper_parameters[topic])
            else:
                print(hyper_parameters)
                Model[topic] = training(topic,
                                        trainX,
                                        RTrain,
                                        classifier_type=classifier_type,
                                        hyper_paremeters=None)

            aidedRanked[topic], aidedNonRanked[topic], classifier_metrics[topic] = \
                Model[topic].evaluate(topic,
                                      DTest,
                                      RTest,
                                      k=k,
                                      testX=testX,
                                      ranking_type=ranking_type,
                                      relevant=relevant,
                                      nonRelevant=nonRelevant)
        except ValueError:
            print("For topic ", topic,
                  "the classifier needs samples of at least 2 classes in the data",
                  "but the data contains only one class: 1")
            # values from Non-Aided
            aidedNonRanked[topic] = nonAided[topic]
            aidedRanked[topic] = nonAided[topic]
            classifier_metrics[topic] = [precision, recall, fscoreVal, avg_prec]

    ''' Calculate average values for the metrics '''
    nonAided['MAP'] = mean([nonAided[topic][5] for topic in nonAided])
    nonAided['Mean BPREF'] = mean(
        [nonAided[topic][4] for topic in nonAided if topic != 'MAP'])
    aidedRanked['MAP'] = mean([aidedRanked[topic][5] for topic in aidedRanked])
    aidedRanked['Mean BPREF'] = mean(
        [aidedRanked[topic][4] for topic in aidedRanked if topic != 'MAP'])
    aidedNonRanked['MAP'] = mean(
        [aidedNonRanked[topic][5] for topic in aidedNonRanked])
    aidedNonRanked['Mean BPREF'] = mean(
        [aidedNonRanked[topic][4] for topic in aidedNonRanked if topic != 'MAP'])
    classifier_metrics['MAP'] = mean(
        [classifier_metrics[topic][3] for topic in classifier_metrics])

    return aidedRanked, aidedNonRanked, nonAided, classifier_metrics
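# A minimal sketch (an assumption, not the project's actual util) of a
# compute_metrics with the signature used above: it scores the top-k ranked
# document names against the relevant / non-relevant judgment sets and returns
# precision, recall, F1, a precision-recall curve, BPREF and average precision.
def compute_metrics(ranked_names, relevant, nonRelevant, k):
    relevant, nonRelevant = set(relevant), set(nonRelevant)
    top_k = ranked_names[:k]
    hits = [name in relevant for name in top_k]
    retrieved_relevant = sum(hits)

    precision = retrieved_relevant / max(len(top_k), 1)
    recall = retrieved_relevant / max(len(relevant), 1)
    fscore = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0

    # Precision-recall points (precision@i, recall@i) after each retrieved document.
    curve, seen = [], 0
    for i, hit in enumerate(hits, start=1):
        seen += hit
        curve.append((seen / i, seen / max(len(relevant), 1)))

    # Average precision: precision@i summed over ranks holding a relevant document,
    # divided by the total number of relevant documents.
    avg_prec = sum(p for (p, _), hit in zip(curve, hits) if hit) / max(len(relevant), 1)

    # BPREF: each retrieved relevant document is penalised by the fraction of judged
    # non-relevant documents ranked above it (capped at the number of relevant docs).
    bpref, non_rel_above = 0.0, 0
    for name in top_k:
        if name in relevant:
            bpref += 1 - min(non_rel_above, len(relevant)) / max(len(relevant), 1)
        elif name in nonRelevant:
            non_rel_above += 1
    bpref /= max(len(relevant), 1)

    return precision, recall, fscore, curve, bpref, avg_prec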
#!/usr/bin/env python
import argparse
import codecs

from sklearn.externals import joblib

from util import build_features, test_svm, compute_metrics, test_words

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train NER on SVM')
    parser.add_argument('-t', '--test', default='../data/test-raw/',
                        help='test file')
    parser.add_argument('-r', '--test_ref', default='../data/test/',
                        help='reference labels test file')
    parser.add_argument('-m', '--model',
                        default='../models/trained_full_sfx_pfx_0506_2040.pkl',
                        help='Model Trained with SVM')
    opts = parser.parse_args()

    labels_test = []
    predict_test = []
    features_test = []

    clf = joblib.load(opts.model)

    print "Building Features with Test Set..."
    build_features(opts.test, opts.test_ref, features_test, labels_test, 1)

    print "Predict with Trained Model..."
    test_svm(features_test, predict_test, clf)

    print "Compute Metrics..."
    compute_metrics(predict_test, labels_test)

    with codecs.open('../data/ne_words.hi', encoding="utf8", mode="wb") as o_file:
        for ind in range(len(predict_test[0])):
            if predict_test[0][ind] == 1:  # or ind in test_words_in_train:
                o_file.write(test_words[ind])
                o_file.write('\n')
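# A hypothetical sketch of the compute_metrics call above (the real util helper is
# not shown here), assuming both arguments are lists of label sequences: it flattens
# the predicted and reference labels and prints precision, recall and F1 for the
# named-entity class.
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(predict_test, labels_test):
    # Flatten the per-sentence label sequences before scoring.
    y_pred = [y for seq in predict_test for y in seq]
    y_true = [y for seq in labels_test for y in seq]
    p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    print("precision: %.4f, recall: %.4f, f1: %.4f" % (p, r, f))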
def evaluate(self, topic, DTest, RTest, **kwargs):
    k = kwargs.get('k')
    testX = kwargs.get('testX')
    ranking_type = kwargs.get('ranking_type')
    relevant = kwargs.get('relevant')
    nonRelevant = kwargs.get('nonRelevant')

    ''' Evaluating the classifier '''
    docs = np.array(testX[topic])
    feedback = RTest[topic]
    y_scores = self.clf.predict_proba(docs)
    y_pred = self.clf.predict(docs)
    precision, recall, fscore, true_sum = precision_recall_fscore_support(
        feedback, y_pred, average='macro', zero_division=1)
    avg_prec_score = average_precision_score(feedback, y_scores[:, 1])
    classifier_metrics = [precision, recall, fscore, avg_prec_score]

    ''' Binary retrieval '''
    scores_names = zip(y_scores, DTest[topic])
    positive_class_predicted = [
        doc for doc in scores_names if doc[0][1] > 0.5
    ]
    aided_non_ranked_docs_names = [
        doc[1] for doc in positive_class_predicted
    ]

    ''' Evaluating binary retrieval '''
    precision, recall, fscoreVal, precision_recall_curve, bpref, avg_prec = compute_metrics(
        aided_non_ranked_docs_names, relevant, nonRelevant, k)
    aidedNonRanked = [
        precision, recall, fscoreVal, precision_recall_curve, bpref, avg_prec
    ]

    ''' Extension towards ranking '''
    if ranking_type == 'proba':
        # sort according to the predicted probabilities
        aided_ranked_docs_names = [
            x for _, x in sorted(zip(y_scores, DTest[topic]),
                                 key=lambda pair: pair[0][1],
                                 reverse=True)
        ]
    else:
        # sort according to the score of the docs classified as positive
        aided_ranked_docs_names = [
            doc[1] for doc in sorted(positive_class_predicted,
                                     key=lambda x: RRF(x[0]),
                                     reverse=True)[:k]
        ]

    ''' Evaluating Aided IR '''
    precision, recall, fscoreVal, precision_recall_curve, bpref, avg_prec = compute_metrics(
        aided_ranked_docs_names, relevant, nonRelevant, k)
    aidedRanked = [
        precision, recall, fscoreVal, precision_recall_curve, bpref, avg_prec
    ]

    return aidedRanked, aidedNonRanked, classifier_metrics
def evaluate_epoch(data_iter, models, num_personas, gradient_accumulation_steps,
                   device, dataset, epoch, apply_interaction, matching_method,
                   aggregation_method):
    epoch_loss = []
    ok = 0
    total = 0
    recall = []
    MRR = []
    print_every = 1000

    if len(models) == 1:
        if num_personas == 0:
            context_model, response_model = models[0], models[0]
        else:
            context_model, response_model, persona_model = models[0], models[0], models[0]
    if len(models) == 2:
        context_model, response_model = models
    if len(models) == 3:
        context_model, response_model, persona_model = models

    for batch_idx, batch in enumerate(data_iter):
        batch = tuple(t.to(device) for t in batch)
        batch_y = {"input_ids": batch[3], "attention_mask": batch[4], "token_type_ids": batch[5]}
        has_persona = len(batch) > 6

        # get context embeddings in chunks due to memory constraint
        batch_size = batch[0].shape[0]
        chunk_size = 20
        num_chunks = math.ceil(batch_size / chunk_size)

        if apply_interaction:
            # batch_x_mask = batch[0].ne(0).float()
            # batch_y_mask = batch[3].ne(0).float()
            batch_x_mask = batch[1].float()
            batch_y_mask = batch[4].float()

            batch_x_emb = []
            batch_x_pooled_emb = []
            with torch.no_grad():
                for i in range(num_chunks):
                    mini_batch_x = {
                        "input_ids": batch[0][i*chunk_size: (i+1)*chunk_size],
                        "attention_mask": batch[1][i*chunk_size: (i+1)*chunk_size],
                        "token_type_ids": batch[2][i*chunk_size: (i+1)*chunk_size]
                    }
                    mini_output_x = context_model(**mini_batch_x)
                    batch_x_emb.append(mini_output_x[0])  # [(chunk_size, seq_len, emb_size), ...]
                    batch_x_pooled_emb.append(mini_output_x[1])
                batch_x_emb = torch.cat(batch_x_emb, dim=0)  # (batch_size, seq_len, emb_size)
                batch_x_pooled_emb = torch.cat(batch_x_pooled_emb, dim=0)
                emb_size = batch_x_emb.shape[-1]

            if has_persona:
                # batch_persona_mask = batch[6].ne(0).float()
                batch_persona_mask = batch[7].float()
                batch_persona_emb = []
                batch_persona_pooled_emb = []
                with torch.no_grad():
                    for i in range(num_chunks):
                        mini_batch_persona = {
                            "input_ids": batch[6][i*chunk_size: (i+1)*chunk_size],
                            "attention_mask": batch[7][i*chunk_size: (i+1)*chunk_size],
                            "token_type_ids": batch[8][i*chunk_size: (i+1)*chunk_size]
                        }
                        mini_output_persona = persona_model(**mini_batch_persona)
                        # [(chunk_size, emb_size), ...]
                        batch_persona_emb.append(mini_output_persona[0])
                        batch_persona_pooled_emb.append(mini_output_persona[1])
                    batch_persona_emb = torch.cat(batch_persona_emb, dim=0)
                    batch_persona_pooled_emb = torch.cat(batch_persona_pooled_emb, dim=0)

            with torch.no_grad():
                output_y = response_model(**batch_y)
                batch_y_emb = output_y[0]
            batch_size, sent_len, emb_size = batch_y_emb.shape

            # interaction
            # context-response attention
            num_candidates = batch_size
            with torch.no_grad():
                # evaluate per example
                logits = []
                for i in range(batch_size):
                    x_emb = batch_x_emb[i:i+1].repeat_interleave(num_candidates, dim=0)  # (num_candidates, context_len, emb_size)
                    x_mask = batch_x_mask[i:i+1].repeat_interleave(num_candidates, dim=0)  # (batch_size*num_candidates, context_len)
                    persona_emb, persona_mask = None, None
                    if has_persona:
                        persona_emb = batch_persona_emb[i:i+1].repeat_interleave(num_candidates, dim=0)
                        persona_mask = batch_persona_mask[i:i+1].repeat_interleave(num_candidates, dim=0)

                    logits_single = fuse(context_model, matching_method, aggregation_method,
                                         x_emb, batch_y_emb, persona_emb, x_mask,
                                         batch_y_mask, persona_mask, 1,
                                         num_candidates).reshape(-1)
                    logits.append(logits_single)
                logits = torch.stack(logits, dim=0)

                # compute loss
                targets = torch.arange(batch_size, dtype=torch.long, device=batch[0].device)
                loss = F.cross_entropy(logits, targets)
                num_ok = (targets.long() == logits.float().argmax(dim=1)).sum()
                valid_recall, valid_MRR = compute_metrics_from_logits(logits, targets)
        else:
            batch_x_emb = []
            with torch.no_grad():
                for i in range(num_chunks):
                    mini_batch_x = {
                        "input_ids": batch[0][i*chunk_size: (i+1)*chunk_size],
                        "attention_mask": batch[1][i*chunk_size: (i+1)*chunk_size],
                        "token_type_ids": batch[2][i*chunk_size: (i+1)*chunk_size]
                    }
                    mini_output_x = context_model(**mini_batch_x)
                    batch_x_emb.append(mini_output_x[0].mean(dim=1))  # [(chunk_size, emb_size), ...]
                batch_x_emb = torch.cat(batch_x_emb, dim=0)  # (batch_size, emb_size)
                emb_size = batch_x_emb.shape[-1]

            if has_persona:
                batch_persona_emb = []
                with torch.no_grad():
                    for i in range(num_chunks):
                        mini_batch_persona = {
                            "input_ids": batch[6][i*chunk_size: (i+1)*chunk_size],
                            "attention_mask": batch[7][i*chunk_size: (i+1)*chunk_size],
                            "token_type_ids": batch[8][i*chunk_size: (i+1)*chunk_size]
                        }
                        mini_output_persona = persona_model(**mini_batch_persona)
                        # [(chunk_size, emb_size), ...]
                        batch_persona_emb.append(mini_output_persona[0].mean(dim=1))

            with torch.no_grad():
                if has_persona:
                    batch_persona_emb = torch.cat(batch_persona_emb, dim=0)
                    batch_x_emb = (batch_x_emb + batch_persona_emb) / 2

                output_y = response_model(**batch_y)
                batch_y_emb = output_y[0].mean(dim=1)

                # compute loss
                loss, num_ok = dot_product_loss(batch_x_emb, batch_y_emb)
                valid_recall, valid_MRR = compute_metrics(batch_x_emb, batch_y_emb)

        ok += num_ok.item()
        total += batch[0].shape[0]

        # compute valid recall
        recall.append(valid_recall)
        MRR.append(valid_MRR)

        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps
        epoch_loss.append(loss.item())

        if batch_idx % print_every == 0:
            cprint("loss: ", np.mean(epoch_loss[-print_every:]))
            cprint("valid recall: ", np.mean(recall[-print_every:], axis=0))
            cprint("valid MRR: ", np.mean(MRR[-print_every:], axis=0))

    acc = ok / total
    # compute recall for validation dataset
    recall = np.mean(recall, axis=0)
    MRR = np.mean(MRR)
    return np.mean(epoch_loss), (acc, recall, MRR)
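# A minimal sketch (not necessarily this repository's implementation) of the two
# retrieval-metric helpers used above. Both assume the i-th context should match
# the i-th candidate response, i.e. the gold label for row i of the score matrix
# is column i; the recall cutoffs ks are an assumed default.
import torch

def compute_metrics_from_logits(logits, targets, ks=(1, 5, 10)):
    # logits: (batch, num_candidates); targets: (batch,) gold column indices.
    gold_scores = logits.gather(1, targets.unsqueeze(1))
    ranks = (logits > gold_scores).sum(dim=1) + 1
    recall = [(ranks <= k).float().mean().item() for k in ks]
    mrr = (1.0 / ranks.float()).mean().item()
    return recall, mrr

def compute_metrics(x_emb, y_emb, ks=(1, 5, 10)):
    # Dot-product scores between every context and every candidate response.
    logits = x_emb @ y_emb.t()
    targets = torch.arange(logits.shape[0], device=logits.device)
    return compute_metrics_from_logits(logits, targets, ks)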
def run_baseline_multi(input_path: str, setting_keys: List[str] = None):
    # Read settings file
    with open(f'{input_path}') as file:
        settings = json.load(file)

    for setting_key, settings_data in settings.items():
        # Only run the setting if its key is in the list of settings or no setting_keys are provided
        if setting_keys is None:
            pass
        elif setting_keys is not None and setting_key not in setting_keys:
            continue

        # Get name of settings
        settings_name = create_config_key(settings_data)

        # Get the relevant data from the settings
        model = settings_data.get("model")
        vectorization = settings_data.get("vectorization")
        use_description = settings_data.get("use_description")
        train_langs = settings_data.get("train_lang")
        test_langs = settings_data.get("eval_lang")
        category = settings_data.get("category")

        # Create a string of the train languages
        train_langs_str = ", ".join(train_langs)

        # Process the categories separately
        dataset_p = pathlib.Path(input_path).parent.joinpath("datasets")
        train_data_p = dataset_p.joinpath(f'multi_class_train_set_{category}.csv')
        test_data_p = dataset_p.joinpath(f'multi_class_test_set_{category}.csv')

        # Read the data
        train_data = pd.read_csv(train_data_p)
        test_data = pd.read_csv(test_data_p)

        # Filter the train data
        train_data = train_data.loc[train_data["lang"].isin(train_langs)]

        # Prepare the train and test data for the experiments and get the mapping of the labels
        train_data, test_data, label_dict_inv = prep_data_multi(
            train_data, test_data, use_description)

        # Compute the feature embedding
        if vectorization == BINARY:
            vectorizer = CountVectorizer(analyzer="word",
                                         encoding='utf-8',
                                         tokenizer=None,
                                         preprocessor=None,
                                         stop_words=None,
                                         ngram_range=(1, 2),
                                         max_features=5000,
                                         binary=True)
            train_data_embeddings = vectorizer.fit_transform(
                train_data['content']).toarray()
        elif vectorization == TFIDF:
            vectorizer = TfidfVectorizer()
            train_data_embeddings = vectorizer.fit_transform(
                train_data['content']).toarray()
        else:
            # Other vectorizations are not implemented
            raise AssertionError

        # Fit the models
        if model == LOGIT:
            est = LogisticRegression()
            # Description needs more time to converge
            parameters = {
                'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'class_weight': ['balanced'],
                'max_iter': [800],
                'n_jobs': [-2]
            }
        elif model == RAFO:
            est = RandomForestClassifier()
            parameters = {
                'n_estimators': [100],
                'max_features': ['sqrt', 'log2', None],
                'max_depth': [2, 4, 7, 10],
                'min_samples_split': [2, 5, 10, 20],
                'min_samples_leaf': [1, 2, 4, 8],
                'class_weight': ['balanced_subsample'],
                'n_jobs': [-2]
            }
        elif model == SVM:
            est = LinearSVC()
            parameters = {
                'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'class_weight': ['balanced']
            }
        else:
            # Other models are not implemented
            raise AssertionError
        print(model)

        # Define randomized search and fit the model
        rs = RandomizedSearchCV(estimator=est,
                                param_distributions=parameters,
                                scoring="f1_macro",
                                cv=5,
                                n_jobs=-2,
                                verbose=1,
                                n_iter=100,
                                refit=True)
        rs.fit(train_data_embeddings, train_data["label"].astype(int))

        # Run predictions
        scores_per_lang = {}
        for lang in test_langs:
            # Subset the test data
            test_data_lang = test_data.loc[test_data['lang'] == lang]

            # Retrieve representations & word co-occurrence vectors for the test set
            test_data_embeddings_lang = vectorizer.transform(
                test_data_lang['content']).toarray()

            # Prediction and computation of metrics to measure performance of the model
            pred = rs.best_estimator_.predict(test_data_embeddings_lang)

            # Map the predictions back to cluster ids
            pred_cl_id = np.array([label_dict_inv[x] for x in pred])

            # Map the true labels back to cluster ids
            true_cl_id = test_data_lang["old_label_id"].to_numpy()

            scores_per_lang[lang] = compute_metrics({
                "labels": true_cl_id,
                "predictions": pred_cl_id
            }).get("f1")

            output_and_store_results(settings_data=settings_data,
                                     setting_key=settings_name,
                                     category=category,
                                     train_langs_str=train_langs_str,
                                     lang=lang,
                                     result=scores_per_lang[lang],
                                     all_scores="",
                                     hyperparameters=[rs.best_params_],
                                     input_path=input_path,
                                     predictions=pred_cl_id)
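# A minimal sketch (an assumption, not the project's actual util) of the dict-based
# compute_metrics used above: it takes {"labels": ..., "predictions": ...} and
# returns at least a macro F1 under the "f1" key.
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred: dict) -> dict:
    labels = eval_pred["labels"]
    predictions = eval_pred["predictions"]
    return {
        "f1": f1_score(labels, predictions, average="macro"),
        "accuracy": accuracy_score(labels, predictions),
    }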
def run_baseline_pair(input_path: str, setting_keys: List[str] = None):
    # Read settings file
    with open(f'{input_path}') as file:
        settings = json.load(file)

    for setting_key, setting_data in settings.items():
        # Only run the setting if its key is in the list of settings or no setting_keys are provided
        if setting_keys is None:
            pass
        elif setting_keys is not None and setting_key not in setting_keys:
            continue

        # Get name of settings
        settings_name = create_config_key(setting_data)

        # Get the relevant data from the settings
        model = setting_data.get("model")
        vectorization = setting_data.get("vectorization")
        dataset_size = setting_data.get("dataset_size")
        use_description = setting_data.get("use_description")
        train_langs = setting_data.get("train_lang")
        test_langs = setting_data.get("eval_lang")
        category = setting_data.get("category")

        # Create a string of the train languages
        train_langs_str = ", ".join(train_langs)

        # Process the categories separately
        dataset_p = pathlib.Path(input_path).parent.joinpath("datasets")
        if dataset_size == SMALL:
            train_data_p = dataset_p.joinpath(f'pairwise_train_set_{category}_{SMALL}.csv')
            test_data_p = dataset_p.joinpath(f'pairwise_test_set_{category}.csv')
        elif dataset_size == MEDIUM:
            train_data_p = dataset_p.joinpath(f'pairwise_train_set_{category}_{MEDIUM}.csv')
            test_data_p = dataset_p.joinpath(f'pairwise_test_set_{category}.csv')
        elif dataset_size == LARGE:
            train_data_p = dataset_p.joinpath(f'pairwise_train_set_{category}_{LARGE}.csv')
            test_data_p = dataset_p.joinpath(f'pairwise_test_set_{category}.csv')
        elif dataset_size == XLARGE:
            train_data_p = dataset_p.joinpath(f'pairwise_train_set_{category}_{XLARGE}.csv')
            test_data_p = dataset_p.joinpath(f'pairwise_test_set_{category}.csv')

        # Read the data
        train_data = pd.read_csv(train_data_p)
        test_data = pd.read_csv(test_data_p)

        # Filter the train data
        train_data = train_data.loc[train_data["lang_1"].isin(train_langs)]

        # Prepare the train and test data for the experiments and get the mapping of the labels
        train_data, test_data = prep_data_pair(train_data, test_data, use_description)

        ## Generate features
        if vectorization == COOC:
            # Generate co-occurrence features
            contents = train_data['content_1'].append(train_data['content_2'])
            contents = contents.drop_duplicates()

            # Initialize vectorizer
            cv = CountVectorizer(binary=True,
                                 analyzer='word',
                                 encoding='utf-8',
                                 max_features=5000)
            # Fit vectorizer
            cv.fit(contents)

            # Retrieve representations & word co-occurrence vectors for the train set
            cv_content1_train = cv.transform(train_data['content_1']).toarray()
            cv_content2_train = cv.transform(train_data['content_2']).toarray()
            train_data_embeddings = np.multiply(cv_content1_train, cv_content2_train)
        elif vectorization == MAGELLAN:
            # Retrieve tables A, B, G
            A, B, G = prep_data_pair_mallegan(train_data, use_description)

            # Generate features automatically
            feature_table = em.get_features_for_matching(
                A, B, validate_inferred_attr_types=False)

            # Select the attrs. to be included in the feature vector table
            # Title refers to either the title or the concatenated title and description
            attrs_from_table = ['title_left', 'title_right']

            # Convert the labeled data to feature vectors using the feature table
            H = em.extract_feature_vecs(G,
                                        feature_table=feature_table,
                                        attrs_before=attrs_from_table,
                                        attrs_after='label',
                                        show_progress=False)

            # Replace NA values
            H.fillna(-1, inplace=True)

            # Select attributes which should not be used by the classifier
            attrs_to_be_excluded = []
            attrs_to_be_excluded.extend(['id', 'l_id', 'r_id', 'label'])  # label
            attrs_to_be_excluded.extend(attrs_from_table)

            # Retrieve training data
            train_data_embeddings = H.drop(columns=attrs_to_be_excluded)

            # Normalize features
            normalizer = preprocessing.Normalizer().fit(train_data_embeddings)
            train_data_embeddings = normalizer.transform(train_data_embeddings)
        else:
            # Other vectorizations are not implemented
            raise AssertionError

        # Fit the models
        if model == LOGIT:
            est = LogisticRegression()
            parameters = {
                'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'class_weight': ['balanced'],
                'max_iter': [5000],
                'n_jobs': [-2]
            }
        elif model == RAFO:
            est = RandomForestClassifier()
            parameters = {
                'n_estimators': [100],
                'max_features': ['sqrt', 'log2', None],
                'max_depth': [2, 4, 7, 10],
                'min_samples_split': [2, 5, 10, 20],
                'min_samples_leaf': [1, 2, 4, 8],
                'class_weight': ['balanced_subsample'],
                'n_jobs': [-2]
            }
        elif model == SVM:
            est = LinearSVC()
            parameters = {
                'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'class_weight': ['balanced']
            }
        else:
            # Other models are not implemented
            raise AssertionError
        print(model)

        # Define randomized search and fit the model
        rs = RandomizedSearchCV(estimator=est,
                                param_distributions=parameters,
                                scoring="f1_macro",
                                cv=5,
                                n_jobs=-2,
                                verbose=1,
                                n_iter=100,
                                refit=True)
        rs.fit(train_data_embeddings, train_data["label"].astype(int))

        # Generate dict for scores
        scores_per_lang = {}

        ## Run predictions
        # Run predictions for the co-occurrence features
        if vectorization == COOC:
            for lang in test_langs:
                # Subset the test data
                test_data_lang = test_data.loc[test_data['lang_1'] == lang]

                # Retrieve representations & word co-occurrence vectors for the test set
                cv_content1_test = cv.transform(test_data_lang['content_1']).toarray()
                cv_content2_test = cv.transform(test_data_lang['content_2']).toarray()
                test_data_embeddings_lang = np.multiply(cv_content1_test, cv_content2_test)

                # Prediction and computation of metrics to measure performance of the model
                pred = rs.best_estimator_.predict(test_data_embeddings_lang)
                scores_per_lang[lang] = compute_metrics({
                    "labels": test_data_lang["label"],
                    "predictions": pred
                }).get("f1")

                output_and_store_results(setting_data, settings_name, category,
                                         train_langs_str, lang,
                                         scores_per_lang[lang], "",
                                         str(rs.best_params_), input_path, pred)
        # Run predictions for the Magellan features
        elif vectorization == MAGELLAN:
            for lang in test_langs:
                # Subset the test data
                test_data_lang = test_data.loc[test_data['lang_1'] == lang]

                # Retrieve tables A, B, G
                A, B, G = prep_data_pair_mallegan(test_data_lang, use_description)

                # Generate features
                # feature_table = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
                H = em.extract_feature_vecs(G,
                                            feature_table=feature_table,
                                            attrs_before=attrs_from_table,
                                            attrs_after='label',
                                            show_progress=False)

                # Replace NA values
                H.fillna(-1, inplace=True)

                # Retrieve features
                test_data_embeddings_lang = H.drop(columns=attrs_to_be_excluded)

                # Normalize features
                test_data_embeddings_lang = normalizer.transform(test_data_embeddings_lang)

                # Prediction and computation of metrics to measure performance of the model
                pred = rs.best_estimator_.predict(test_data_embeddings_lang)
                scores_per_lang[lang] = compute_metrics({
                    "labels": test_data_lang["label"],
                    "predictions": pred
                }).get("f1")

                output_and_store_results(setting_data, settings_name, category,
                                         train_langs_str, lang,
                                         scores_per_lang[lang], "",
                                         str(rs.best_params_), input_path, pred)