import pandas as pd
import torch
from scripts.utils import load_embeddings, remove_outliers
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load TER scores (one score per line, no header)
ter = pd.read_csv("data/en-fr-100/en-fr-100-mt_score.txt", header=None, names=['score'])

# Load XLM sentence embeddings
xlm_path = "data/en-fr-100/xlm-embeddings/"
features = load_embeddings(xlm_path)

# Optionally add the non-zero Biber dimensions as extra features
use_biber = False
if use_biber:
    biber = pd.read_csv("data/en-fr-100/en-fr-100.dim", sep='\t')
    # Drop dimensions that are zero for more than half of the sentences
    drop_cols = biber.columns[(biber == 0).sum() > 0.5 * biber.shape[0]]
    biber.drop(drop_cols, axis=1, inplace=True)
    features = features.merge(biber, left_index=True, right_index=True)

# Join scores and features into a single dataframe
df = ter.merge(features, left_index=True, right_index=True)

# Remove outliers outside the 0.05-0.95 quantile range of the score
df = remove_outliers(df, 'score', lq=0.05, uq=0.95)
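
# For reference, a minimal sketch of what the two helpers imported from
# scripts/utils.py are assumed to do (hypothetical implementations; the
# real versions in the repo may differ):
#
#   def load_embeddings(path):
#       # Load one saved tensor per sentence from `path` and stack the
#       # flattened vectors into a dataframe, one row per sentence.
#       import os
#       files = sorted(os.listdir(path))
#       rows = [torch.load(os.path.join(path, f)).flatten().tolist() for f in files]
#       return pd.DataFrame(rows)
#
#   def remove_outliers(df, col, lq=0.05, uq=0.95):
#       # Keep only rows whose `col` value lies within the [lq, uq]
#       # quantile range.
#       lo, hi = df[col].quantile(lq), df[col].quantile(uq)
#       return df[(df[col] >= lo) & (df[col] <= hi)]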
def sk_classification(rm_out=False):
    """Classify sentences into good/average/bad translations from XLM embeddings."""
    # Load TER scores (one score per line, no header)
    mt_scores = pd.read_csv("data/en-fr-100/en-fr-100-mt_score.txt", header=None, names=['score'])

    # Load XLM embeddings and join them with the scores into a single dataframe
    emb_path = "data/en-fr-100/xlm-embeddings/"
    features = load_embeddings(emb_path)
    df = pd.concat([mt_scores, features], axis=1)

    # Optionally remove outliers (remove_outliers is imported at module level)
    if rm_out:
        df = remove_outliers(df, 'score', lq=0.05, uq=0.95)
        print("data points below 0.05 or above 0.95 quantiles removed")

    # Bin scores into three classes depending on their percentile
    df["class"] = 1  # average translation
    df.loc[df["score"] >= df["score"].quantile(0.67), "class"] = 0  # good translation
    df.loc[df["score"] <= df["score"].quantile(0.33), "class"] = 2  # bad translation

    # Split data into training and test sets; fix random_state for reproducibility
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns=["score", "class"]), df["class"],
        test_size=0.2, random_state=42)

    print("running k-neighbors classifier...")
    # Sweep the number of neighbours and record the test accuracy for each
    results_dict = {}
    for n in range(3, 32):
        neigh = KNeighborsClassifier(n_neighbors=n, algorithm='auto')
        neigh.fit(X_train, y_train)
        results_dict[n] = neigh.score(X_test, y_test)

    results_df = pd.DataFrame.from_dict(results_dict, orient='index', columns=['kn-score'])
    max_score = results_df['kn-score'].max()
    print("maximum score obtained: %0.2f%%" % (max_score * 100))

    # Re-fit and report every n that achieved the maximum score
    max_list = results_df.loc[results_df['kn-score'] == max_score]
    for n in max_list.index:
        neigh = KNeighborsClassifier(n_neighbors=n, algorithm='auto')
        neigh.fit(X_train, y_train)
        print("\nnumber of neighbours: %d" % n)

        # Predict labels and class probabilities for the test data
        y_pred = neigh.predict(X_test)
        y_pred_prob = pd.DataFrame(neigh.predict_proba(X_test)).round(2)
        y_pred_prob.columns = ["prob 0", "prob 1", "prob 2"]

        # Per-class evaluation: of the sentences predicted as each class,
        # report the fraction labelled correctly (i.e. per-class precision)
        diff = {
            "good translation": 0,
            "average translation": 1,
            "bad translation": 2
        }
        y_res = pd.DataFrame(y_pred, columns=['y_pred'])
        y_res['y_test'] = y_test.values
        for key in diff.keys():
            key_val = y_res.loc[y_res["y_pred"] == diff[key]]
            print("Accuracy for %s: %0.2f%%" % (
                key, accuracy_score(key_val["y_test"], key_val["y_pred"]) * 100))
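
# Example entry point so the classifier can be run as a script. The
# rm_out=True choice here is illustrative; pass False to keep all scores.
if __name__ == "__main__":
    sk_classification(rm_out=True)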