def evaluate_validation_set(
    model, devset, id2tok, tok2id, target_label, loss_func, final=False
):
    y_true = list()
    y_pred = list()
    total_loss = 0
    for batch, targets, lengths, raw_data in create_dataset(
        devset, id2tok, tok2id, target_label, batch_size=1
    ):
        pred = model(batch.T, lengths)
        loss = loss_func(
            pred.type(torch.FloatTensor), targets.unsqueeze(0).type(torch.FloatTensor)
        )
        y_true += list(targets.int())
        y_pred += [int(pred.float() >= THRESHOLD)]
        total_loss += loss

    acc = accuracy_score(y_true, y_pred)
    print(confusion_matrix(y_true, y_pred))
    if final:
        print(classification_report(y_true, y_pred))
        return classification_report(y_true, y_pred, output_dict=True)
    return (
        total_loss.data.float() / len(devset),
        acc,
        classification_report(y_true, y_pred, output_dict=True),
    )
def evaluate_validation_set(
    model, devset, id2tok, tok2id, label, loss_func, vector_dict, final=False
):
    y_true = list()
    y_pred = list()
    total_loss = 0
    for batch, targets, lengths, raw_data in create_dataset(
        devset, id2tok, tok2id, label, batch_size=1
    ):
        input_vector = torch.FloatTensor(
            WE.get_sentence_vector(tokenized_sentence(raw_data[0]), vector_dict=vector_dict)
        )
        if LABEL_TO_IX[label] in USE_FEATS:
            input_vector = add_features(input_vector, raw_data[0])
        pred = model(input_vector)
        loss = loss_func(pred.type(torch.FloatTensor), targets.type(torch.FloatTensor))
        y_true += list(targets.int())
        y_pred += [int(pred.float() >= THRESHOLD)]
        total_loss += loss

    acc = accuracy_score(y_true, y_pred)
    print(confusion_matrix(y_true, y_pred))
    if final:
        print(classification_report(y_true, y_pred))
        return classification_report(y_true, y_pred, output_dict=True)
    return (
        total_loss.data.float() / len(devset),
        acc,
        classification_report(y_true, y_pred, output_dict=True),
    )
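# NOTE: `add_features` is referenced above and below but not defined in this section.
# The sketch here is a hypothetical reconstruction, assuming (from the plot title
# "Number Count, Year Count" later on) that it appends two hand-crafted counts to the
# GloVe sentence vector, scaled by FEAT_ADD_SOFTENER. The project's real helper may differ.
import re
import torch


def add_features_sketch(vector, raw_sentence, FEAT_ADD_SOFTENER=1.0):
    tokens = raw_sentence.lower().split()
    number_count = sum(1 for tok in tokens if re.fullmatch(r"\d+([.,]\d+)?", tok))
    year_count = sum(1 for tok in tokens if re.fullmatch(r"(19|20)\d{2}", tok))
    extra = torch.FloatTensor([number_count, year_count]) * FEAT_ADD_SOFTENER
    # Concatenate the extra features onto the 300-d sentence vector -> 302-d input.
    return torch.cat([vector.float(), extra])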
val_predictions = lr_embed_clf.predict(X_val_embeds)
print(classification_report(y_val, val_predictions))

## Seeing where no prediction was made
# A row counts as null when no label fires at all.
null_predictions = len([i for i in val_predictions if not np.any(i)])
print(f"{null_predictions} out of {len(y_val)} predictions were null.")

dub_ref_model = lr_embed_clf.estimators_[4]
vocab, id2tok, tok2id = get_vocab(train_dataset)
target_label = "dubious reference"
BATCH_SIZE = 1

pred = []
actual = []
vectors = []
for batch, targets, lengths, raw_data in create_dataset(
    val_dataset, id2tok, tok2id, target_label, batch_size=BATCH_SIZE
):
    actual.append(targets.item())
    pred.append(int(predict(dub_ref_model, raw_data[0])))
    vectors.append(WE.get_sentence_vector(raw_data[0].lower().split(), vector_dict))

print(classification_report(actual, pred))
plot_confusion_matrix(dub_ref_model, vectors, actual)


def analyze_sentence(label_ix, sent, stopwords):
    # Explaining with SHAP
    WE.task_data["fake_news"]["train_text"] = [x.lower().split() for x in raw_X_train]
    WE.task_data["fake_news"]["X_train"] = np.array(X_train_embeds)
    WE.task_data["fake_news"]["clf"] = lr_embed_clf.estimators_[label_ix]
    out = WE.analyze_sentence("fake_news", sent, stopwords=stopwords)
    print("1:")
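# NOTE: `predict` used above is a project helper not shown in this section. A hypothetical
# sketch, assuming it averages GloVe vectors for the raw sentence (as done for `vectors`
# above) and queries the per-label sklearn estimator:
def predict_sketch(estimator, raw_sentence):
    sentence_vector = WE.get_sentence_vector(raw_sentence.lower().split(), vector_dict)
    return estimator.predict([sentence_vector])[0]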
def compare_pca_features(target_label, save=False):
    # Plot 1: plain GloVe sentence vectors, colored by label.
    split_vectors = {0: [], 1: []}
    for batch, targets, lengths, raw_data in create_dataset(
        val_dataset, id2tok, tok2id, target_label, batch_size=BATCH_SIZE
    ):
        tokenized = tokenized_sentence(raw_data[0])
        vector = torch.tensor(WE.get_sentence_vector(tokenized, vector_dict))
        split_vectors[targets.item()].append(vector.tolist())

    rows = [
        {"label": label, "vector": vector}
        for label, vectors in split_vectors.items()
        for vector in vectors
    ]
    df = pd.DataFrame(rows, columns=["label", "vector"])

    pca = PCA(n_components=2)
    pca_vals = pca.fit_transform(np.array(df["vector"].tolist()))
    df["pc1"] = [i[0] for i in pca_vals]
    df["pc2"] = [i[1] for i in pca_vals]

    fig, ax = plt.subplots(figsize=(10, 7))
    sns.scatterplot(x="pc1", y="pc2", hue="label", data=df, s=100)
    ax.set_xticks([])
    ax.set_yticks([])
    fig.suptitle(f'"{target_label.title()}" Glove PCA', fontsize=20)
    if save:
        plt.savefig(
            f"visualizations/{target_label}_glove_plot.png",
            dpi=400,
            bbox_inches="tight",
            pad_inches=0.2,
            facecolor="w",
        )

    # Plot 2: the same vectors with the hand-crafted features appended.
    split_vectors = {0: [], 1: []}
    for batch, targets, lengths, raw_data in create_dataset(
        val_dataset, id2tok, tok2id, target_label, batch_size=BATCH_SIZE
    ):
        tokenized = tokenized_sentence(raw_data[0])
        vector = torch.tensor(WE.get_sentence_vector(tokenized, vector_dict))
        vector = add_features(vector, raw_data[0], FEAT_ADD_SOFTENER=0.3)
        split_vectors[targets.item()].append(vector.tolist())

    rows = [
        {"label": label, "vector": vector}
        for label, vectors in split_vectors.items()
        for vector in vectors
    ]
    df = pd.DataFrame(rows, columns=["label", "vector"])

    pca = PCA(n_components=2)
    pca_vals = pca.fit_transform(np.array(df["vector"].tolist()))
    df["pc1"] = [i[0] for i in pca_vals]
    df["pc2"] = [i[1] for i in pca_vals]

    fig, ax = plt.subplots(figsize=(10, 7))
    sns.scatterplot(x="pc1", y="pc2", hue="label", data=df, s=100)
    ax.set_xticks([])
    ax.set_yticks([])
    fig.suptitle(f'"{target_label.title()}" Glove PCA with Added Features', fontsize=20)
    plt.title("Number Count, Year Count")
    if save:
        plt.savefig(
            f"visualizations/{target_label}_glove_plot_added_features.png",
            dpi=400,
            bbox_inches="tight",
            pad_inches=0.2,
            facecolor="w",
        )

    return pca_vals
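# Example usage (assumes val_dataset, WE, vector_dict, and the other globals above are
# already loaded): generates both scatter plots for one label and optionally saves them
# under visualizations/.
# compare_pca_features("dubious reference", save=True)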
def train(run_test=False):
    results = {}
    vocab, id2tok, tok2id = get_vocab(train_dataset)

    embed_weights = None
    if USE_GLOVE:
        embed_weights = get_embed_weights(vocab, tok2id)
        # from scipy.spatial import distance
        # print(distance.cosine(embed_weights[tok2id['obama']], embed_weights[tok2id['clinton']]))

    model = LSTMClassifier(
        VOCAB_SIZE,
        EMBED_DIM,
        HIDDEN_SIZE,
        bidirectional=False,
        embed_weights=embed_weights,
    )
    loss_func = nn.BCEWithLogitsLoss(pos_weight=torch.FloatTensor([POS_LOSS_WEIGHT]))
    """
    Demo of weights in loss function.
    """
    # loss_func(torch.FloatTensor([0.6]), torch.FloatTensor([1]))
    # loss_func(torch.FloatTensor([0.6]), torch.FloatTensor([0]))

    if glob.glob("models/lstm/*"):
        # Use the full directory name so model ids >= 10 are handled correctly.
        model_id = max(int(os.path.basename(i)) for i in glob.glob("models/lstm/*")) + 1
    else:
        model_id = 1

    optimizer = optim.Adam(model.parameters(), lr=ALPHA)

    all_best_f1 = []
    for label in LABELS:
        logger.green(f"Building classifier for {label}...")
        model.train()
        best_f1 = 0.0
        for epoch in range(NUM_EPOCHS):
            print()
            print(f"Epoch: {epoch}")
            y_true = list()
            y_pred = list()
            total_loss = 0
            for batch, targets, lengths, raw_data in create_dataset(
                train_dataset, id2tok, tok2id, label, batch_size=BATCH_SIZE
            ):
                pred = model(batch.T, lengths)
                loss = loss_func(
                    pred.type(torch.FloatTensor),
                    targets.unsqueeze(0).type(torch.FloatTensor),
                )
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                y_true += list(targets.int())
                y_pred += [int(pred.float() >= THRESHOLD)]
                total_loss += loss

            acc = accuracy_score(y_true, y_pred)
            val_loss, val_acc, report = evaluate_validation_set(
                model, val_dataset, id2tok, tok2id, label, loss_func
            )
            print(
                "Train loss: {} - acc: {} \nValidation loss: {} - acc: {}".format(
                    total_loss.data.float() / len(train_dataset), acc, val_loss, val_acc
                )
            )

            val_f1 = report["1"]["f1-score"]
            if best_f1 < val_f1:
                logger.green(f"New best F1 score at {val_f1}")
                best_f1 = val_f1
                if not os.path.exists(f"models/lstm/{model_id}/{LABEL_TO_IX[label]}"):
                    Path(f"models/lstm/{model_id}/{LABEL_TO_IX[label]}").mkdir(
                        parents=True, exist_ok=True
                    )
                torch.save(
                    model.state_dict(),
                    f"models/lstm/{model_id}/{LABEL_TO_IX[label]}/{LABEL_TO_IX[label]}.pt",
                )
                results[label] = report
                if os.path.exists(
                    f"models/lstm/{model_id}/{LABEL_TO_IX[label]}/results_{LABEL_TO_IX[label]}.json"
                ):
                    os.remove(
                        f"models/lstm/{model_id}/{LABEL_TO_IX[label]}/results_{LABEL_TO_IX[label]}.json"
                    )
                with open(
                    f"models/lstm/{model_id}/{LABEL_TO_IX[label]}/results_{LABEL_TO_IX[label]}.json",
                    "w",
                ) as f:
                    json.dump(results, f)

        all_best_f1.append(best_f1)

    logger.green(f"Final mean F1: {statistics.mean(all_best_f1)}")
    with open(f"models/lstm/{model_id}/summary.txt", "w") as f:
        f.write(f"Mean F1: {str(statistics.mean(all_best_f1))}\n")
        for ix, score in enumerate(all_best_f1):
            f.write(f"{ix}: {score} \n")
        f.write("\n")
        f.write(f"HIDDEN_SIZE: {HIDDEN_SIZE}\n")
        f.write(f"ALPHA: {ALPHA}\n")
        f.write(f"NUM_EPOCHS: {NUM_EPOCHS}\n")
        f.write(f"POS_LOSS_WEIGHT: {POS_LOSS_WEIGHT}\n")
        f.write(f"DROPOUT: {DROPOUT}\n")
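# NOTE: minimal sketch of the LSTMClassifier interface assumed by train() above
# (embedding -> LSTM over packed sequences -> single-logit head). This is not taken
# from the original source; the project's implementation may differ (e.g. dropout, pooling).
import torch
import torch.nn as nn


class LSTMClassifierSketch(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, bidirectional=False, embed_weights=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        if embed_weights is not None:
            # Initialise from pretrained GloVe vectors when USE_GLOVE is set.
            self.embedding.weight.data.copy_(torch.as_tensor(embed_weights, dtype=torch.float))
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True, bidirectional=bidirectional)
        self.out = nn.Linear(hidden_size * (2 if bidirectional else 1), 1)

    def forward(self, batch, lengths):
        # batch: (batch, max_len) token ids; lengths: true (unpadded) sequence lengths.
        packed = nn.utils.rnn.pack_padded_sequence(
            self.embedding(batch), lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)
        # hidden: (num_layers * num_directions, batch, hidden_size); take the last layer's state(s).
        if self.lstm.bidirectional:
            final = torch.cat([hidden[-2], hidden[-1]], dim=-1)
        else:
            final = hidden[-1]
        return self.out(final).squeeze(-1)  # raw logit; BCEWithLogitsLoss applies the sigmoid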
best_label_feat_info[label] = use_feats
if re.search(r"(?<=DROPOUT: )\d\.\d", text):
    dropout_info[label] = float(re.search(r"(?<=DROPOUT: )\d\.\d", text).group())

for label_ix in best_label_scores:
    model = MLPClassifier(
        302 if label_ix in best_label_feat_info[label_ix] else 300,
        32,
        dropout_info[label_ix],
    )
    model.load_state_dict(torch.load(best_label_paths[label_ix]))
    model.eval()

    preds = []
    actual = []
    for batch, targets, lengths, raw_data in create_dataset(
        test_dataset, id2tok, tok2id, IX_TO_LABEL[label_ix], batch_size=BATCH_SIZE
    ):
        tokenized = tokenized_sentence(raw_data[0])
        vector = torch.tensor(WE.get_sentence_vector(tokenized, vector_dict))
        if label_ix in best_label_feat_info[label_ix]:
            vector = add_features(vector, raw_data[0], FEAT_ADD_SOFTENER=0.3)
        preds.append(int(model(vector.type(torch.FloatTensor)) > 0.4))
        actual.append(targets.item())

    print(IX_TO_LABEL[label_ix])
    print(classification_report(actual, preds))
    print(confusion_matrix(actual, preds))
    print()
def train():
    results = {}
    vocab, id2tok, tok2id = get_vocab(train_dataset)

    if glob.glob("models/mlp/*"):
        model_id = max(int(re.search(r"\d+", i).group()) for i in glob.glob("models/mlp/*")) + 1
    else:
        model_id = 1

    """
    Demo of weights in loss function.
    """
    # loss_func(torch.FloatTensor([0.6]), torch.FloatTensor([1]))
    # loss_func(torch.FloatTensor([0.6]), torch.FloatTensor([0]))

    vector_dict = WE.get_vector_dict()

    all_best_f1 = []
    for label in LABELS:
        if LABEL_TO_IX[label] in USE_FEATS:
            logger.yellow(f"Using additional features for label {label}...")
            model = MLPClassifier(EMBED_DIM + len(FEATS_TO_ADD), HIDDEN_SIZE, DROPOUT)
        else:
            model = MLPClassifier(EMBED_DIM, HIDDEN_SIZE, DROPOUT)

        loss_func = nn.BCEWithLogitsLoss(pos_weight=torch.FloatTensor([POS_LOSS_WEIGHT]))
        optimizer = optim.Adam(model.parameters(), lr=ALPHA)

        logger.green(f"Building classifier for {label}...")
        model.train()
        best_f1 = 0.0
        for epoch in range(NUM_EPOCHS):
            print()
            print(f"Epoch: {epoch}")
            y_true = list()
            y_pred = list()
            total_loss = 0
            for batch, targets, lengths, raw_data in create_dataset(
                train_dataset, id2tok, tok2id, label, batch_size=BATCH_SIZE
            ):
                tokenized = tokenized_sentence(raw_data[0])

                sentence_weights = None
                if TFIDF_WEIGHTS:
                    sentence_weights = get_tfidf_vals(train_doc_to_tfidf_ix[raw_data[0]])
                    sentence_weights = [
                        sentence_weights[tok] if tok in sentence_weights else 0
                        for tok in tokenized
                    ]

                input_vector = torch.FloatTensor(
                    WE.get_sentence_vector(
                        tokenized_sentence(raw_data[0]),
                        vector_dict=vector_dict,
                        weights=sentence_weights,
                    )
                )
                if LABEL_TO_IX[label] in USE_FEATS:
                    input_vector = add_features(input_vector, raw_data[0])

                pred = model(input_vector)
                loss = loss_func(
                    pred.type(torch.FloatTensor), targets.type(torch.FloatTensor)
                )
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                y_true += list(targets.int())
                y_pred += [int(pred.float() >= THRESHOLD)]
                total_loss += loss

            acc = accuracy_score(y_true, y_pred)
            val_loss, val_acc, report = evaluate_validation_set(
                model, val_dataset, id2tok, tok2id, label, loss_func, vector_dict
            )
            print(
                "Train loss: {} - acc: {} \nValidation loss: {} - acc: {}".format(
                    total_loss.data.float() / len(train_dataset), acc, val_loss, val_acc
                )
            )

            val_f1 = report["1"]["f1-score"]
            if best_f1 < val_f1:
                logger.green(f"New best F1 score at {val_f1}")
                best_f1 = val_f1
                if not os.path.exists(f"models/mlp/{model_id}/{LABEL_TO_IX[label]}"):
                    Path(f"models/mlp/{model_id}/{LABEL_TO_IX[label]}").mkdir(
                        parents=True, exist_ok=True
                    )
                torch.save(
                    model.state_dict(),
                    f"models/mlp/{model_id}/{LABEL_TO_IX[label]}/{LABEL_TO_IX[label]}.pt",
                )
                results[label] = report
                if os.path.exists(
                    f"models/mlp/{model_id}/{LABEL_TO_IX[label]}/results_{LABEL_TO_IX[label]}.json"
                ):
                    os.remove(
                        f"models/mlp/{model_id}/{LABEL_TO_IX[label]}/results_{LABEL_TO_IX[label]}.json"
                    )
                with open(
                    f"models/mlp/{model_id}/{LABEL_TO_IX[label]}/results_{LABEL_TO_IX[label]}.json",
                    "w",
                ) as f:
                    json.dump(results, f)

        all_best_f1.append(best_f1)

    logger.green(f"Final mean F1: {statistics.mean(all_best_f1)}")
    with open(f"models/mlp/{model_id}/summary.txt", "w") as f:
        f.write(f"Mean F1: {str(statistics.mean(all_best_f1))}\n")
        for ix, score in enumerate(all_best_f1):
            f.write(f"{ix}: {score} \n")
        f.write("\n")
        f.write(f"HIDDEN_SIZE: {HIDDEN_SIZE}\n")
        f.write(f"ALPHA: {ALPHA}\n")
        f.write(f"NUM_EPOCHS: {NUM_EPOCHS}\n")
        f.write(f"POS_LOSS_WEIGHT: {POS_LOSS_WEIGHT}\n")
        f.write(f"DROPOUT: {DROPOUT}\n")
        f.write(f"TFIDF_WEIGHTS: {TFIDF_WEIGHTS}\n")
        if FEATS_TO_ADD:
            f.write(f"FEATS_TO_ADD: {FEATS_TO_ADD}\n")
            f.write(f"FEAT_ADD_SOFTENER: {FEAT_ADD_SOFTENER}\n")
        if USE_FEATS:
            f.write(f"USE_FEATS: {USE_FEATS}\n")

    mark_best_results()
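# NOTE: minimal sketch of the MLPClassifier assumed by the MLP train()/evaluation code
# above (one hidden layer with dropout and a single-logit output). Not taken from the
# original source; the real architecture may differ.
import torch.nn as nn


class MLPClassifierSketch(nn.Module):
    def __init__(self, input_dim, hidden_size, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, sentence_vector):
        # sentence_vector: 300-d GloVe average, or 302-d when add_features() is applied.
        return self.net(sentence_vector).squeeze(-1)  # raw logit for BCEWithLogitsLoss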