Example #1
def evaluate_validation_set(
    model, devset, id2tok, tok2id, TARGET_LABEL, loss_func, final=False
):
    y_true = list()
    y_pred = list()
    total_loss = 0
    for batch, targets, lengths, raw_data in create_dataset(
        devset, id2tok, tok2id, TARGET_LABEL, batch_size=1
    ):
        pred = model(batch.T, lengths)
        loss = loss_func(
            pred.type(torch.FloatTensor), targets.unsqueeze(0).type(torch.FloatTensor)
        )
        y_true += list(targets.int())
        # pred is a raw logit here (the training loop uses BCEWithLogitsLoss,
        # which applies the sigmoid internally), so THRESHOLD is compared in
        # logit space: 0.0 corresponds to p = 0.5.
        y_pred += [int(pred.float() >= THRESHOLD)]
        total_loss += loss
    acc = accuracy_score(y_true, y_pred)
    print(confusion_matrix(y_true, y_pred))
    if final:
        print(classification_report(y_true, y_pred))
        return classification_report(y_true, y_pred, output_dict=True)
    return (
        total_loss.data.float() / len(devset),
        acc,
        classification_report(y_true, y_pred, output_dict=True),
    )
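These examples reference project-level names (create_dataset, THRESHOLD, WE, LABELS, LABEL_TO_IX, logger, and the hyperparameter constants) that are defined elsewhere in the repository. As a minimal sketch, the standard-library and third-party imports below cover every external call the snippets make; the project's actual header may differ:

import glob
import json
import os
import re
import statistics
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix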
Example #2
def evaluate_validation_set(model,
                            devset,
                            id2tok,
                            tok2id,
                            label,
                            loss_func,
                            vector_dict,
                            final=False):
    y_true = list()
    y_pred = list()
    total_loss = 0
    for batch, targets, lengths, raw_data in create_dataset(devset,
                                                            id2tok,
                                                            tok2id,
                                                            label,
                                                            batch_size=1):
        input_vector = torch.FloatTensor(
            WE.get_sentence_vector(tokenized_sentence(raw_data[0]),
                                   vector_dict=vector_dict))
        if LABEL_TO_IX[label] in USE_FEATS:
            input_vector = add_features(input_vector, raw_data[0])
        pred = model(input_vector)
        loss = loss_func(pred.type(torch.FloatTensor),
                         targets.type(torch.FloatTensor))
        y_true += list(targets.int())
        y_pred += [int(pred.float() >= THRESHOLD)]
        total_loss += loss
    acc = accuracy_score(y_true, y_pred)
    print(confusion_matrix(y_true, y_pred))
    if final:
        print(classification_report(y_true, y_pred))
        return classification_report(y_true, y_pred, output_dict=True)
    return (
        total_loss.data.float() / len(devset),
        acc,
        classification_report(y_true, y_pred, output_dict=True),
    )
Example #3
print(classification_report(y_val, lr_embed_clf.predict(X_val_embeds)))
## Seeing where no prediction was made
null_predictions = len(
    [i for i in lr_embed_clf.predict(X_val_embeds) if not np.any(i)]
)
print(f"{null_predictions} out of {len(y_val)} predictions were null.")

dub_ref_model = lr_embed_clf.estimators_[4]
vocab, id2tok, tok2id = get_vocab(train_dataset)
target_label = "dubious reference"
BATCH_SIZE = 1
pred = []
actual = []
vectors = []
for batch, targets, lengths, raw_data in create_dataset(
    val_dataset, id2tok, tok2id, target_label, batch_size=BATCH_SIZE
):
    actual.append(targets.item())
    pred.append(int(predict(dub_ref_model, raw_data[0])))
    vectors.append(WE.get_sentence_vector(raw_data[0].lower().split(), vector_dict))
print(classification_report(actual, pred))
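# If this is sklearn's plot_confusion_matrix, note it was deprecated in 1.0 and
# removed in 1.2; on newer releases use ConfusionMatrixDisplay.from_estimator.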
plot_confusion_matrix(dub_ref_model, vectors, actual)


def analyze_sentence(label_ix, sent, stopwords):
    # Explaining with SHAP
    WE.task_data["fake_news"]["train_text"] = [x.lower().split() for x in raw_X_train]
    WE.task_data["fake_news"]["X_train"] = np.array(X_train_embeds)
    WE.task_data["fake_news"]["clf"] = lr_embed_clf.estimators_[label_ix]
    out = WE.analyze_sentence("fake_news", sent, stopwords=stopwords)
    print("1:")
def compare_pca_features(target_label, save=False):
    split_vectors = {0: [], 1: []}
    for batch, targets, lengths, raw_data in create_dataset(
        val_dataset, id2tok, tok2id, target_label, batch_size=BATCH_SIZE
    ):
        tokenized = tokenized_sentence(raw_data[0])
        vector = torch.tensor(WE.get_sentence_vector(tokenized, vector_dict))
        # vector = add_features(vector, raw_data[0], FEAT_ADD_SOFTENER=0.3)
        split_vectors[targets.item()].append(vector.tolist())
    # pd.DataFrame.append was removed in pandas 2.0; build the frame in one call.
    df = pd.DataFrame(
        [
            {"label": label, "vector": vector}
            for label, vectors in split_vectors.items()
            for vector in vectors
        ],
        columns=["label", "vector"],
    )
    pca = PCA(n_components=2)
    pca_vals = pca.fit_transform(np.array(df["vector"].tolist()))
    df["pc1"] = [i[0] for i in pca_vals]
    df["pc2"] = [i[1] for i in pca_vals]
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.scatterplot(x="pc1", y="pc2", hue="label", data=df, s=100)
    ax.set_xticks([])
    ax.set_yticks([])
    fig.suptitle(f'"{target_label.title()}" GloVe PCA', fontsize=20)
    # plt.title("Number Count, Year Count")
    if save:
        plt.savefig(
            f"visualizations/{target_label}_glove_plot.png",
            dpi=400,
            bbox_inches="tight",
            pad_inches=0.2,
            facecolor="w",
        )

    split_vectors = {0: [], 1: []}
    for batch, targets, lengths, raw_data in create_dataset(
        val_dataset, id2tok, tok2id, target_label, batch_size=BATCH_SIZE
    ):
        tokenized = tokenized_sentence(raw_data[0])
        vector = torch.tensor(WE.get_sentence_vector(tokenized, vector_dict))
        vector = add_features(vector, raw_data[0], FEAT_ADD_SOFTENER=0.3)
        split_vectors[targets.item()].append(vector.tolist())
    # pd.DataFrame.append was removed in pandas 2.0; build the frame in one call.
    df = pd.DataFrame(
        [
            {"label": label, "vector": vector}
            for label, vectors in split_vectors.items()
            for vector in vectors
        ],
        columns=["label", "vector"],
    )
    pca = PCA(n_components=2)
    pca_vals = pca.fit_transform(np.array(df["vector"].tolist()))
    df["pc1"] = [i[0] for i in pca_vals]
    df["pc2"] = [i[1] for i in pca_vals]
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.scatterplot(x="pc1", y="pc2", hue="label", data=df, s=100)
    ax.set_xticks([])
    ax.set_yticks([])
    fig.suptitle(f'"{target_label.title()}" GloVe PCA with Added Features', fontsize=20)
    plt.title("Number Count, Year Count")
    if save:
        plt.savefig(
            f"visualizations/{target_label}_glove_plot_added_features.png",
            dpi=400,
            bbox_inches="tight",
            pad_inches=0.2,
            facecolor="w",
        )
    return pca_vals
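A hypothetical invocation of the helper above, reusing the label from the earlier snippet:

pca_vals = compare_pca_features("dubious reference", save=True)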
Example #5
def train(run_test=False):
    results = {}
    vocab, id2tok, tok2id = get_vocab(train_dataset)
    embed_weights = None
    if USE_GLOVE:
        embed_weights = get_embed_weights(vocab, tok2id)
    # from scipy.spatial import distance
    # print(distance.cosine(embed_weights[tok2id['obama']], embed_weights[tok2id['clinton']]))
    model = LSTMClassifier(
        VOCAB_SIZE,
        EMBED_DIM,
        HIDDEN_SIZE,
        bidirectional=False,
        embed_weights=embed_weights,
    )
    loss_func = nn.BCEWithLogitsLoss(pos_weight=torch.FloatTensor([POS_LOSS_WEIGHT]))

    """
    Demo of weights in loss function.
    """
    # loss_func(torch.FloatTensor([0.6]), torch.FloatTensor([1]))
    # loss_func(torch.FloatTensor([0.6]), torch.FloatTensor([0]))
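    # For example, with pos_weight = 2.0 the positive-class term is doubled:
    #   loss(x=0.6, y=1) = -2 * log(sigmoid(0.6)) ≈ 0.8750 (≈ 0.4375 unweighted)
    #   loss(x=0.6, y=0) = -log(1 - sigmoid(0.6)) ≈ 1.0375 (unchanged)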
    if glob.glob("models/lstm/*"):
        # Parse the full trailing number (not just the last character) so
        # model ids of 10 or more still increment correctly.
        model_id = max(
            int(re.search(r"\d+", i).group()) for i in glob.glob("models/lstm/*")
        ) + 1
    else:
        model_id = 1

    optimizer = optim.Adam(model.parameters(), lr=ALPHA)
    all_best_f1 = []
    for label in LABELS:
        logger.green(f"Building classifier for {label}...")
        model.train()
        best_f1 = 0.0
        for epoch in range(NUM_EPOCHS):
            print()
            print(f"Epoch: {epoch}")
            y_true = list()
            y_pred = list()
            total_loss = 0
            for batch, targets, lengths, raw_data in create_dataset(
                train_dataset, id2tok, tok2id, label, batch_size=BATCH_SIZE
            ):
                pred = model(batch.T, lengths)
                loss = loss_func(
                    pred.type(torch.FloatTensor),
                    targets.unsqueeze(0).type(torch.FloatTensor),
                )
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                y_true += list(targets.int())
                y_pred += [int(pred.float() >= THRESHOLD)]
                total_loss += loss
            acc = accuracy_score(y_true, y_pred)
            val_loss, val_acc, report = evaluate_validation_set(
                model, val_dataset, id2tok, tok2id, label, loss_func
            )
            print(
                "Train loss: {} - acc: {} \nValidation loss: {} - acc: {}".format(
                    total_loss.data.float() / len(train_dataset), acc, val_loss, val_acc
                )
            )
            val_f1 = report["1"]["f1-score"]
            if best_f1 < val_f1:
                logger.green(f"New best F1 score at {val_f1}")
                best_f1 = val_f1
                if not os.path.exists(f"models/lstm/{model_id}/{LABEL_TO_IX[label]}"):
                    Path(f"models/lstm/{model_id}/{LABEL_TO_IX[label]}").mkdir(
                        parents=True, exist_ok=True
                    )
                torch.save(
                    model.state_dict(),
                    f"models/lstm/{model_id}/{LABEL_TO_IX[label]}/{LABEL_TO_IX[label]}.pt",
                )
                results[label] = report
                if os.path.exists(
                    f"models/lstm/{model_id}/{LABEL_TO_IX[label]}/results_{LABEL_TO_IX[label]}.json"
                ):
                    os.remove(
                        f"models/lstm/{model_id}/{LABEL_TO_IX[label]}/results_{LABEL_TO_IX[label]}.json"
                    )
                with open(
                    f"models/lstm/{model_id}/{LABEL_TO_IX[label]}/results_{LABEL_TO_IX[label]}.json",
                    "w",
                ) as f:
                    json.dump(results, f)
        all_best_f1.append(best_f1)
    logger.green(f"Final mean F1: {statistics.mean(all_best_f1)}")
    with open(f"models/lstm/{model_id}/summary.txt", "w") as f:
        f.write(f"Mean F1: {str(statistics.mean(all_best_f1))}\n")
        for ix, score in enumerate(all_best_f1):
            f.write(f"{ix}: {score} \n")
        f.write("\n")
        f.write(f"HIDDEN_SIZE: {HIDDEN_SIZE}\n")
        f.write(f"ALPHA: {ALPHA}\n")
        f.write(f"NUM_EPOCHS: {NUM_EPOCHS}\n")
        f.write(f"POS_LOSS_WEIGHT: {POS_LOSS_WEIGHT}\n")
        f.write(f"DROPOUT: {DROPOUT}\n")
Example #6
                        best_label_feat_info[label] = use_feats
                    if re.search(r"(?<=DROPOUT: )\d\.\d", text):
                        dropout_info[label] = float(
                            re.search(r"(?<=DROPOUT: )\d\.\d", text).group())
for label_ix in best_label_scores:
    model = MLPClassifier(
        302 if label_ix in best_label_feat_info[label_ix] else 300,
        32,
        dropout_info[label_ix],
    )
    model.load_state_dict(torch.load(best_label_paths[label_ix]))
    model.eval()
    preds = []
    actual = []
    for batch, targets, lengths, raw_data in create_dataset(
            test_dataset,
            id2tok,
            tok2id,
            IX_TO_LABEL[label_ix],
            batch_size=BATCH_SIZE):
        tokenized = tokenized_sentence(raw_data[0])
        vector = torch.tensor(WE.get_sentence_vector(tokenized, vector_dict))
        if label_ix in best_label_feat_info[label_ix]:
            vector = add_features(vector, raw_data[0], FEAT_ADD_SOFTENER=0.3)
        preds.append(int(model(vector.type(torch.FloatTensor)) > 0.4))
        actual.append(targets.item())
    print(IX_TO_LABEL[label_ix])
    print(classification_report(actual, preds))
    print(confusion_matrix(actual, preds))
    print()
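MLPClassifier is also project code; a minimal sketch matching its constructor signature (input_dim, hidden_size, dropout) and the single-logit output the threshold test expects could be:

import torch.nn as nn

class MLPClassifier(nn.Module):
    def __init__(self, input_dim, hidden_size, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, x):
        # x: a sentence embedding of size input_dim (optionally with the
        # extra engineered features appended); returns one raw logit.
        return self.net(x).squeeze(-1)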
Example #7
def train():
    results = {}
    vocab, id2tok, tok2id = get_vocab(train_dataset)
    if glob.glob("models/mlp/*"):
        model_id = (max([
            int(re.search(r"\d+", i).group())
            for i in glob.glob("models/mlp/*")
        ]) + 1)
    else:
        model_id = 1
    """
    Demo of weights in loss function.
    """
    # loss_func(torch.FloatTensor([0.6]), torch.FloatTensor([1]))
    # loss_func(torch.FloatTensor([0.6]), torch.FloatTensor([0]))
    vector_dict = WE.get_vector_dict()
    all_best_f1 = []
    for label in LABELS:
        if LABEL_TO_IX[label] in USE_FEATS:
            logger.yellow(f"Using additional features for label {label}...")
            model = MLPClassifier(EMBED_DIM + len(FEATS_TO_ADD), HIDDEN_SIZE,
                                  DROPOUT)
        else:
            model = MLPClassifier(EMBED_DIM, HIDDEN_SIZE, DROPOUT)
        loss_func = nn.BCEWithLogitsLoss(
            pos_weight=torch.FloatTensor([POS_LOSS_WEIGHT]))
        optimizer = optim.Adam(model.parameters(), lr=ALPHA)
        logger.green(f"Building classifier for {label}...")
        model.train()
        best_f1 = 0.0
        for epoch in range(NUM_EPOCHS):
            print()
            print(f"Epoch: {epoch}")
            y_true = list()
            y_pred = list()
            total_loss = 0
            for batch, targets, lengths, raw_data in create_dataset(
                    train_dataset, id2tok, tok2id, label,
                    batch_size=BATCH_SIZE):
                tokenized = tokenized_sentence(raw_data[0])
                sentence_weights = None
                if TFIDF_WEIGHTS:
                    sentence_weights = get_tfidf_vals(
                        train_doc_to_tfidf_ix[raw_data[0]])
                    sentence_weights = [
                        sentence_weights[tok] if tok in sentence_weights else 0
                        for tok in tokenized
                    ]
                input_vector = torch.FloatTensor(
                    WE.get_sentence_vector(
                        tokenized_sentence(raw_data[0]),
                        vector_dict=vector_dict,
                        weights=sentence_weights,
                    ))
                if LABEL_TO_IX[label] in USE_FEATS:
                    input_vector = add_features(input_vector, raw_data[0])
                pred = model(input_vector)
                loss = loss_func(pred.type(torch.FloatTensor),
                                 targets.type(torch.FloatTensor))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                y_true += list(targets.int())
                y_pred += [int(pred.float() >= THRESHOLD)]
                total_loss += loss
            acc = accuracy_score(y_true, y_pred)
            val_loss, val_acc, report = evaluate_validation_set(
                model, val_dataset, id2tok, tok2id, label, loss_func,
                vector_dict)
            print("Train loss: {} - acc: {} \nValidation loss: {} - acc: {}".
                  format(total_loss.data.float() / len(train_dataset), acc,
                         val_loss, val_acc))
            val_f1 = report["1"]["f1-score"]
            if best_f1 < val_f1:
                logger.green(f"New best F1 score at {val_f1}")
                best_f1 = val_f1
                if not os.path.exists(
                        f"models/mlp/{model_id}/{LABEL_TO_IX[label]}"):
                    Path(f"models/mlp/{model_id}/{LABEL_TO_IX[label]}").mkdir(
                        parents=True, exist_ok=True)
                torch.save(
                    model.state_dict(),
                    f"models/mlp/{model_id}/{LABEL_TO_IX[label]}/{LABEL_TO_IX[label]}.pt",
                )
                results[label] = report
                if os.path.exists(
                        f"models/mlp/{model_id}/{LABEL_TO_IX[label]}/results_{LABEL_TO_IX[label]}.json"
                ):
                    os.remove(
                        f"models/mlp/{model_id}/{LABEL_TO_IX[label]}/results_{LABEL_TO_IX[label]}.json"
                    )
                with open(
                        f"models/mlp/{model_id}/{LABEL_TO_IX[label]}/results_{LABEL_TO_IX[label]}.json",
                        "w",
                ) as f:
                    json.dump(results, f)
        all_best_f1.append(best_f1)
    logger.green(f"Final mean F1: {statistics.mean(all_best_f1)}")
    with open(f"models/mlp/{model_id}/summary.txt", "w") as f:
        f.write(f"Mean F1: {str(statistics.mean(all_best_f1))}\n")
        for ix, score in enumerate(all_best_f1):
            f.write(f"{ix}: {score} \n")
        f.write("\n")
        f.write(f"HIDDEN_SIZE: {HIDDEN_SIZE}\n")
        f.write(f"ALPHA: {ALPHA}\n")
        f.write(f"NUM_EPOCHS: {NUM_EPOCHS}\n")
        f.write(f"POS_LOSS_WEIGHT: {POS_LOSS_WEIGHT}\n")
        f.write(f"DROPOUT: {DROPOUT}\n")
        f.write(f"TFIDF_WEIGHTS: {TFIDF_WEIGHTS}\n")
        if FEATS_TO_ADD:
            f.write(f"FEATS_TO_ADD: {FEATS_TO_ADD}\n")
            f.write(f"FEAT_ADD_SOFTENER: {FEAT_ADD_SOFTENER}\n")
        if USE_FEATS:
            f.write(f"USE_FEATS: {USE_FEATS}\n")
    mark_best_results()
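WE.get_sentence_vector and its TF-IDF weights argument belong to the project's word-embedding wrapper. As a hypothetical stand-in illustrating the idea (a weighted average of per-token vectors; the function name and defaults here are assumptions, not the project's API):

import numpy as np

def weighted_sentence_vector(tokens, vector_dict, weights=None, dim=300):
    # Weighted mean of per-token embedding vectors; tokens missing from
    # vector_dict are skipped. With weights=None this is a plain average.
    if weights is None:
        weights = [1.0] * len(tokens)
    vecs, ws = [], []
    for tok, w in zip(tokens, weights):
        if tok in vector_dict:
            vecs.append(vector_dict[tok])
            ws.append(w)
    if not vecs or not any(ws):
        # No known tokens (or all weights zero): fall back to the zero vector.
        return np.zeros(dim)
    return np.average(np.asarray(vecs), axis=0, weights=ws)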