Ejemplo n.º 1
0
def evaluate_methods():
    """
    Run the SentProp evaluation on standard English.

    The "inquirer" lexicon is the gold standard and "kuperman" is passed to
    evaluate() as the tau lexicon. Words are restricted to those present in
    the "GIGA" representation built from constants.GOOGLE_EMBEDDINGS, seed
    words are excluded from scoring, and the induced polarities are pickled
    to "tmp/gi-cc-walk-pols.pkl".
    """
    print("Getting evalution words..")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    candidates = set(lexicon.keys())

    # Give every lexicon word missing from the WordNet scores an explicit
    # zero (such words are implicitly zero for that method).
    # The padded scores are not consumed again inside this function.
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if word in qwn:
            continue
        qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation(
        "GIGA", constants.GOOGLE_EMBEDDINGS,
        candidates.union(positive_seeds).union(negative_seeds))
    # Only words that actually have a vector can be scored.
    candidates = candidates.intersection(set(common_embed.iw))

    # Seeds are given their label up front, so scoring them would be unfair.
    eval_words = [
        word for word in candidates
        if not (word in positive_seeds or word in negative_seeds)
    ]
    print("Evaluating with ", len(eval_words), "out of", len(lexicon))

    print("SentProp:")
    sub_embed = common_embed.get_subembed(
        set(eval_words).union(negative_seeds).union(positive_seeds))
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        sub_embed,
        method=polarity_induction_methods.label_propagate_probabilistic,
        # method=polarity_induction_methods.bootstrap,
        beta=0.99,
        nn=10,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")
Ejemplo n.º 2
0
def apply_embedding_transformation(embeddings, positive_seeds, negative_seeds,
                                   n_epochs=5, n_dim=10, force_orthogonal=False,
                                   plot=False, plot_points=50, plot_seeds=False,
                                   **kwargs):
    """
    Learn a linear map from the seed words and return the projected embedding.

    A model built by get_model is trained on minibatches produced by
    DatasetMinibatchIterator. After every batch the bias is reset to zero
    and, when requested, the weight matrix is passed through orthogonalize.
    The input matrix is then multiplied by the learned weights and cut down
    to its first n_dim columns.

    Args:
        embeddings: source embedding; must expose `.m` and `.iw`, be iterable
            over its words, and support lookup by word.
        positive_seeds: positive seed words.
        negative_seeds: negative seed words.
        n_epochs: number of passes over the dataset.
        n_dim: number of output columns kept.
        force_orthogonal: re-orthogonalize the weights after each batch.
        plot: draw a word scatter plot (only honoured when n_dim == 2).
        plot_points: upper bound on the number of words drawn.
        plot_seeds: draw the seed words instead of the non-seed words.
        **kwargs: forwarded to DatasetMinibatchIterator and get_model.

    Returns:
        An Embedding built from the projected matrix and the input's `.iw`;
        the normalize flag is switched off only when n_dim == 1.
    """
    print("Preparing to learn embedding tranformation")
    dataset = DatasetMinibatchIterator(embeddings, positive_seeds, negative_seeds, **kwargs)
    model = get_model(embeddings.m.shape[1], n_dim, **kwargs)

    print("Learning embedding transformation")
    # prog = util.Progbar(n_epochs)
    for _ in range(n_epochs):
        dataset.shuffle()
        # Accumulated only for the (currently disabled) progress report.
        loss = 0
        for tup in dataset:
            X, y = tup[0], tup[1]
            loss += model.train_on_batch(X, y)[0] * y.size
            Q, b = model.get_weights()
            if force_orthogonal:
                Q = orthogonalize(Q)
            # The bias is forced back to zero so the model stays purely linear.
            model.set_weights([Q, np.zeros_like(b)])
        # prog.update(epoch + 1, exact_values=[('loss', loss / dataset.y.size)])
    Q, b = model.get_weights()
    new_mat = embeddings.m.dot(Q)[:,0:n_dim]
    # print "Orthogonality rmse", np.mean(np.sqrt(
    # np.square(np.dot(Q, Q.T) - np.identity(Q.shape[0]))))

    if plot and n_dim == 2:
        if plot_seeds:
            candidates = list(positive_seeds) + list(negative_seeds)
        else:
            # Set membership keeps this filter linear in the vocabulary size.
            seed_words = set(positive_seeds).union(negative_seeds)
            candidates = [w for w in embeddings if w not in seed_words]
        # random.sample raises ValueError when asked for more items than the
        # population holds, so never request more than are available.
        sample_size = min(plot_points, len(candidates))
        plot_words = set(random.sample(candidates, sample_size))
        # NOTE(review): this draws the *input* vectors (embeddings[w]), not
        # rows of new_mat, and the min/max unpacking below only succeeds when
        # those vectors are 2-dimensional. Confirm whether the transformed
        # vectors were meant to be plotted.
        to_plot = {w: embeddings[w] for w in embeddings if w in plot_words}

        # np.vstack of an empty list raises, so skip the figure entirely
        # when none of the sampled words is present in the embedding.
        if to_plot:
            lexicon = lexicons.load_lexicon()
            plt.figure(figsize=(10, 10))
            for w, e in to_plot.items():
                plt.text(e[0], e[1], w,
                         bbox=dict(facecolor='green' if lexicon[w] == 1 else 'red', alpha=0.1))
            # np.vstack needs a real sequence; a dict_values view is rejected
            # by current NumPy releases, so materialize it once.
            points = np.vstack(list(to_plot.values()))
            xmin, ymin = np.min(points, axis=0)
            xmax, ymax = np.max(points, axis=0)
            plt.xlim(xmin, xmax)
            plt.ylim(ymin, ymax)
            plt.show()
    return Embedding(new_mat, embeddings.iw, normalize=n_dim!=1)
Ejemplo n.º 3
0
from socialsent3 import seeds
from socialsent3 import lexicons
from socialsent3.polarity_induction_methods import random_walk
from socialsent3.evaluate_methods import binary_metrics
from socialsent3.representations.representation_factory import create_representation

if __name__ == "__main__":
    # Example run: induce polarities with SentProp (random_walk) from the
    # historical seed words and report binary-classification metrics against
    # the "inquirer" lexicon with neutral words removed.
    # print("Evaluting SentProp with 100 dimensional GloVe embeddings")
    print("Evaluting SentProp with 300 dimensional fastText embeddings")
    print("Evaluting only binary classification performance on General Inquirer lexicon")
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=True)
    pos_seeds, neg_seeds = seeds.hist_seeds()

    # Restrict the loaded vectors to lexicon words plus both seed sets.
    vocabulary = set(lexicon.keys()).union(pos_seeds).union(neg_seeds)
    # GloVe alternative:
    # embeddings = create_representation("GIGA", "socialsent3/data/example_embeddings/glove.6B.100d.txt",
    #                                    vocabulary)
    embeddings = create_representation(
        "GIGA", "socialsent3/data/example_embeddings/imdb.en.vec", vocabulary)

    # Seed words are excluded from the evaluation set.
    eval_words = [
        word for word in embeddings.iw
        if not (word in pos_seeds or word in neg_seeds)
    ]

    # Using SentProp with 10 neighbors and beta=0.99
    polarities = random_walk(
        embeddings, pos_seeds, neg_seeds,
        beta=0.99, nn=10, sym=True, arccos=True)

    acc, auc, avg_per = binary_metrics(polarities, lexicon, eval_words)
    report = (
        ("Accuracy with best threshold: {:0.2f}", acc),
        ("ROC AUC: {:0.2f}", auc),
        ("Average precision score: {:0.2f}", avg_per),
    )
    for template, value in report:
        print(template.format(value))
Ejemplo n.º 4
0
def hyperparam_eval():
    """
    Sweep hyperparameters for the random-walk and densify methods.

    Evaluation words come from the "bingliu" lexicon, limited to words found
    in both the "GIGA" representation (constants.COMMON_EMBEDDINGS) and the
    "SVD" representation (constants.SVD_EMBEDDINGS + "1990"), with seed
    words removed. Every (nn, beta) pair is evaluated for random_walk and
    every (lr, reg) pair for densify, each on both representations; results
    are reported through evaluate().
    """
    print("Getting evaluation words and embeddings")
    lexicon = lexicons.load_lexicon("bingliu", remove_neutral=False)
    candidates = set(lexicon.keys())

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation(
        "GIGA", constants.COMMON_EMBEDDINGS,
        candidates.union(positive_seeds).union(negative_seeds))
    candidates = candidates.intersection(set(common_embed.iw))

    hist_embed = create_representation("SVD",
                                       constants.SVD_EMBEDDINGS + "1990")
    candidates = candidates.intersection(set(hist_embed.iw))

    eval_words = [
        word for word in candidates
        if not (word in positive_seeds or word in negative_seeds)
    ]

    def restricted(embed):
        # Sub-embedding over the evaluation words plus both seed sets; it is
        # rebuilt for every run, exactly as each individual call site did.
        return embed.get_subembed(
            set(eval_words).union(negative_seeds).union(
                positive_seeds))

    # Printed label paired with the representation it announces.
    sources = (("Common", common_embed), ("Hist", hist_embed))

    print("SentProp...")
    for nn in [5, 10, 25, 50]:
        for beta in [0.8, 0.9, 0.95, 0.99]:
            for label, embed in sources:
                print(label)
                polarities = run_method(
                    positive_seeds,
                    negative_seeds,
                    restricted(embed),
                    method=polarity_induction_methods.random_walk,
                    nn=nn,
                    beta=beta,
                    **DEFAULT_ARGUMENTS)
                evaluate(polarities, lexicon, eval_words)

    print("Densify...")
    for lr in [0.001, 0.01, 0.1, 0.5]:
        for reg in [0.001, 0.01, 0.1, 0.5]:
            print("LR : ", lr, "Reg: ", reg)
            for label, embed in sources:
                print(label)
                polarities = run_method(
                    positive_seeds,
                    negative_seeds,
                    restricted(embed),
                    method=polarity_induction_methods.densify,
                    lr=lr,
                    regularization_strength=reg,
                    **DEFAULT_ARGUMENTS)
                evaluate(polarities, lexicon, eval_words, tern=False)
Ejemplo n.º 5
0
def evaluate_twitter_methods():
    """
    Evaluate polarity induction on the Twitter lexicon.

    The "twitter" lexicon (neutral words removed) is padded with neutral
    words sampled from the "inquirer" lexicon. The "140-scores" lexicon is
    evaluated directly as a baseline, followed by two bootstrap runs on the
    constants.TWITTER_EMBEDDINGS representation: one scored with densify and
    one scored with random_walk. "twitter-scores" is passed to evaluate() as
    the tau lexicon, and the densify polarities are pickled to
    "twitter-test.pkl".
    """
    np.random.seed(0)

    print("Getting evaluation words and embeddings..")
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("twitter", remove_neutral=True)
    scores = lexicons.load_lexicon("twitter-scores", remove_neutral=True)
    sent140 = lexicons.load_lexicon("140-scores", remove_neutral=False)

    # Pad the lexicon with neutral words from GI. The sample size scales the
    # lexicon by GI's neutral-to-non-neutral ratio.
    # NOTE: np.random.choice samples with replacement by default, so the
    # number of distinct words added can be smaller than the requested size.
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(
        gi_neut,
        int((float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.twitter_seeds()
    # One set for all seed-membership tests instead of repeated list scans.
    seed_words = set(positive_seeds).union(negative_seeds)
    embed = create_representation(
        "GIGA", constants.TWITTER_EMBEDDINGS,
        set(lexicon.keys()).union(positive_seeds).union(negative_seeds))
    # Number of seed words that have a vector in the embedding.
    print(len(seed_words.intersection(embed.iw)))
    embed_words = set(embed.iw)
    s140_words = set(sent140.keys())
    # Score only non-seed words known to both the baseline and the embedding.
    eval_words = [
        word for word in lexicon
        if word in s140_words and word not in seed_words
        and word in embed_words
    ]

    print("Evaluating with ", len(eval_words), "out of", len(lexicon))

    print("Sentiment 140")
    evaluate(sent140, lexicon, eval_words, tau_lexicon=scores)
    print()

    # This run scores with densify; it was previously announced as
    # "SentProp", making it indistinguishable from the run below.
    print("Densifier")
    polarities = run_method(positive_seeds,
                            negative_seeds,
                            embed,
                            method=polarity_induction_methods.bootstrap,
                            score_method=polarity_induction_methods.densify,
                            lr=0.01,
                            regularization_strength=0.5,
                            **DEFAULT_ARGUMENTS)
    util.write_pickle(polarities, "twitter-test.pkl")
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)

    print("SentProp")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        embed,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        beta=0.9,
        nn=25,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)
Ejemplo n.º 6
0
def evaluate_finance_methods():
    """
    Evaluate polarity induction on the finance lexicon.

    The "finance" lexicon (neutral words removed) is padded with neutral
    words sampled from the "inquirer" lexicon. Evaluation words must appear
    in both the stock "SVD" representation and the common "GIGA"
    representation and must not be seeds. Four bootstrap runs are evaluated:
    graph_propagate and pmi on the stock count matrix, then the default
    score method and densify on the stock embeddings.
    """
    np.random.seed(0)
    print("Getting evalution words and embeddings..")
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("finance", remove_neutral=True)

    # Pad the lexicon with neutral words from GI. The sample size scales the
    # lexicon by GI's neutral-to-non-neutral ratio.
    # NOTE: np.random.choice samples with replacement by default, so the
    # number of distinct words added can be smaller than the requested size.
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(
        gi_neut,
        int((float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0
    positive_seeds, negative_seeds = seeds.finance_seeds()
    # One set for all seed-membership tests instead of repeated list scans.
    seed_words = set(positive_seeds).union(negative_seeds)
    stock_embed = create_representation("SVD", constants.STOCK_EMBEDDINGS)
    stock_counts = create_representation("Explicit", constants.STOCK_COUNTS)
    common_embed = create_representation(
        "GIGA", constants.COMMON_EMBEDDINGS,
        set(lexicon.keys()).union(positive_seeds).union(negative_seeds))

    stock_words = set(stock_embed.iw)
    # Read the vocabulary through `.iw`, as every other evaluation routine
    # does, rather than relying on the embedding object being iterable.
    common_words = set(common_embed.iw)
    eval_words = [
        word for word in lexicon
        if word in stock_words and word in common_words
        and word not in seed_words
    ]

    stock_counts = stock_counts.get_subembed(
        set(eval_words).union(positive_seeds).union(negative_seeds),
        restrict_context=False)

    print("Evaluating with ", len(eval_words), "out of", len(lexicon))

    # This run uses the stock count matrix; the previous label ("1990s Fic
    # embeddings") described a different data set.
    print("Velikovich with stock counts")
    # NOTE(review): normalize() is called on the object that the PMI run
    # below also receives — confirm that PMI is meant to see the matrix
    # after this call.
    stock_counts.normalize()
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        stock_counts,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.graph_propagate,
        T=3,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=None)
    print()

    print("PMI")
    polarities = run_method(positive_seeds,
                            negative_seeds,
                            stock_counts,
                            method=polarity_induction_methods.bootstrap,
                            score_method=polarity_induction_methods.pmi,
                            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)
    print()

    print("SentProp with stock embeddings")
    # No score_method is passed here, so bootstrap falls back to its default.
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        stock_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        beta=0.9,
        nn=25,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)

    print("Densifier with stock embeddings")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        stock_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)
Ejemplo n.º 7
0
def evaluate_adj_methods():
    """
    Evaluate different methods on standard English,
    but restrict to words that are present in the 1990s portion of historical data.

    The gold standard is the "inquirer" lexicon and "kuperman" is passed to
    evaluate() as the tau lexicon. Evaluation words are adjectives (from
    vocab.pos_words("1990", "ADJ")) present in both the common and the
    historical representations, with seed words removed. PMI, the padded
    WordNet scores, dist, densify, random-walk and graph-propagation variants
    are evaluated in turn.
    """
    print("Getting evalution words and embeddings..")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())
    adjs = vocab.pos_words("1990", "ADJ")

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if word not in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.adj_seeds()
    # One set for all seed-membership tests instead of repeated list scans.
    seed_words = set(positive_seeds).union(negative_seeds)

    common_embed = create_representation(
        "GIGA", constants.COMMON_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    common_words = set(common_embed.iw)
    eval_words = eval_words.intersection(common_words)

    # NOTE(review): the embeddings are loaded with suffix "2000" while the
    # counts, the adjective list and the printed labels all refer to the
    # 1990s — confirm the suffix is intended.
    hist_embed = create_representation("SVD",
                                       constants.COHA_EMBEDDINGS + "2000")
    hist_counts = create_representation("Explicit",
                                        constants.COUNTS + "1990",
                                        normalize=False)
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    embed_words = [
        word for word in adjs if word in hist_words and word in common_words
    ]
    # Membership is tested against a set: scanning the embed_words list for
    # every candidate made this filter quadratic.
    embed_word_set = set(embed_words)
    eval_words = [
        word for word in eval_words
        if word in embed_word_set and word not in seed_words
    ]

    hist_counts = hist_counts.get_subembed(
        set(eval_words).union(positive_seeds).union(negative_seeds),
        restrict_context=False)

    print("Evaluating with ", len(eval_words), "out of", len(lexicon))
    print("Embeddings with ", len(embed_words))

    print("PMI")
    polarities = run_method(positive_seeds,
                            negative_seeds,
                            hist_counts,
                            method=polarity_induction_methods.bootstrap,
                            score_method=polarity_induction_methods.pmi,
                            boot_size=6,
                            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    # The zero-padded WordNet scores are evaluated directly; this section is
    # announced only by a blank line.
    print()
    evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)

    print("Dist with 1990s Fic embeddings")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        hist_embed.get_subembed(
            set(embed_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.dist,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("Densifier with 1990s Fic embeddings")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        hist_embed.get_subembed(
            set(embed_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("SentProp with 1990s Fic embeddings")
    # No score_method is passed here, so bootstrap falls back to its default.
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        hist_embed.get_subembed(
            set(embed_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        nn=25,
        beta=0.9,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("Velikovich with 1990s Fic embeddings")
    # The counts were loaded with normalize=False and are normalized only
    # now, after the PMI run above has already used them.
    hist_counts.normalize()
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        hist_counts,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.graph_propagate,
        T=3,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("SentProp with CC")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        common_embed.get_subembed(
            set(embed_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        beta=0.99,
        nn=10,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print("Densifier with CC")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        common_embed.get_subembed(
            set(embed_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)