Example #1
# Module-level imports assumed by this excerpt (paths follow the socialsent3
# layout shown in Example #4; _make_weight and make_synthetic_data are
# helpers defined in the same script):
import random
import time
from queue import Empty

import numpy as np

from socialsent3 import constants, seeds, util, vocab
from socialsent3 import evaluate_methods, polarity_induction_methods
from socialsent3.representations.representation_factory import create_representation

def worker(proc_num, queue, iteration):
    while True:
        time.sleep(random.random() * 10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return
        np.random.seed()  # reseed so forked workers don't share RNG state
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print(proc_num, "On year", year)
        words = vocab.pos_words(year, "ADJ")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        print(year, len(words))
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)
        print(year, len(words))
        #        counts = create_representation("Explicit", constants.COHA_COUNTS + year, normalize=False)
        #        ppmi = create_representation("Explicit", constants.COHA_PPMI + year)
        weight = _make_weight(float(year))
        print(year, weight)
        embed = embed.get_subembed(words)
        test_embed = make_synthetic_data(embed, embed, words, weight, seed_offset=iteration)
        polarities = evaluate_methods.run_method(positive_seeds, negative_seeds,
                                                 test_embed,
                                                 method=polarity_induction_methods.random_walk,
                                                 beta=0.9, nn=25,
                                                 **evaluate_methods.DEFAULT_ARGUMENTS)
        util.write_pickle(polarities, constants.POLARITIES + year + '-synth-adj-coha-' + str(iteration) + '.pkl')
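
The worker above pulls years off a shared queue until it raises Empty, so a driver has to pre-load the queue and start one process per worker. A minimal driver sketch (hypothetical, not part of the excerpt), assuming a list of years and the standard multiprocessing module:

from multiprocessing import Process, Queue

def run_workers(num_procs, years, iteration):
    queue = Queue()
    for year in years:
        queue.put(year)                      # pre-load all work items
    procs = [Process(target=worker, args=(i, queue, iteration))
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()                             # wait until the queue drains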
Example #2
# Relies on the same module-level imports as Example #1.
def worker(proc_num, queue):
    while True:
        time.sleep(random.random() * 10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print(proc_num, "On year", year)
        # top 5,100 words by frequency, minus the 100 most frequent as stop words
        words = vocab.top_words(year, 5100)
        stop_words = vocab.top_words(year, 100)
        words = words.difference(stop_words)
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)

        polarities = polarity_induction_methods.bootstrap(
            embed.get_subembed(
                words.union(positive_seeds).union(negative_seeds)),
            positive_seeds,
            negative_seeds,
            score_method=polarity_induction_methods.random_walk,
            num_boots=50,
            n_procs=20,
            return_all=True,
            beta=0.9,
            nn=25)
        util.write_pickle(polarities,
                          constants.POLARITIES + year + '-coha-freq-boot.pkl')
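
For reference, the bootstrap call above reruns the random-walk scorer on many random subsets of the seed words and, with return_all=True, keeps every run so downstream code can read off a score distribution per word. A minimal sketch of that idea (the half-size resample is illustrative, and the real polarity_induction_methods.bootstrap also fans the runs out over n_procs processes):

import random

def bootstrap_sketch(embeddings, pos_seeds, neg_seeds, score_method,
                     num_boots=50, **kwargs):
    runs = []
    for _ in range(num_boots):
        # rescore with a random half of each seed set
        pos = random.sample(list(pos_seeds), len(pos_seeds) // 2)
        neg = random.sample(list(neg_seeds), len(neg_seeds) // 2)
        runs.append(score_method(embeddings, pos, neg, **kwargs))
    return runs  # one {word: polarity} dict per bootstrap run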
Example #3
# From the evaluate_methods module; run_method, evaluate, and
# DEFAULT_ARGUMENTS are defined alongside this function.
def evaluate_methods():
    """
    Evaluates different methods on standard English.
    """
    print("Getting evalution words..")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if word not in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation(
        "GIGA", constants.GOOGLE_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    embed_words = set(common_embed.iw)
    eval_words = eval_words.intersection(embed_words)

    eval_words = [
        word for word in eval_words
        if word not in positive_seeds and word not in negative_seeds
    ]
    print("Evaluating with ", len(eval_words), "out of", len(lexicon))

    print("SentProp:")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        common_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.label_propagate_probabilistic,
        # method=polarity_induction_methods.bootstrap,
        beta=0.99,
        nn=10,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")
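
label_propagate_probabilistic is a label-propagation method in the Zhu-and-Ghahramani family: seed words keep clamped label distributions while every other word repeatedly takes the similarity-weighted average of its neighbors' distributions. A minimal sketch under assumed inputs (trans is a hypothetical precomputed row-stochastic word-similarity matrix; the real implementation differs in its details):

import numpy as np

def label_propagate_sketch(trans, words, pos_seeds, neg_seeds, iters=50):
    # one row per word, columns = [P(positive), P(negative)]
    labels = np.full((len(words), 2), 0.5)
    index = {w: i for i, w in enumerate(words)}
    clamps = {w: (1.0, 0.0) for w in pos_seeds}
    clamps.update({w: (0.0, 1.0) for w in neg_seeds})
    for _ in range(iters):
        labels = trans.dot(labels)        # neighborhood-weighted average
        for w, lab in clamps.items():     # re-clamp the seed labels
            labels[index[w]] = lab
    return {w: labels[i, 0] for i, w in enumerate(words)}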
Example #4
from socialsent3 import seeds
from socialsent3 import lexicons
from socialsent3.polarity_induction_methods import random_walk
from socialsent3.evaluate_methods import binary_metrics
from socialsent3.representations.representation_factory import create_representation

if __name__ == "__main__":
    # print("Evaluting SentProp with 100 dimensional GloVe embeddings")
    print("Evaluting SentProp with 300 dimensional fastText embeddings")
    print("Evaluting only binary classification performance on General Inquirer lexicon")
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=True)
    pos_seeds, neg_seeds = seeds.hist_seeds()
    # embeddings = create_representation("GIGA", "socialsent3/data/example_embeddings/glove.6B.100d.txt",
    #                                    set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    embeddings = create_representation("GIGA", "socialsent3/data/example_embeddings/imdb.en.vec",
                                       set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    eval_words = [word for word in embeddings.iw
                  if word not in pos_seeds
                  and word not in neg_seeds]
    # Using SentProp with 10 neighbors and beta=0.99
    polarities = random_walk(embeddings, pos_seeds, neg_seeds, beta=0.99, nn=10,
                             sym=True, arccos=True)

    acc, auc, avg_per = binary_metrics(polarities, lexicon, eval_words)
    print("Accuracy with best threshold: {:0.2f}".format(acc))
    print("ROC AUC: {:0.2f}".format(auc))
    print("Average precision score: {:0.2f}".format(avg_per))
Example #5
# Also from the evaluate_methods module (same helpers as Example #3).
def hyperparam_eval():
    print("Getting evaluation words and embeddings")
    lexicon = lexicons.load_lexicon("bingliu", remove_neutral=False)
    eval_words = set(lexicon.keys())

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation(
        "GIGA", constants.COMMON_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    common_words = set(common_embed.iw)
    eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD",
                                       constants.SVD_EMBEDDINGS + "1990")
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    eval_words = [
        word for word in eval_words
        if word not in positive_seeds and word not in negative_seeds
    ]

    print("SentProp...")
    for nn in [5, 10, 25, 50]:
        for beta in [0.8, 0.9, 0.95, 0.99]:
            print("Common")
            polarities = run_method(
                positive_seeds,
                negative_seeds,
                common_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(
                        positive_seeds)),
                method=polarity_induction_methods.random_walk,
                nn=nn,
                beta=beta,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words)
            print("Hist")
            polarities = run_method(
                positive_seeds,
                negative_seeds,
                hist_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(
                        positive_seeds)),
                method=polarity_induction_methods.random_walk,
                nn=nn,
                beta=beta,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words)

    print("Densify...")
    for lr in [0.001, 0.01, 0.1, 0.5]:
        for reg in [0.001, 0.01, 0.1, 0.5]:
            print("LR : ", lr, "Reg: ", reg)
            print("Common")
            polarities = run_method(
                positive_seeds,
                negative_seeds,
                common_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(
                        positive_seeds)),
                method=polarity_induction_methods.densify,
                lr=lr,
                regularization_strength=reg,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words, tern=False)
            print("Hist")
            polarities = run_method(
                positive_seeds,
                negative_seeds,
                hist_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(
                        positive_seeds)),
                method=polarity_induction_methods.densify,
                lr=lr,
                regularization_strength=reg,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words, tern=False)
Example #6
# Also from the evaluate_methods module (same helpers as Example #3).
def evaluate_overlap_methods():
    """
    Evaluate different methods on standard English, restricted to words
    that are present in the 1990s portion of the historical data.
    """
    print("Getting evaluation words and embeddings..")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if word not in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    #    common_embed = create_representation("GIGA", constants.COMMON_EMBEDDINGS,
    #            eval_words.union(positive_seeds).union(negative_seeds))
    #    common_words = set(common_embed.iw)
    #    eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD",
                                       constants.COHA_EMBEDDINGS + "2000")
    hist_counts = create_representation("Explicit",
                                        constants.COHA_COUNTS + "2000",
                                        normalize=False)
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    eval_words = [
        word for word in eval_words
        if word not in positive_seeds and word not in negative_seeds
    ]

    hist_counts = hist_counts.get_subembed(
        set(eval_words).union(positive_seeds).union(negative_seeds),
        restrict_context=False)

    print("Evaluating with ", len(eval_words), "out of", len(lexicon))

    print("PMI")
    polarities = run_method(positive_seeds,
                            negative_seeds,
                            hist_counts,
                            method=polarity_induction_methods.bootstrap,
                            score_method=polarity_induction_methods.pmi,
                            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print()
    evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)

    print("SentProp with 1990s Fic embeddings")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        hist_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        nn=25,
        beta=0.9,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("Densifier with 1990s Fic embeddings")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        hist_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("Velikovich with 1990s Fic embeddings")
    hist_counts.normalize()
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        hist_counts,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.graph_propagate,
        T=3,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()
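
The PMI baseline at the top of this example follows the Turney and Littman (2003) recipe that polarity_induction_methods.pmi is based on: a word's polarity is its summed pointwise mutual information with the positive seeds minus its summed PMI with the negative seeds, computed from raw co-occurrence counts like the ones loaded above. A minimal sketch under assumed plain-dict inputs (the Explicit representation exposes the counts differently):

import math

def pmi_polarity_sketch(pair_counts, word_counts, total, word,
                        pos_seeds, neg_seeds, smooth=0.01):
    # PMI(w1, w2) = log P(w1, w2) / (P(w1) P(w2)), smoothed to avoid log 0
    def pmi(w1, w2):
        joint = pair_counts.get((w1, w2), 0.0) + smooth
        return math.log(joint * total / (word_counts[w1] * word_counts[w2]))
    return (sum(pmi(word, p) for p in pos_seeds)
            - sum(pmi(word, n) for n in neg_seeds))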