Esempio n. 1
0
def worker(proc_num, queue, iter):
    """Process years from ``queue`` until it is empty, one pickle per year.

    For each year the "ADJ" vocabulary is intersected with the vocabulary of
    the "SVD" representation stored under ``constants.COHA_EMBEDDINGS + year``,
    a synthetic test embedding is built from that sub-embedding, and the
    random-walk method is run on it. The resulting polarities are pickled.

    Args:
        proc_num: Identifier of this worker; only used in progress output.
        queue: Queue of years, polled with non-blocking ``get`` calls.
        iter: Forwarded to ``make_synthetic_data`` as ``seed_offset`` and
            included in the output file name. (Shadows the builtin ``iter``
            inside this function; the name is kept for callers.)

    Returns:
        None, as soon as the queue reports ``Empty``.
    """
    while True:
        # Random pause (up to ten seconds) before every poll of the queue.
        pause = random.random() * 10
        time.sleep(pause)

        try:
            pulled = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return

        # Reseed NumPy's global generator (no explicit seed) for each year.
        np.random.seed()
        pos_seeds, neg_seeds = seeds.hist_seeds()

        year = str(pulled)
        print(proc_num, "On year", year)

        adjectives = vocab.pos_words(year, "ADJ")
        full_embed = create_representation(
            "SVD", constants.COHA_EMBEDDINGS + year)
        print(year, len(adjectives))

        # Keep only the words that the embedding actually contains.
        in_embedding = set(full_embed.iw)
        adjectives = adjectives.intersection(in_embedding)
        print(year, len(adjectives))

        weight = _make_weight(float(year))
        print(year, weight)

        sub_embed = full_embed.get_subembed(adjectives)
        synthetic = make_synthetic_data(
            sub_embed, sub_embed, adjectives, weight, seed_offset=iter)

        result = evaluate_methods.run_method(
            pos_seeds,
            neg_seeds,
            synthetic,
            method=polarity_induction_methods.random_walk,
            beta=0.9,
            nn=25,
            **evaluate_methods.DEFAULT_ARGUMENTS)

        out_file = (constants.POLARITIES + year + '-synth-adj-coha-'
                    + str(iter) + '.pkl')
        util.write_pickle(result, out_file)
Esempio n. 2
0
def worker(proc_num, queue):
    """Process years from ``queue`` until it is empty, bootstrapping each one.

    For each year the "jj" vocabulary is intersected with the vocabulary of
    the "SVD" representation stored under ``constants.COHA_EMBEDDINGS + year``.
    ``polarity_induction_methods.bootstrap`` is then run with the random-walk
    scoring method on the sub-embedding of those words plus the seed words,
    and the result is pickled.

    Args:
        proc_num: Identifier of this worker; only used in progress output.
        queue: Queue of years, polled with non-blocking ``get`` calls.

    Returns:
        None, as soon as the queue reports ``Empty``.
    """
    while True:
        # Random pause (up to ten seconds) before every poll of the queue.
        pause = random.random() * 10
        time.sleep(pause)

        try:
            pulled = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return

        pos_seeds, neg_seeds = seeds.adj_seeds()
        year = str(pulled)
        print(proc_num, "On year", year)

        adjectives = vocab.pos_words(year, "jj")
        full_embed = create_representation(
            "SVD", constants.COHA_EMBEDDINGS + year)

        # Keep only the words that the embedding actually contains.
        in_embedding = set(full_embed.iw)
        adjectives = adjectives.intersection(in_embedding)

        # The seed words are added back so they are part of the sub-embedding.
        with_seeds = adjectives.union(pos_seeds).union(neg_seeds)
        sub_embed = full_embed.get_subembed(with_seeds)

        result = polarity_induction_methods.bootstrap(
            sub_embed,
            pos_seeds,
            neg_seeds,
            score_method=polarity_induction_methods.random_walk,
            num_boots=50,
            n_procs=20,
            return_all=True,
            beta=0.9,
            nn=25)

        out_file = constants.POLARITIES + year + '-coha-adj-boot.pkl'
        util.write_pickle(result, out_file)
Esempio n. 3
0
def run(count_path, out_path, smooth=0, cds=True, normalize=False, neg=1):
    """Build a PPMI matrix from a count representation and write it to disk.

    Args:
        count_path: Location handed to ``create_representation("Explicit", ...)``
            to load the counts (always loaded with ``normalize=False``).
        out_path: Output prefix; the matrix goes to ``out_path + ".bin"`` and
            the word index to ``out_path + "-index.pkl"``.
        smooth: Fraction of the total count mass that is added to every row
            and column marginal; the scaled value is also passed on to
            ``make_ppmi_mat``.
        cds: When true, column marginals are raised to the power 0.75 before
            being normalised.
        normalize: Forwarded to ``make_ppmi_mat``.
        neg: Forwarded to ``make_ppmi_mat``.
    """
    explicit = create_representation("Explicit", count_path, normalize=False)
    count_mat = explicit.m
    word_index = explicit.wi

    # Turn the relative smoothing factor into an absolute amount of mass.
    smoothing_mass = count_mat.sum() * smooth

    # Row marginals, normalised to sum to one.
    row_mass = count_mat.sum(1) + smoothing_mass
    row_probs = row_mass / row_mass.sum()

    # Column marginals, optionally dampened, then normalised to sum to one.
    # np.power is element-wise regardless of the array type returned by sum().
    col_mass = count_mat.sum(0) + smoothing_mass
    if cds:
        col_mass = np.power(col_mass, 0.75)
    col_probs = col_mass / col_mass.sum()

    ppmi = make_ppmi_mat(
        count_mat, row_probs, col_probs, smoothing_mass,
        neg=neg, normalize=normalize)

    # pyximport must be installed before sparse_io is imported.
    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from socialsent3.representations import sparse_io

    matrix_file = out_path + ".bin"
    sparse_io.export_mat_eff(ppmi.row, ppmi.col, ppmi.data, matrix_file)

    index_file = out_path + "-index.pkl"
    util.write_pickle(word_index, index_file)
Esempio n. 4
0
def run(in_file, out_path, dim=300, keep_words=None):
    """Factorise an explicit representation and save the three SVD factors.

    Args:
        in_file: Location passed to ``Explicit.load`` (loaded un-normalised).
        out_path: Output prefix. The factors are written to
            ``out_path + "-u.npy"``, ``"-v.npy"`` and ``"-s.npy"``, and the
            vocabulary to ``out_path + "-vocab.pkl"``.
        dim: Number of components requested from ``randomized_svd``.
        keep_words: Optional collection of words; when given, the
            representation is restricted to them before factorising.
    """
    explicit = Explicit.load(in_file, normalize=False)
    if keep_words is not None:
        explicit = explicit.get_subembed(keep_words)

    left, singular, right = randomized_svd(
        explicit.m, n_components=dim, n_iter=5)

    # Written in the same order as before: u, then v, then s.
    factors = (("-u.npy", left), ("-v.npy", right), ("-s.npy", singular))
    for suffix, factor in factors:
        np.save(out_path + suffix, factor)

    vocab_file = out_path + "-vocab.pkl"
    util.write_pickle(explicit.iw, vocab_file)
Esempio n. 5
0
def main(subreddit):
    """Run the dictionary, co-occurrence, PPMI and SVD steps for a subreddit.

    All outputs are written under ``OUT.format(subreddit)``:
    ``index.pkl``, ``counts.bin``, the ``ppmi`` outputs and the ``vecs``
    outputs.

    Args:
        subreddit: Value substituted into the ``OUT``, ``DICTS`` and
            ``COMMENTS`` path templates.
    """
    target = OUT.format(subreddit)
    util.mkdir(target)

    index_file = target + "index.pkl"
    counts_file = target + "counts.bin"
    ppmi_prefix = target + "ppmi"
    ppmi_file = target + "ppmi.bin"
    vecs_prefix = target + "vecs"

    print("Getting and writing dictionary...")
    dictionary = util.load_pickle(DICTS.format(subreddit))
    # Prune the dictionary and re-pack its ids before the index is written.
    dictionary.filter_extremes(no_above=0.5, no_below=100)
    dictionary.compactify()
    util.write_pickle(dictionary.token2id, index_file)

    print("Generating word co-occurrences...")
    comment_words = word_gen(COMMENTS.format(subreddit), dictionary)
    cooccurgen.run(comment_words, dictionary.token2id, 4, counts_file)

    print("Generating PPMI vectors...")
    ppmigen.run(counts_file, ppmi_prefix, cds=True)

    print("Generating SVD vectors...")
    makelowdim.run(ppmi_file, vecs_prefix)
Esempio n. 6
0
def evaluate_methods():
    """
    Evaluates probabilistic label propagation on standard English.

    The "inquirer" lexicon supplies the evaluation words and the "kuperman"
    lexicon is passed to ``evaluate`` as ``tau_lexicon``. Evaluation words are
    restricted to those present in the embedding and not used as seeds. The
    induced polarities are evaluated and then pickled to
    ``tmp/gi-cc-walk-pols.pkl`` (the directory is created if missing).
    """
    # Local import so this function does not depend on a file-level one.
    import os

    output_path = "tmp/gi-cc-walk-pols.pkl"

    print("Getting evaluation words..")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    positive_seeds, negative_seeds = seeds.hist_seeds()

    # Load vectors only for words that can be evaluated or used as seeds.
    common_embed = create_representation(
        "GIGA", constants.GOOGLE_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    embed_words = set(common_embed.iw)
    eval_words = eval_words.intersection(embed_words)

    # Seed words are excluded from evaluation.
    eval_words = [
        word for word in eval_words
        if word not in positive_seeds and word not in negative_seeds
    ]
    print("Evaluating with ", len(eval_words), "out of", len(lexicon))

    print("SentProp:")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        common_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.label_propagate_probabilistic,
        beta=0.99,
        nn=10,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    # Make sure the relative output directory exists so the results of the
    # run above are not lost to a missing-directory error on the final write.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    util.write_pickle(polarities, output_path)
Esempio n. 7
0
def evaluate_twitter_methods():
    """Evaluate the "140-scores" lexicon and two bootstrap runs on Twitter data.

    The "twitter" lexicon is padded with zero-labelled words sampled from the
    neutral entries of the "inquirer" lexicon. Evaluation words are lexicon
    words that also appear in the "140-scores" lexicon and in the embedding
    and are not seed words. Three evaluations are printed: the "140-scores"
    lexicon itself, bootstrap with ``densify`` scoring (also pickled to
    ``twitter-test.pkl``), and bootstrap with ``random_walk`` scoring.
    """
    np.random.seed(0)

    print("Getting evaluation words and embeddings..")
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("twitter", remove_neutral=True)
    scores = lexicons.load_lexicon("twitter-scores", remove_neutral=True)
    sent140 = lexicons.load_lexicon("140-scores", remove_neutral=False)

    # Pad the lexicon with neutral words from GI. The sample size is the GI
    # ratio of neutral to non-neutral entries, scaled by the lexicon size.
    # NOTE: np.random.choice is called without `replace`, so under NumPy's
    # default (with replacement) the sample may contain repeated words.
    neutral_pool = [word for word in gi if gi[word] == 0]
    pool_size = len(neutral_pool)
    sample_size = int(
        float(pool_size) / (len(gi) - pool_size) * len(lexicon))
    for neutral_word in np.random.choice(neutral_pool, sample_size):
        lexicon[neutral_word] = 0

    positive_seeds, negative_seeds = seeds.twitter_seeds()
    wanted_words = set(lexicon.keys()).union(positive_seeds).union(
        negative_seeds)
    embed = create_representation(
        "GIGA", constants.TWITTER_EMBEDDINGS, wanted_words)

    # Report how many seed words made it into the embedding.
    seed_words = set(positive_seeds).union(negative_seeds)
    print(len(seed_words.intersection(embed.iw)))

    embed_words = set(embed.iw)
    s140_words = set(sent140.keys())
    eval_words = []
    for word in lexicon:
        if word not in s140_words:
            continue
        if word in positive_seeds or word in negative_seeds:
            continue
        if word in embed_words:
            eval_words.append(word)

    print("Evaluating with ", len(eval_words), "out of", len(lexicon))

    print("Sentiment 140")
    evaluate(sent140, lexicon, eval_words, tau_lexicon=scores)
    print()

    # First run: bootstrap with densify scoring.
    print("SentProp")
    densify_polarities = run_method(
        positive_seeds,
        negative_seeds,
        embed,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        lr=0.01,
        regularization_strength=0.5,
        **DEFAULT_ARGUMENTS)
    util.write_pickle(densify_polarities, "twitter-test.pkl")
    evaluate(densify_polarities, lexicon, eval_words, tau_lexicon=scores)

    # Second run: bootstrap with random-walk scoring (printed under the
    # same "SentProp" label as the first run).
    print("SentProp")
    walk_polarities = run_method(
        positive_seeds,
        negative_seeds,
        embed,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        beta=0.9,
        nn=25,
        **DEFAULT_ARGUMENTS)
    evaluate(walk_polarities, lexicon, eval_words, tau_lexicon=scores)