Esempio n. 1
0
def worker(proc_num, queue):
    """Drain years from ``queue`` and pickle bootstrapped polarities for each.

    Each pass sleeps for a random 0-10 s, fetches a year without blocking,
    keeps the ``"jj"`` words of that year that also appear in the ``"SVD"``
    representation loaded from ``constants.COHA_EMBEDDINGS + year``, runs
    ``polarity_induction_methods.bootstrap`` over those words plus the
    adjective seeds, and writes the result to
    ``constants.POLARITIES + year + '-coha-adj-boot.pkl'``.

    Args:
        proc_num: Identifier used only in the progress messages.
        queue: Source of years; must support ``get(block=False)``.

    Returns:
        None, once ``queue.get`` raises ``Empty``.
    """
    while True:
        # Random pause before every fetch (presumably to stagger workers).
        time.sleep(random.random() * 10)
        try:
            next_year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return

        pos, neg = seeds.adj_seeds()
        year = str(next_year)
        print(proc_num, "On year", year)

        candidates = vocab.pos_words(year, "jj")
        embeddings = create_representation(
            "SVD", constants.COHA_EMBEDDINGS + year)
        in_vocab = candidates.intersection(set(embeddings.iw))

        # The seeds are always included in the sub-embedding.
        sub_embed = embeddings.get_subembed(in_vocab.union(pos).union(neg))
        result = polarity_induction_methods.bootstrap(
            sub_embed,
            pos,
            neg,
            score_method=polarity_induction_methods.random_walk,
            num_boots=50,
            n_procs=20,
            return_all=True,
            beta=0.9,
            nn=25)

        out_file = constants.POLARITIES + year + '-coha-adj-boot.pkl'
        util.write_pickle(result, out_file)
Esempio n. 2
0
def evaluate_twitter_methods():
    np.random.seed(0)

    print "Getting evalution words and embeddings.."
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("twitter", remove_neutral=True)
    scores = lexicons.load_lexicon("twitter-scores", remove_neutral=True)
    sent140 = lexicons.load_lexicon("140-scores", remove_neutral=False)

    # padding lexicon with neutral from GI
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(
        gi_neut,
        int((float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.twitter_seeds()
    embed = create_representation(
        "GIGA", constants.TWITTER_EMBEDDINGS,
        set(lexicon.keys()).union(positive_seeds).union(negative_seeds))
    print len(
        (set(positive_seeds).union(negative_seeds)).intersection(embed.iw))
    embed_words = set(embed.iw)
    s140_words = set(sent140.keys())
    eval_words = [
        word for word in lexicon
        if word in s140_words and not word in positive_seeds
        and not word in negative_seeds and word in embed_words
    ]

    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    print "Sentiment 140"
    evaluate(sent140, lexicon, eval_words, tau_lexicon=scores)
    print

    print "SentProp"
    polarities = run_method(positive_seeds,
                            negative_seeds,
                            embed,
                            method=polarity_induction_methods.bootstrap,
                            score_method=polarity_induction_methods.densify,
                            lr=0.01,
                            regularization_strength=0.5,
                            **DEFAULT_ARGUMENTS)
    util.write_pickle(polarities, "twitter-test.pkl")
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)

    print "SentProp"
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        embed,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        beta=0.9,
        nn=25,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)
Esempio n. 3
0
def worker(proc_num, queue, iter):
    """Drain years from ``queue`` and pickle synthetic-data polarities.

    Each pass sleeps for a random 0-10 s, fetches a year without blocking,
    keeps the ``"ADJ"`` words of that year that appear in the ``"SVD"``
    representation at ``constants.COHA_EMBEDDINGS + year``, builds synthetic
    data from that sub-embedding and runs the ``random_walk`` method on it.

    Args:
        proc_num: Identifier used only in the progress messages.
        queue: Source of years; must support ``get(block=False)``.
        iter: Forwarded to ``make_synthetic_data`` as ``seed_offset`` and
            embedded in the output file name.

    Returns:
        None, once ``queue.get`` raises ``Empty``.
    """
    while True:
        time.sleep(random.random() * 10)
        try:
            next_year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return

        # Re-seed NumPy's global RNG without a fixed seed.
        np.random.seed()
        pos, neg = seeds.hist_seeds()
        year = str(next_year)
        print(proc_num, "On year", year)

        adjectives = vocab.pos_words(year, "ADJ")
        full_embed = create_representation(
            "SVD", constants.COHA_EMBEDDINGS + year)
        print(year, len(adjectives))
        adjectives = adjectives.intersection(set(full_embed.iw))
        print(year, len(adjectives))

        weight = _make_weight(float(year))
        print(year, weight)

        # The same sub-embedding is passed for both embedding arguments.
        sub_embed = full_embed.get_subembed(adjectives)
        synthetic = make_synthetic_data(
            sub_embed, sub_embed, adjectives, weight, seed_offset=iter)
        result = evaluate_methods.run_method(
            pos,
            neg,
            synthetic,
            method=polarity_induction_methods.random_walk,
            beta=0.9,
            nn=25,
            **evaluate_methods.DEFAULT_ARGUMENTS)

        out_file = (constants.POLARITIES + year + '-synth-adj-coha-'
                    + str(iter) + '.pkl')
        util.write_pickle(result, out_file)
Esempio n. 4
0
def run(count_path, out_path, smooth=0, cds=True, normalize=False, neg=1):
    """Build a PPMI matrix from a count matrix and export it with its index.

    Args:
        count_path: Path given to ``create_representation("Explicit", ...)``.
        out_path: Output prefix; ``".bin"`` receives the matrix and
            ``"-index.pkl"`` the word index.
        smooth: Factor multiplied by the total of the count matrix; the
            product is added to the row and column totals and passed on to
            ``make_ppmi_mat``.
        cds: When true, column totals are raised to the power 0.75 before
            being normalised.
        normalize: Forwarded to ``make_ppmi_mat``.
        neg: Forwarded to ``make_ppmi_mat``.
    """
    counts = create_representation("Explicit", count_path, normalize=False)
    count_mat = counts.m
    word_index = counts.wi
    smooth_mass = count_mat.sum() * smooth

    # marginal totals, then probabilities
    row_totals = count_mat.sum(1) + smooth_mass
    col_totals = count_mat.sum(0) + smooth_mass
    if cds:
        col_totals = np.power(col_totals, 0.75)
    row_probs = row_totals / row_totals.sum()
    col_probs = col_totals / col_totals.sum()

    ppmi_mat = make_ppmi_mat(
        count_mat, row_probs, col_probs, smooth_mass,
        neg=neg, normalize=normalize)

    # pyximport is installed here, right before sparse_io is imported.
    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from representations import sparse_io

    bin_file = out_path + ".bin"
    sparse_io.export_mat_eff(
        ppmi_mat.row, ppmi_mat.col, ppmi_mat.data, bin_file)
    util.write_pickle(word_index, out_path + "-index.pkl")
def worker(proc_num, queue, iter):
    while True:
        time.sleep(random.random()*10)
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            return
        np.random.seed()
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print proc_num, "On year", year
        words = vocab.pos_words(year, "ADJ")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        print year, len(words)
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)
        print year,  len(words)
#        counts = create_representation("Explicit", constants.COHA_COUNTS + year, normalize=False)
#        ppmi = create_representation("Explicit", constants.COHA_PPMI + year)
        weight = _make_weight(float(year))
        print year, weight
        embed = embed.get_subembed(words)
        test_embed = make_synthetic_data(embed, embed, words, weight, seed_offset=iter)
        polarities = evaluate_methods.run_method(positive_seeds, negative_seeds, 
                 test_embed,
                 method=polarity_induction_methods.random_walk, 
                 beta=0.9, nn=25,
                **evaluate_methods.DEFAULT_ARGUMENTS)
        util.write_pickle(polarities, constants.POLARITIES + year + '-synth-adj-coha-' + str(iter) + '.pkl')
Esempio n. 6
0
def run(subreddit, smooth=0, cds=True, normalize=False, neg=1):
    """Build and export the PPMI matrix for one subreddit.

    All file locations come from ``get_constants(subreddit)`` (keys
    ``INDICES``, ``COUNTS``, ``PPMI`` and ``PPMI_INDEX``).

    Args:
        subreddit: Passed to ``get_constants`` to resolve the paths.
        smooth: Factor multiplied by the total of the count matrix; the
            product is added to the row and column totals and passed on to
            ``make_ppmi_mat``.
        cds: When true, column totals are raised to the power 0.75 before
            being normalised.
        normalize: Forwarded to ``make_ppmi_mat``.
        neg: Forwarded to ``make_ppmi_mat``.
    """
    const = get_constants(subreddit)
    indices_path, counts_path, ppmi_path, ppmi_index_path = (
        const[key] for key in ('INDICES', 'COUNTS', 'PPMI', 'PPMI_INDEX'))

    counts = create_representation(
        'Explicit', counts_path, indices_path, normalize=False)
    count_mat = counts.m
    word_index = counts.wi
    smooth_mass = count_mat.sum() * smooth

    # marginal totals, then probabilities
    row_totals = count_mat.sum(1) + smooth_mass
    col_totals = count_mat.sum(0) + smooth_mass
    if cds:
        col_totals = np.power(col_totals, 0.75)
    row_probs = row_totals / row_totals.sum()
    col_probs = col_totals / col_totals.sum()

    ppmi_mat = make_ppmi_mat(
        count_mat, row_probs, col_probs, smooth_mass,
        neg=neg, normalize=normalize)

    # The exporter is handed the encoded form of the PPMI path.
    sparse_io.export_mat_eff(
        ppmi_mat.row, ppmi_mat.col, ppmi_mat.data, ppmi_path.encode())
    util.write_pickle(word_index, ppmi_index_path)
Esempio n. 7
0
def run(in_file, out_path, dim=300, keep_words=None):
    """Factorise an explicit representation with randomized SVD and save it.

    Args:
        in_file: Path handed to ``Explicit.load``.
        out_path: Output prefix; ``-u.npy``, ``-v.npy``, ``-s.npy`` and
            ``-vocab.pkl`` are appended for the individual outputs.
        dim: Number of components requested from ``randomized_svd``.
        keep_words: Optional collection of words. When it is not ``None``
            the representation is reduced with ``get_subembed`` before
            factorising; an empty collection is still applied.
    """
    base_embed = Explicit.load(in_file, normalize=False)
    # Identity test on purpose: ``!= None`` dispatches to the argument's own
    # ``__ne__``, which for array-like inputs is element-wise and cannot be
    # used as a plain truth value.
    if keep_words is not None:
        base_embed = base_embed.get_subembed(keep_words)
    u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
    np.save(out_path + "-u.npy", u)
    np.save(out_path + "-v.npy", v)
    np.save(out_path + "-s.npy", s)
    util.write_pickle(base_embed.iw, out_path + "-vocab.pkl")
Esempio n. 8
0
def run(in_file, out_path, dim=300, keep_words=None):
    """Factorise an explicit representation with randomized SVD and save it.

    Args:
        in_file: Path handed to ``Explicit.load``.
        out_path: Output prefix; ``-u.npy``, ``-v.npy``, ``-s.npy`` and
            ``-vocab.pkl`` are appended for the individual outputs.
        dim: Number of components requested from ``randomized_svd``.
        keep_words: Optional collection of words. When it is not ``None``
            the representation is reduced with ``get_subembed`` before
            factorising; an empty collection is still applied.
    """
    base_embed = Explicit.load(in_file, normalize=False)
    # Identity test on purpose: ``!= None`` dispatches to the argument's own
    # ``__ne__``, which for array-like inputs is element-wise and cannot be
    # used as a plain truth value.
    if keep_words is not None:
        base_embed = base_embed.get_subembed(keep_words)
    u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
    np.save(out_path + "-u.npy", u)
    np.save(out_path + "-v.npy", v)
    np.save(out_path + "-s.npy", s)
    util.write_pickle(base_embed.iw, out_path + "-vocab.pkl")
Esempio n. 9
0
def evaluate_methods():
    """
    Evaluates different methods on standard English.
    """
    print "Getting evalution words.."
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if not word in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation(
        "GIGA", constants.GOOGLE_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    embed_words = set(common_embed.iw)
    eval_words = eval_words.intersection(embed_words)

    eval_words = [
        word for word in eval_words
        if not word in positive_seeds and not word in negative_seeds
    ]
    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    #    print
    #    print "WordNet:"
    #    evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)
    #
    #    print "Densifier:"
    #    polarities = run_method(positive_seeds, negative_seeds,
    #            common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
    #            method=polarity_induction_methods.bootstrap, score_method=polarity_induction_methods.densify,
    #            **DEFAULT_ARGUMENTS)
    #    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print "SentProp:"
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        common_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.label_propagate_probabilistic,
        #method=polarity_induction_methods.bootstrap,
        beta=0.99,
        nn=10,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")
def run(index_path, in_file, out_path, dim=300, keep_words=None):
    """Factorise an explicit representation with randomized SVD and save it.

    Args:
        index_path: Index path handed to ``Explicit.load`` with ``in_file``.
        in_file: Matrix path handed to ``Explicit.load``.
        out_path: Output prefix; ``-u.npy``, ``-v.npy``, ``-s.npy`` and
            ``-vocab.pkl`` are appended for the individual outputs.
        dim: Number of components requested from ``randomized_svd``.
        keep_words: Optional collection of words. When it is not ``None``
            the representation is reduced with ``get_subembed`` before
            factorising; an empty collection is still applied.
    """
    base_embed = Explicit.load(in_file, index_path, normalize=False)
    # Identity test on purpose: ``!= None`` dispatches to the argument's own
    # ``__ne__``, which for array-like inputs is element-wise and cannot be
    # used as a plain truth value.
    if keep_words is not None:
        base_embed = base_embed.get_subembed(keep_words)
    print("Factorising {} with shape {}".format(base_embed.m.nnz,
                                                base_embed.m.shape))
    u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
    np.save(out_path + "-u.npy", u)
    np.save(out_path + "-v.npy", v)
    np.save(out_path + "-s.npy", s)
    util.write_pickle(base_embed.iw, out_path + "-vocab.pkl")
Esempio n. 11
0
def run_sentprop(subreddit,
                 ppmi_svd_dir,
                 socialsent_lexicons_dir,
                 vocab_dir,
                 topn=5000,
                 bstrp=False,
                 nn=25,
                 beta=0.9):
    """Induce polarities for a subreddit's top words and write them out.

    Args:
        subreddit: Base name of the vocabulary, vector and output files.
        ppmi_svd_dir: Directory holding ``<subreddit>.txt`` vectors.
        socialsent_lexicons_dir: Directory that receives the output file.
        vocab_dir: Directory holding ``<subreddit>.txt``; the first
            whitespace-separated token of each line is taken as a word.
        topn: Number of leading vocabulary words to keep.
        bstrp: When true, run ``bootstrap`` and pickle the result to
            ``<subreddit>.pkl``; otherwise run ``random_walk`` and write
            the items, sorted by ascending value, as tab-separated rows
            to ``<subreddit>.txt``.
        nn: Forwarded to the induction method.
        beta: Forwarded to the induction method.
    """
    vocab_file = os.path.join(vocab_dir, subreddit + '.txt')
    with open(vocab_file, 'r') as f:
        lines = f.readlines()
    top_words = [line.split()[0] for line in lines][:topn]

    # Twitter seed words (from socialsent package)
    pos_seeds, neg_seeds = seeds.twitter_seeds()

    vector_file = os.path.join(ppmi_svd_dir, subreddit + '.txt')
    wanted = set(top_words).union(pos_seeds).union(neg_seeds)
    embeddings = create_representation('GIGA', vector_file, wanted)

    if bstrp:
        polarities = bootstrap(embeddings,
                               pos_seeds,
                               neg_seeds,
                               return_all=True,
                               nn=nn,
                               beta=beta,
                               num_boots=50,
                               n_procs=10)
        pickle_file = os.path.join(socialsent_lexicons_dir,
                                   subreddit + '.pkl')
        util.write_pickle(polarities, pickle_file)
        return

    polarities = random_walk(embeddings,
                             pos_seeds,
                             neg_seeds,
                             beta=beta,
                             nn=nn,
                             num_boots=50,
                             n_procs=10)
    ranked = sorted(polarities.items(), key=operator.itemgetter(1))
    outfile = os.path.join(socialsent_lexicons_dir, subreddit + '.txt')

    with open(outfile, 'w') as f:
        csv.writer(f, delimiter='\t').writerows(ranked)
Esempio n. 12
0
def evaluate_methods():
    """
    Evaluates different methods on standard English.
    """
    print "Getting evalution words.."
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if not word in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation("GIGA", constants.GOOGLE_EMBEDDINGS, 
            eval_words.union(positive_seeds).union(negative_seeds))
    embed_words = set(common_embed.iw)
    eval_words = eval_words.intersection(embed_words)

    eval_words = [word for word in eval_words 
            if not word in positive_seeds 
            and not word in negative_seeds]
    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

#    print
#    print "WordNet:"
#    evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)
#
#    print "Densifier:"
#    polarities = run_method(positive_seeds, negative_seeds, 
#            common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
#            method=polarity_induction_methods.bootstrap, score_method=polarity_induction_methods.densify,
#            **DEFAULT_ARGUMENTS)
#    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print "SentProp:"
    polarities = run_method(positive_seeds, negative_seeds, 
            common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
            method=polarity_induction_methods.label_propagate_probabilistic,
            #method=polarity_induction_methods.bootstrap, 
            beta=0.99, nn=10,

            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")
def evaluate_twitter_methods():
    np.random.seed(0)

    print "Getting evalution words and embeddings.."
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("twitter", remove_neutral=True)
    scores = lexicons.load_lexicon("twitter-scores", remove_neutral=True)
    sent140 = lexicons.load_lexicon("140-scores", remove_neutral=False)

    # padding lexicon with neutral from GI
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(gi_neut, int( (float(len(gi_neut))/(len(gi)-len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.twitter_seeds()
    embed = create_representation("GIGA", constants.TWITTER_EMBEDDINGS, set(lexicon.keys()).union(positive_seeds).union(negative_seeds))
    print len((set(positive_seeds).union(negative_seeds)).intersection(embed.iw))
    embed_words = set(embed.iw)
    s140_words = set(sent140.keys())
    eval_words = [word for word in lexicon if word in s140_words and
            not word in positive_seeds 
            and not word in negative_seeds
            and word in embed_words] 

    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    print "Sentiment 140"
    evaluate(sent140, lexicon, eval_words, tau_lexicon=scores)
    print

    print "SentProp"
    polarities = run_method(positive_seeds, negative_seeds, 
                        embed,
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.densify,
                        lr=0.01, regularization_strength=0.5,
                        **DEFAULT_ARGUMENTS)
    util.write_pickle(polarities, "twitter-test.pkl")
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)

    print "SentProp"
    polarities = run_method(positive_seeds, negative_seeds, 
                        embed,
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.random_walk,
                        beta=0.9, nn=25,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)
Esempio n. 14
0
def main(subreddit):
    out_path = OUT.format(subreddit)
    util.mkdir(out_path)

    print "Getting and writing dictionary..."
    gdict = util.load_pickle(DICTS.format(subreddit))
    gdict.filter_extremes(no_above=0.5, no_below=100)
    gdict.compactify()
    util.write_pickle(gdict.token2id, out_path + "index.pkl")

    print "Generating word co-occurrences..."
    cooccurgen.run(word_gen(COMMENTS.format(subreddit), gdict), gdict.token2id, 4, out_path + "counts.bin")
    print "Generating PPMI vectors..."
    ppmigen.run(out_path + "counts.bin", out_path + "ppmi", cds=True)
    print "Generating SVD vectors..."
    makelowdim.run(out_path + "ppmi.bin", out_path + "vecs")
Esempio n. 15
0
def main(subreddit):
    """Bootstrap polarities for a subreddit's most frequent words.

    Paths and settings come from ``get_constants(subreddit)``. The stored
    dictionary is filtered, cut down to the 5000 ids with the highest
    ``dfs`` value, and the polarities induced over those tokens plus the
    seeds are pickled to ``const['POLARITIES']``.

    Args:
        subreddit: Passed to ``get_constants``.
    """
    const = get_constants(subreddit)

    def _normalized(seed_words):
        # Seeds are joined into one string, normalised, then de-duplicated.
        joined = ' '.join(seed_words)
        return list(
            set(subredditgen.normalize_text(joined, const['STEMMING'])))

    word_dict = util.load_pickle(const['DICTS'])
    word_dict.filter_extremes(no_above=const['NO_ABOVE_2'],
                              no_below=const['NO_BELOW'])
    doc_freqs = word_dict.dfs
    to_keep = sorted(doc_freqs, key=doc_freqs.get, reverse=True)[:5000]
    word_dict.filter_tokens(good_ids=to_keep)

    print("Create representation...")
    sub_vecs = create_representation('SVD', const['VECS'])
    load_seeds = (seeds.gender_seeds if const["GENDER"]
                  else seeds.twitter_seeds)
    pos_seeds, neg_seeds = load_seeds()

    pos_seeds = _normalized(pos_seeds)
    neg_seeds = _normalized(neg_seeds)

    print("Get sub embedding...")
    wanted = set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds)
    sub_vecs = sub_vecs.get_subembed(wanted)

    print("Bootstrapping...")
    print("using seeds {} {}".format(pos_seeds, neg_seeds))
    pols = polarity_induction_methods.bootstrap(
        sub_vecs,
        pos_seeds,
        neg_seeds,
        return_all=True,
        nn=25,
        beta=0.9,
        # boot size is derived from the positive seed count only
        boot_size=len(pos_seeds) - 2,
        num_boots=30,
        n_procs=10,
    )

    util.write_pickle(pols, const['POLARITIES'])
Esempio n. 16
0
def run(count_path, out_path, smooth=0, cds=True, normalize=False, neg=1):
    """Build a PPMI matrix from a count matrix and export it with its index.

    Args:
        count_path: Path given to ``create_representation("Explicit", ...)``.
        out_path: Output prefix; ``".bin"`` receives the matrix and
            ``"-index.pkl"`` the word index.
        smooth: Factor multiplied by the total of the count matrix; the
            product is added to the row and column totals and passed on to
            ``make_ppmi_mat``.
        cds: When true, column totals are raised to the power 0.75 before
            being normalised.
        normalize: Forwarded to ``make_ppmi_mat``.
        neg: Forwarded to ``make_ppmi_mat``.
    """
    counts = create_representation("Explicit", count_path, normalize=False)
    count_mat = counts.m
    word_index = counts.wi
    smooth_mass = count_mat.sum() * smooth

    # marginal totals, then probabilities
    row_totals = count_mat.sum(1) + smooth_mass
    col_totals = count_mat.sum(0) + smooth_mass
    if cds:
        col_totals = np.power(col_totals, 0.75)
    row_probs = row_totals / row_totals.sum()
    col_probs = col_totals / col_totals.sum()

    ppmi_mat = make_ppmi_mat(
        count_mat, row_probs, col_probs, smooth_mass,
        neg=neg, normalize=normalize)

    # pyximport is installed here, right before sparse_io is imported.
    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from representations import sparse_io

    bin_file = out_path + ".bin"
    sparse_io.export_mat_eff(
        ppmi_mat.row, ppmi_mat.col, ppmi_mat.data, bin_file)
    util.write_pickle(word_index, out_path + "-index.pkl")
Esempio n. 17
0
def main(subreddit):
    """Build the corpus, dictionary, counts, PPMI and SVD files of a subreddit.

    Paths and settings come from ``get_constants(subreddit)``. If the file
    at ``const['CORPUS']`` exists it is loaded; otherwise the corpus is
    built from the JSON lines of ``const['OUTPUTS']``. When
    ``const["INTERVAL"]`` is set, comments are bucketed by
    ``get_interval_idx(comment["score"])``, every bucket is pickled, and the
    first bucket becomes the corpus.

    Args:
        subreddit: Passed to ``get_constants`` and to the downstream steps.
    """
    const = get_constants(subreddit)

    def _clean(comment):
        # Normalised body text of one parsed comment record.
        return normalize_text(comment["body"], const['STEMMING'])

    if not os.path.exists(const['CORPUS']):
        print("Getting and writing dictionary...")

        # First pass only counts the lines so tqdm can be given a total.
        with open(const['OUTPUTS'], "r") as f:
            num_lines = sum(1 for _ in f)

        with open(const['OUTPUTS'], "r") as f:
            records = (json.loads(raw) for raw in tqdm(f, total=num_lines))

            if const["INTERVAL"] is None:
                corpus = [_clean(record) for record in records]
            else:
                buckets = [[] for _ in const["ALL_INTERVALS"]]

                for record in records:
                    idx = get_interval_idx(record["score"])
                    buckets[idx].append(_clean(record))

                for bucket, interval in zip(buckets, const["ALL_INTERVALS"]):
                    util.write_pickle(
                        bucket, get_interval_fname(subreddit, interval))

                corpus = buckets[0]
    else:
        print("Loading preexisting corpus...")
        corpus = util.load_pickle(const['CORPUS'])

    gdict = Dictionary(corpus)
    gdict.filter_extremes(no_above=const['NO_ABOVE_1'],
                          no_below=const['NO_BELOW'])
    gdict.compactify()

    util.write_pickle(gdict.token2id, const['INDICES'])
    util.write_pickle(gdict, const['DICTS'])

    print("Generating word co-occurrences...")
    corpus_words = word_gen(corpus, gdict, subreddit, len(corpus))
    cooccurgen.run(corpus_words, gdict.token2id, 4, const['COUNTS'])

    print("Generating PPMI vectors...")
    ppmigen.run(subreddit, cds=True)

    print("Generating SVD vectors...")
    makelowdim.run(const['INDICES'], const['PPMI'], const['VECS'])