def hyperparam_eval():
    """Grid-search SentProp (nn, beta) and Densifier (learning rate, regularization strength)
    hyperparameters on the Bing Liu lexicon, evaluating on both the common and the 1990s
    historical embeddings."""
    print "Getting evaluation words and embeddings"
    lexicon = lexicons.load_lexicon("bingliu", remove_neutral=False)
    eval_words = set(lexicon.keys())

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation("GIGA", constants.COMMON_EMBEDDINGS, 
            eval_words.union(positive_seeds).union(negative_seeds))
    common_words = set(common_embed.iw)
    eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD", constants.SVD_EMBEDDINGS + "1990")
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    eval_words = [word for word in eval_words
            if not word in positive_seeds 
            and not word in negative_seeds] 

    print "SentProp..."
    for nn in [5, 10, 25, 50]:
        for beta in [0.8, 0.9, 0.95, 0.99]:
          print "Common"
          polarities = run_method(positive_seeds, negative_seeds, 
                    common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.random_walk, 
                    nn=nn, beta=beta,
                    **DEFAULT_ARGUMENTS)
          evaluate(polarities, lexicon, eval_words)
          print "Hist"
          polarities = run_method(positive_seeds, negative_seeds, 
                    hist_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.random_walk, 
                    nn=nn, beta=beta,
                    **DEFAULT_ARGUMENTS)
          evaluate(polarities, lexicon, eval_words)

    print "Densify..."
    for lr in [0.001, 0.01, 0.1, 0.5]:
        for reg in [0.001, 0.01, 0.1, 0.5]:
          print "LR : ", lr, "Reg: ", reg
          print "Common"
          polarities = run_method(positive_seeds, negative_seeds, 
                    common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.densify, 
                    lr=lr, regularization_strength=reg,
                    **DEFAULT_ARGUMENTS)
          evaluate(polarities, lexicon, eval_words, tern=False)
          print "Hist"
          polarities = run_method(positive_seeds, negative_seeds, 
                    hist_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.densify, 
                    lr=lr, regularization_strength=reg,
                    **DEFAULT_ARGUMENTS)
          evaluate(polarities, lexicon, eval_words, tern=False)
Example #3
def worker(proc_num, queue, iter):
    while True:
        time.sleep(random.random()*10)
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            return
        np.random.seed()
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print proc_num, "On year", year
        words = vocab.pos_words(year, "ADJ")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        print year, len(words)
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)
        print year,  len(words)
#        counts = create_representation("Explicit", constants.COHA_COUNTS + year, normalize=False)
#        ppmi = create_representation("Explicit", constants.COHA_PPMI + year)
        weight = _make_weight(float(year))
        print year, weight
        embed = embed.get_subembed(words)
        test_embed = make_synthetic_data(embed, embed, words, weight, seed_offset=iter)
        polarities = evaluate_methods.run_method(positive_seeds, negative_seeds, 
                 test_embed,
                 method=polarity_induction_methods.random_walk, 
                 beta=0.9, nn=25,
                **evaluate_methods.DEFAULT_ARGUMENTS)
        util.write_pickle(polarities, constants.POLARITIES + year + '-synth-adj-coha-' + str(iter) + '.pkl')
Example #4
def worker(proc_num, queue):
    while True:
        time.sleep(random.random() * 10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return
        positive_seeds, negative_seeds = seeds.adj_seeds()
        year = str(year)
        print(proc_num, "On year", year)
        words = vocab.pos_words(year, "jj")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)

        polarities = polarity_induction_methods.bootstrap(
            embed.get_subembed(
                words.union(positive_seeds).union(negative_seeds)),
            positive_seeds,
            negative_seeds,
            score_method=polarity_induction_methods.random_walk,
            num_boots=50,
            n_procs=20,
            return_all=True,
            beta=0.9,
            nn=25)
        util.write_pickle(polarities,
                          constants.POLARITIES + year + '-coha-adj-boot.pkl')
Example #5
def run(subreddit, smooth=0, cds=True, normalize=False, neg=1):
    const = get_constants(subreddit)
    file_indices = const['INDICES']
    file_counts = const['COUNTS']
    file_ppmi = const['PPMI']
    file_ppmi_index = const['PPMI_INDEX']

    counts = create_representation('Explicit',
                                   file_counts,
                                   file_indices,
                                   normalize=False)
    old_mat = counts.m
    index = counts.wi
    smooth = old_mat.sum() * smooth

    # getting marginal probs
    row_probs = old_mat.sum(1) + smooth
    col_probs = old_mat.sum(0) + smooth
    if cds:
        col_probs = np.power(col_probs, 0.75)
    row_probs = row_probs / row_probs.sum()
    col_probs = col_probs / col_probs.sum()

    # building PPMI matrix
    ppmi_mat = make_ppmi_mat(old_mat,
                             row_probs,
                             col_probs,
                             smooth,
                             neg=neg,
                             normalize=normalize)

    sparse_io.export_mat_eff(ppmi_mat.row, ppmi_mat.col, ppmi_mat.data,
                             file_ppmi.encode())
    util.write_pickle(index, file_ppmi_index)
Example #6
def worker(proc_num, queue, iter):
    while True:
        time.sleep(random.random()*10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return
        np.random.seed()
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print(proc_num, "On year", year)
        words = vocab.pos_words(year, "ADJ")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        print(year, len(words))
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)
        print(year,  len(words))
#        counts = create_representation("Explicit", constants.COHA_COUNTS + year, normalize=False)
#        ppmi = create_representation("Explicit", constants.COHA_PPMI + year)
        weight = _make_weight(float(year))
        print(year, weight)
        embed = embed.get_subembed(words)
        test_embed = make_synthetic_data(embed, embed, words, weight, seed_offset=iter)
        polarities = evaluate_methods.run_method(positive_seeds, negative_seeds, 
                 test_embed,
                 method=polarity_induction_methods.random_walk, 
                 beta=0.9, nn=25,
                **evaluate_methods.DEFAULT_ARGUMENTS)
        util.write_pickle(polarities, constants.POLARITIES + year + '-synth-adj-coha-' + str(iter) + '.pkl')
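The driver that fills the queue is not shown in these snippets; below is a minimal sketch of how such a worker is typically launched (the year range, process count, and iteration value are assumptions, not values from the source; the worker itself also assumes Empty has been imported from the queue module).

from multiprocessing import Process, Queue

def launch_workers(years=range(1850, 2010, 10), num_procs=8, iteration=0):
    # fill the queue before starting workers so queue.get(block=False) finds work
    queue = Queue()
    for year in years:
        queue.put(year)
    procs = [Process(target=worker, args=(i, queue, iteration)) for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()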
Example #7
def evaluate_twitter_methods():
    np.random.seed(0)

    print "Getting evalution words and embeddings.."
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("twitter", remove_neutral=True)
    scores = lexicons.load_lexicon("twitter-scores", remove_neutral=True)
    sent140 = lexicons.load_lexicon("140-scores", remove_neutral=False)

    # padding lexicon with neutral from GI
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(
        gi_neut,
        int((float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.twitter_seeds()
    embed = create_representation(
        "GIGA", constants.TWITTER_EMBEDDINGS,
        set(lexicon.keys()).union(positive_seeds).union(negative_seeds))
    print len(
        (set(positive_seeds).union(negative_seeds)).intersection(embed.iw))
    embed_words = set(embed.iw)
    s140_words = set(sent140.keys())
    eval_words = [
        word for word in lexicon
        if word in s140_words and not word in positive_seeds
        and not word in negative_seeds and word in embed_words
    ]

    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    print "Sentiment 140"
    evaluate(sent140, lexicon, eval_words, tau_lexicon=scores)
    print

    print "SentProp"
    polarities = run_method(positive_seeds,
                            negative_seeds,
                            embed,
                            method=polarity_induction_methods.bootstrap,
                            score_method=polarity_induction_methods.densify,
                            lr=0.01,
                            regularization_strength=0.5,
                            **DEFAULT_ARGUMENTS)
    util.write_pickle(polarities, "twitter-test.pkl")
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)

    print "SentProp"
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        embed,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        beta=0.9,
        nn=25,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)
Example #8
def run(count_path, out_path, smooth=0, cds=True, normalize=False, neg=1):
    """Build a PPMI matrix from a raw co-occurrence count matrix, with optional additive
    smoothing, context-distribution smoothing (cds), and a negative-sampling shift (neg),
    then export it to out_path."""
    counts = create_representation("Explicit", count_path, normalize=False)
    old_mat = counts.m
    index = counts.wi
    smooth = old_mat.sum() * smooth

    # getting marginal probs
    row_probs = old_mat.sum(1) + smooth
    col_probs = old_mat.sum(0) + smooth
    if cds:
        col_probs = np.power(col_probs, 0.75)
    row_probs = row_probs / row_probs.sum()
    col_probs = col_probs / col_probs.sum()

    # building PPMI matrix
    ppmi_mat = make_ppmi_mat(old_mat,
                             row_probs,
                             col_probs,
                             smooth,
                             neg=neg,
                             normalize=normalize)
    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from representations import sparse_io
    sparse_io.export_mat_eff(ppmi_mat.row, ppmi_mat.col, ppmi_mat.data,
                             out_path + ".bin")
    util.write_pickle(index, out_path + "-index.pkl")
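make_ppmi_mat itself is not shown in these examples; the following is a minimal sketch, under standard assumptions, of the shifted positive PMI it is presumed to compute (make_ppmi_mat_sketch is a hypothetical name, and the actual socialsent implementation may handle smoothing and normalization differently).

import numpy as np
from scipy.sparse import coo_matrix

def make_ppmi_mat_sketch(old_mat, row_probs, col_probs, smooth, neg=1):
    # joint probabilities over the observed (nonzero) counts
    total = old_mat.sum() + smooth
    mat = coo_matrix(old_mat)
    rows, cols, vals = [], [], []
    for i, j, count in zip(mat.row, mat.col, mat.data):
        joint = float(count) / total
        # shifted PMI: log(P(w, c) / (P(w) * P(c))) - log(neg); keep only positive values
        pmi = np.log(joint / (row_probs[i, 0] * col_probs[0, j])) - np.log(neg)
        if pmi > 0:
            rows.append(i)
            cols.append(j)
            vals.append(pmi)
    return coo_matrix((vals, (rows, cols)), shape=mat.shape)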
Example #9
def evaluate_methods():
    """
    Evaluates different methods on standard English.
    """
    print "Getting evalution words.."
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if not word in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation(
        "GIGA", constants.GOOGLE_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    embed_words = set(common_embed.iw)
    eval_words = eval_words.intersection(embed_words)

    eval_words = [
        word for word in eval_words
        if not word in positive_seeds and not word in negative_seeds
    ]
    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    #    print
    #    print "WordNet:"
    #    evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)
    #
    #    print "Densifier:"
    #    polarities = run_method(positive_seeds, negative_seeds,
    #            common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
    #            method=polarity_induction_methods.bootstrap, score_method=polarity_induction_methods.densify,
    #            **DEFAULT_ARGUMENTS)
    #    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print "SentProp:"
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        common_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.label_propagate_probabilistic,
        #method=polarity_induction_methods.bootstrap,
        beta=0.99,
        nn=10,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")
Example #10
def run_sentprop(subreddit,
                 ppmi_svd_dir,
                 socialsent_lexicons_dir,
                 vocab_dir,
                 topn=5000,
                 bstrp=False,
                 nn=25,
                 beta=0.9):
    #program = 'python make_sent_lexicons.py ' + subreddit + " " + ppmi_svd_dir + " " + socialsent_lexicons_dir + " " + vocab_dir
    #os.system(program)

    #stop_words = set(stopwords.words('english'))
    #stop_words.add('<#S#>') #dummy token

    fname = os.path.join(vocab_dir, subreddit + '.txt')
    with open(fname, 'r') as f:
        words = f.readlines()

    top_words = [w.split()[0] for w in words][:topn]
    pos_seeds, neg_seeds = seeds.twitter_seeds()  # Twitter seed words (from the socialsent package)

    vector_file = os.path.join(ppmi_svd_dir, subreddit + '.txt')
    embeddings = create_representation(
        'GIGA', vector_file,
        set(top_words).union(pos_seeds).union(neg_seeds))  # sub_vecs

    if bstrp:
        polarities = bootstrap(embeddings,
                               pos_seeds,
                               neg_seeds,
                               return_all=True,
                               nn=nn,
                               beta=beta,
                               num_boots=50,
                               n_procs=10)  # NEW
        outfile = os.path.join(socialsent_lexicons_dir,
                               subreddit + '.pkl')  # NEW
        util.write_pickle(polarities, outfile)  # NEW
    else:
        polarities = random_walk(embeddings,
                                 pos_seeds,
                                 neg_seeds,
                                 beta=beta,
                                 nn=nn,
                                 num_boots=50,
                                 n_procs=10)
        sorted_x = sorted(polarities.items(), key=operator.itemgetter(1))
        outfile = os.path.join(socialsent_lexicons_dir, subreddit + '.txt')

        with open(outfile, 'w') as f:
            tsvin = csv.writer(f, delimiter='\t')
            for word in sorted_x:
                tsvin.writerow(word)
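A hypothetical invocation of run_sentprop (the subreddit name and directory arguments below are placeholders, not paths from the source):

run_sentprop('AskReddit', 'ppmi_svd', 'socialsent_lexicons', 'vocab', topn=5000, bstrp=True, nn=25, beta=0.9)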
Example #11
def evaluate_methods():
    """
    Evaluates different methods on standard English.
    """
    print "Getting evalution words.."
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if not word in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation("GIGA", constants.GOOGLE_EMBEDDINGS, 
            eval_words.union(positive_seeds).union(negative_seeds))
    embed_words = set(common_embed.iw)
    eval_words = eval_words.intersection(embed_words)

    eval_words = [word for word in eval_words 
            if not word in positive_seeds 
            and not word in negative_seeds]
    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

#    print
#    print "WordNet:"
#    evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)
#
#    print "Densifier:"
#    polarities = run_method(positive_seeds, negative_seeds, 
#            common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
#            method=polarity_induction_methods.bootstrap, score_method=polarity_induction_methods.densify,
#            **DEFAULT_ARGUMENTS)
#    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print "SentProp:"
    polarities = run_method(positive_seeds, negative_seeds, 
            common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
            method=polarity_induction_methods.label_propagate_probabilistic,
            #method=polarity_induction_methods.bootstrap, 
            beta=0.99, nn=10,

            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")
Example #12
def evaluate_twitter_methods():
    np.random.seed(0)

    print "Getting evalution words and embeddings.."
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("twitter", remove_neutral=True)
    scores = lexicons.load_lexicon("twitter-scores", remove_neutral=True)
    sent140 = lexicons.load_lexicon("140-scores", remove_neutral=False)

    # padding lexicon with neutral from GI
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(gi_neut, int( (float(len(gi_neut))/(len(gi)-len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.twitter_seeds()
    embed = create_representation("GIGA", constants.TWITTER_EMBEDDINGS, set(lexicon.keys()).union(positive_seeds).union(negative_seeds))
    print len((set(positive_seeds).union(negative_seeds)).intersection(embed.iw))
    embed_words = set(embed.iw)
    s140_words = set(sent140.keys())
    eval_words = [word for word in lexicon if word in s140_words and
            not word in positive_seeds 
            and not word in negative_seeds
            and word in embed_words] 

    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    print "Sentiment 140"
    evaluate(sent140, lexicon, eval_words, tau_lexicon=scores)
    print

    print "SentProp"
    polarities = run_method(positive_seeds, negative_seeds, 
                        embed,
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.densify,
                        lr=0.01, regularization_strength=0.5,
                        **DEFAULT_ARGUMENTS)
    util.write_pickle(polarities, "twitter-test.pkl")
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)

    print "SentProp"
    polarities = run_method(positive_seeds, negative_seeds, 
                        embed,
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.random_walk,
                        beta=0.9, nn=25,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)
Example #13
def main(subreddit):
    """Induce a sentiment (or gender-association) lexicon for a subreddit by bootstrapping
    SentProp over its 5,000 highest-document-frequency tokens."""
    const = get_constants(subreddit)

    word_dict = util.load_pickle(const['DICTS'])
    word_dict.filter_extremes(no_above=const['NO_ABOVE_2'],
                              no_below=const['NO_BELOW'])
    to_keep = sorted(word_dict.dfs,
                     key=lambda w: word_dict.dfs[w],
                     reverse=True)[:5000]
    word_dict.filter_tokens(good_ids=to_keep)

    print("Create representation...")
    sub_vecs = create_representation('SVD', const['VECS'])
    if const["GENDER"]:
        pos_seeds, neg_seeds = seeds.gender_seeds()
    else:
        pos_seeds, neg_seeds = seeds.twitter_seeds()

    pos_seeds = list(
        set(subredditgen.normalize_text(' '.join(pos_seeds),
                                        const['STEMMING'])))
    neg_seeds = list(
        set(subredditgen.normalize_text(' '.join(neg_seeds),
                                        const['STEMMING'])))

    print("Get sub embedding...")
    sub_vecs = sub_vecs.get_subembed(
        set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds))

    print("Bootstrapping...")
    print("using seeds {} {}".format(pos_seeds, neg_seeds))
    pols = polarity_induction_methods.bootstrap(
        sub_vecs,
        pos_seeds,
        neg_seeds,
        return_all=True,
        nn=25,
        beta=0.9,
        boot_size=len(pos_seeds) - 2,
        num_boots=30,
        n_procs=10,
    )

    util.write_pickle(pols, const['POLARITIES'])
Example #14
def run(count_path, out_path, smooth=0, cds=True, normalize=False, neg=1):
    counts = create_representation("Explicit", count_path, normalize=False)
    old_mat = counts.m
    index = counts.wi
    smooth = old_mat.sum() * smooth

    # getting marginal probs
    row_probs = old_mat.sum(1) + smooth
    col_probs = old_mat.sum(0) + smooth
    if cds:
        col_probs = np.power(col_probs, 0.75)
    row_probs = row_probs / row_probs.sum()
    col_probs = col_probs / col_probs.sum()

    # building PPMI matrix
    ppmi_mat = make_ppmi_mat(old_mat, row_probs, col_probs, smooth, neg=neg, normalize=normalize)
    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from representations import sparse_io
    sparse_io.export_mat_eff(ppmi_mat.row, ppmi_mat.col, ppmi_mat.data, out_path + ".bin")
    util.write_pickle(index, out_path + "-index.pkl")
                                                    "Can not read vocab words from vocab file:{f}"
                                                    .format(f=vocab_file_path))
                                                pass
                            if not vocab_words:
                                print(
                                    "Could not get vocab words, Moving on to other embeddings.."
                                )
                                continue
                            else:
                                polarities = None
                                embeddings = None
                                sorted_x = None
                                try:
                                    print("Loading embeddings...")
                                    embeddings = create_representation(
                                        "GIGA", embedding_abs_file_path,
                                        set(vocab_words).union(
                                            pos_seeds).union(neg_seeds))

                                    eval_words = [
                                        word for word in embeddings.iw
                                        if not word in pos_seeds
                                        and not word in neg_seeds
                                    ]

                                    induction_method = "label_propagate_continuous"
                                    save_dir = os.path.join(
                                        SAVE_POLARITIES_DIR, yelp_category,
                                        vocab_n, induction_method)
                                    if not (os.path.exists(save_dir)
                                            and os.path.isdir(save_dir)):
                                        os.makedirs(save_dir)
Example #16
    vector_dir = sys.argv[2]
    sent_lexicon_dir = sys.argv[3]
    vocab_dir = sys.argv[4]
    stop_words = set(stopwords.words('english'))
    stop_words.add('<#S#>')

    fname = os.path.join(vocab_dir, subreddit + '.txt')
    with open(fname, 'r') as f:
        words = f.readlines()

    top_5000 = [w.split()[0] for w in words if w.split()[0] not in stop_words][:5000]

    pos_seeds, neg_seeds = seeds.twitter_seeds()  #Twitter seed words
    vector_file = os.path.join(vector_dir, subreddit + '.txt')
    embeddings = create_representation(
        'GIGA', vector_file,
        set(top_5000).union(pos_seeds).union(neg_seeds))

    polarities = bootstrap(embeddings,
                           pos_seeds,
                           neg_seeds,
                           return_all=True,
                           nn=25,
                           beta=0.9,
                           num_boots=2,
                           n_procs=10)
    print polarities[0]

    # polarities = random_walk(embeddings, pos_seeds, neg_seeds, beta=0.9, nn=25,
    #         num_boots=50,n_procs=10)
    sorted_x = sorted(polarities.items(), key=operator.itemgetter(1))
Example #17
from socialsent import seeds
from socialsent import lexicons
from socialsent.polarity_induction_methods import random_walk
from socialsent.evaluate_methods import binary_metrics
from socialsent.representations.representation_factory import create_representation

if __name__ == "__main__":
    print "Evaluting SentProp with 100 dimensional GloVe embeddings"
    print "Evaluting only binary classification performance on General Inquirer lexicon"
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=True)
    pos_seeds, neg_seeds = seeds.hist_seeds()
    embeddings = create_representation("GIGA", "data/example_embeddings/glove.6B.100d.txt",
        set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    eval_words = [word for word in embeddings.iw
            if not word in pos_seeds 
            and not word in neg_seeds]
    # Using SentProp with 10 neighbors and beta=0.99
    polarities = random_walk(embeddings, pos_seeds, neg_seeds, beta=0.99, nn=10,
            sym=True, arccos=True)

    acc, auc, avg_per  = binary_metrics(polarities, lexicon, eval_words)
    print "Accuracy with best threshold: {:0.2f}".format(acc)
    print "ROC AUC: {:0.2f}".format(auc)
    print "Average precision score: {:0.2f}".format(avg_per)

Example #18
    word_set.update(tokens_without_sw)
    # if aux == 20:
    #     break
    # aux += 1
    #if word_set.__len__() > 20000:   # 5000 ~ 5 minutes; 20000 is quite heavy
    #    break     # 8000 reviews exhaust my memory
word_list = list(word_set)

model = api.load('glove-wiki-gigaword-50')
# seeds have to exist in the model if densify is used; if they are not in the model, an error occurs
# if using densify, the Keras backend must be set to Theano
pos_seeds, neg_seeds = seeds.hist_seeds()
#neg_seeds = ["n***a", "bitch", "f****t", "nigger", "asshole", "m**********r", "redneck", "wetback", "retard", "gipsy"]
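# Hedged addition (assumes model supports gensim KeyedVectors-style "word in model" checks):
# per the note above, densify fails on seeds missing from the embedding, so drop them first.
pos_seeds = [w for w in pos_seeds if w in model]
neg_seeds = [w for w in neg_seeds if w in model]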

print('Creating representations')
embeddings = create_representation("GIGA_fast", model, word_list + pos_seeds + neg_seeds)
#embedding_explicit = create_representation("Explicit", args.corpus)

print('Generating socialsent and densify dictionary')
tic = time.time()
polarities_socialsent = random_walk(embeddings, pos_seeds, neg_seeds, beta=0.99, nn=10, sym=True, arccos=True)
toc = time.time()
print('Time socialsent algorithm: ', toc-tic)
polarities_densify = densify(embeddings, pos_seeds, neg_seeds, beta=0.99, nn=10, sym=True, arccos=True)
tac = time.time()
print('Time densify algorithm: ', tac-toc)
# print('Generating pmi')
#polarities_pmi = pmi(embedding_explicit, pos_seeds, neg_seeds)
# polarities_socialsent =  dict(polarities_socialsent)
# polarities_densify =  dict(polarities_densify)
# polarity values are float32 and must be converted to float64 (plain float) to be JSON-serializable
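A minimal sketch of the float-conversion step described above (assuming polarities_socialsent and polarities_densify behave like dicts mapping words to numpy floats; the output filename is a placeholder):

import json

polarities_socialsent = {w: float(v) for w, v in dict(polarities_socialsent).items()}
polarities_densify = {w: float(v) for w, v in dict(polarities_densify).items()}
with open('polarities.json', 'w') as f:  # placeholder path
    json.dump({'socialsent': polarities_socialsent, 'densify': polarities_densify}, f)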
Example #19
from socialsent import seeds
from socialsent import lexicons
from socialsent.polarity_induction_methods import random_walk
from socialsent.evaluate_methods import binary_metrics
from socialsent.representations.representation_factory import create_representation

if __name__ == "__main__":
    print("Evaluting SentProp with 100 dimensional GloVe embeddings")
    print("Evaluting only binary classification performance on General Inquirer lexicon")
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=True)
    pos_seeds, neg_seeds = seeds.hist_seeds()
    embeddings = create_representation("GIGA", "data/example_embeddings/glove.6B.100d.txt",
        set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    eval_words = [word for word in embeddings.iw
            if not word in pos_seeds
            and not word in neg_seeds]
    # Using SentProp with 10 neighbors and beta=0.99
    polarities = random_walk(embeddings, pos_seeds, neg_seeds, beta=0.99, nn=10,
            sym=True, arccos=True)

    auc, avg_per  = binary_metrics(polarities, lexicon, eval_words)
    print("ROC AUC: {:0.2f}".format(auc))
    print("Average precision score: {:0.2f}".format(avg_per))
Example #20
from collections import defaultdict

if __name__ == "__main__":
    seeds_map = defaultdict(list)
    labeled_words = []
    f = open('./socialsent/labeled_words.txt')
    for l in f:
        w, label = l.strip().split('\t')
        seeds_map[int(label)].append(w)
        labeled_words.append(w)
    unlabeled_words = []
    for l in open('./socialsent/unlabeled_words.txt'):
        unlabeled_words.append(l.strip())

    embeddings = create_representation(
        "GIGA", "data/example_embeddings/gensim_model_20.model.txt",
        set(unlabeled_words).union(set(labeled_words)))
    eval_words = [
        word for word in embeddings.iw if word not in set(labeled_words)
    ]

    # Using SentProp with 10 neighbors and beta=0.99
    #polarities = random_walk(embeddings, seeds_map, beta=0.7, nn=10,
    #        sym=True, arccos=False)
    #point_estimates = dict([(w,polarities[w].most_common()[0][0]) for w in polarities])
    #print "sleep_with", polarities["sleep_with"]
    #print "boner", polarities["boner"]
    #print "finger", polarities["finger"]
    #print "pills", polarities["pills"]

    #polarities = label_propagate_probabilistic(embeddings, seeds_map)
Example #21
def evaluate_adj_methods():
    """
    Evaluate different methods on standard English,
    but restrict to words that are present in the 1990s portion of historical data.
    """
    print "Getting evalution words and embeddings.."
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())
    adjs = vocab.pos_words("1990", "ADJ")

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if not word in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.adj_seeds()

    common_embed = create_representation("GIGA", constants.COMMON_EMBEDDINGS, 
            eval_words.union(positive_seeds).union(negative_seeds))
    common_words = set(common_embed.iw)
    eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD", constants.COHA_EMBEDDINGS + "2000")
    hist_counts = create_representation("Explicit", constants.COUNTS + "1990", normalize=False)
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    embed_words = [word for word in adjs if word in hist_words and word in common_words]
    eval_words = [word for word in eval_words if word in embed_words
            and not word in positive_seeds 
            and not word in negative_seeds] 
    
    hist_counts = hist_counts.get_subembed(set(eval_words).union(positive_seeds).union(negative_seeds), 
            restrict_context=False)

    print "Evaluating with ", len(eval_words), "out of", len(lexicon)
    print "Embeddings with ", len(embed_words)

    print "PMI"
    polarities = run_method(positive_seeds, negative_seeds,
            hist_counts,
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.pmi,
            boot_size=6,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print
    evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)

    print "Dist with 1990s Fic embeddings"
    polarities = run_method(positive_seeds, negative_seeds, 
                        hist_embed.get_subembed(set(embed_words).union(negative_seeds).union(positive_seeds)),
                        method=polarity_induction_methods.dist, 
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print

    print "Densifier with 1990s Fic embeddings"
    polarities = run_method(positive_seeds, negative_seeds, 
                        hist_embed.get_subembed(set(embed_words).union(negative_seeds).union(positive_seeds)),
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.densify, 
                        boot_size=6,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print

    print "SentProp with 1990s Fic embeddings"
    polarities = run_method(positive_seeds, negative_seeds, 
                        hist_embed.get_subembed(set(embed_words).union(negative_seeds).union(positive_seeds)),
                        method=polarity_induction_methods.bootstrap, 
                        nn=25, beta=0.9,
                        boot_size=6,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print

    print "Velikovich with 1990s Fic embeddings"
    hist_counts.normalize()
    polarities = run_method(positive_seeds, negative_seeds, 
                        hist_counts,
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.graph_propagate,
                        T=3,
                        boot_size=6,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print

    print "SentProp with CC"
    polarities = run_method( positive_seeds, negative_seeds, 
                        common_embed.get_subembed(set(embed_words).union(negative_seeds).union(positive_seeds)),
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.random_walk,
                        beta=0.99, nn=10,
                        boot_size=6,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print "Densifier with CC"
    polarities = run_method( positive_seeds, negative_seeds, 
                        common_embed.get_subembed(set(embed_words).union(negative_seeds).union(positive_seeds)),
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.densify,
                        boot_size=6,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
Example #22
def evaluate_finance_methods():
    np.random.seed(0)
    print "Getting evalution words and embeddings.."
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("finance", remove_neutral=True)

    ### padding in neutrals from GI lexicon
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(gi_neut, int( (float(len(gi_neut))/(len(gi)-len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0
    positive_seeds, negative_seeds = seeds.finance_seeds()
    stock_embed = create_representation("SVD", constants.STOCK_EMBEDDINGS)
    stock_counts = create_representation("Explicit", constants.STOCK_COUNTS)
    common_embed = create_representation("GIGA", constants.COMMON_EMBEDDINGS, set(lexicon.keys()).union(positive_seeds).union(negative_seeds))

    stock_words = set(stock_embed.iw)
    common_words = set(common_embed)
    eval_words = [word for word in lexicon if word in stock_words and
            word in common_words and
            not word in positive_seeds and  
            not word in negative_seeds]

    stock_counts = stock_counts.get_subembed(set(eval_words).union(positive_seeds).union(negative_seeds), restrict_context=False)

    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    print "Velikovich with 1990s Fic embeddings"
    stock_counts.normalize()
    polarities = run_method(positive_seeds, negative_seeds, 
                        stock_counts,
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.graph_propagate,
                        T=3,
                        boot_size=6,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=None)
    print


    print "PMI"
    polarities = run_method(positive_seeds, negative_seeds,
            stock_counts,
            method=polarity_induction_methods.bootstrap, 
            score_method=polarity_induction_methods.pmi,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)
    print

    print "SentProp with stock embeddings"
    polarities = run_method(positive_seeds, negative_seeds, 
                        stock_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                        method=polarity_induction_methods.bootstrap, 
                        beta=0.9, nn=25,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)

    print "Densifier with stock embeddings"
    polarities = run_method(positive_seeds, negative_seeds, 
                        stock_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.densify, 
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)
Example #25
    labeled_words_file = sys.argv[1]
    unlabeled_words_file = sys.argv[2]
    embeddings_file = sys.argv[3]
    output_file_prefix=sys.argv[4]
    seeds_map=defaultdict(list)
    labeled_words=[]
    f = open(labeled_words_file)
    for l in f:
        w, label = l.strip().split('\t')
        seeds_map[int(label)].append(w)
        labeled_words.append(w)
    unlabeled_words=[]
    for l in open(unlabeled_words_file):
        unlabeled_words.append(l.strip())

    embeddings = create_representation("GIGA", embeddings_file, set(unlabeled_words).union(set(labeled_words)))
    eval_words = [word for word in embeddings.iw if word not in set(labeled_words)]

    # Using SentProp with 10 neighbors and beta=0.7
    polarities = random_walk(embeddings, seeds_map, beta=0.7, nn=10, sym=True, arccos=False)
    point_estimates = dict([(w,polarities[w].most_common()[0][0]) for w in polarities if w in unlabeled_words])
    pickle.dump(polarities, open("{}_{}.pkl".format(output_file_prefix, "socialsent"),'wb'))
    df = pd.DataFrame().from_records(point_estimates.items(), columns=['word','label'])
    df.to_csv("{}_{}.csv".format(output_file_prefix, "socialsent"), sep='\t', encoding='utf-8')

    polarities = label_propagate_probabilistic(embeddings, seeds_map)
    point_estimates = dict([(w,polarities[w].most_common()[0][0]) for w in polarities if w in unlabeled_words])
    pickle.dump(polarities, open("{}_{}.pkl".format(output_file_prefix, "labelprop"),'wb'))
    df = pd.DataFrame().from_records(point_estimates.items(), columns=['word','label'])
    df.to_csv("{}_{}.csv".format(output_file_prefix, "labelprop"), sep='\t', encoding='utf-8')
Example #26
    X_test, Y_test = helpers.load_data('data/test_politics.csv')

    # First predict polarity using the general purpose lexicon
    lexicon = json.load(open('data/lexicons/duoman.json', 'r'))
    Y_pred = helpers.pred_function(X_test, lexicon)
    accuracy = accuracy_score(Y_test, Y_pred)
    print('Accuracy for general lexicon: {}'.format(accuracy))

    # Use SentProp with 10 neighbors and beta=0.9
    print('Running SentProp..')
    pos_seeds, neg_seeds = get_poliseeds()
    vocab = helpers.get_vocab('data/vocab.txt')

    embedding_file = "data/example_embeddings/politics.txt"

    embeddings = create_representation("GIGA", embedding_file, vocab)

    polarities = random_walk(embeddings,
                             pos_seeds,
                             neg_seeds,
                             nn=10,
                             sym=True,
                             arccos=True)

    # Adapt the general purpose lexicon for domain specific use (with optimal parameters)
    print('Running lexicon adaptation..')
    new_lexicon = run_lexicon_adaptations(lexicon, polarities, 0.06, 0.58,
                                          0.25)
    Y_pred = helpers.pred_function(X_test, new_lexicon)
    accuracy = accuracy_score(Y_test, Y_pred)
    print('Accuracy for adapted lexicon: {}'.format(accuracy))