def hyperparam_eval():
    print "Getting evaluation words and embeddings"
    lexicon = lexicons.load_lexicon("bingliu", remove_neutral=False)
    eval_words = set(lexicon.keys())

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation("GIGA", constants.COMMON_EMBEDDINGS,
            eval_words.union(positive_seeds).union(negative_seeds))
    common_words = set(common_embed.iw)
    eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD", constants.SVD_EMBEDDINGS + "1990")
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    eval_words = [word for word in eval_words
            if not word in positive_seeds
            and not word in negative_seeds]

    print "SentProp..."
    for nn in [5, 10, 25, 50]:
        for beta in [0.8, 0.9, 0.95, 0.99]:
            print "Common"
            polarities = run_method(positive_seeds, negative_seeds,
                    common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.random_walk,
                    nn=nn, beta=beta,
                    **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words)
            print "Hist"
            polarities = run_method(positive_seeds, negative_seeds,
                    hist_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.random_walk,
                    nn=nn, beta=beta,
                    **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words)

    print "Densify..."
    for lr in [0.001, 0.01, 0.1, 0.5]:
        for reg in [0.001, 0.01, 0.1, 0.5]:
            print "LR : ", lr, "Reg: ", reg
            print "Common"
            polarities = run_method(positive_seeds, negative_seeds,
                    common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.densify,
                    lr=lr, regularization_strength=reg,
                    **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words, tern=False)
            print "Hist"
            polarities = run_method(positive_seeds, negative_seeds,
                    hist_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.densify,
                    lr=lr, regularization_strength=reg,
                    **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words, tern=False)
def worker(proc_num, queue, iter):
    while True:
        time.sleep(random.random()*10)
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            return
        np.random.seed()
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print proc_num, "On year", year
        words = vocab.pos_words(year, "ADJ")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        print year, len(words)
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)
        print year, len(words)
        # counts = create_representation("Explicit", constants.COHA_COUNTS + year, normalize=False)
        # ppmi = create_representation("Explicit", constants.COHA_PPMI + year)
        weight = _make_weight(float(year))
        print year, weight
        embed = embed.get_subembed(words)
        test_embed = make_synthetic_data(embed, embed, words, weight, seed_offset=iter)
        polarities = evaluate_methods.run_method(positive_seeds, negative_seeds,
                test_embed,
                method=polarity_induction_methods.random_walk,
                beta=0.9, nn=25,
                **evaluate_methods.DEFAULT_ARGUMENTS)
        util.write_pickle(polarities, constants.POLARITIES + year + '-synth-adj-coha-' + str(iter) + '.pkl')
def worker(proc_num, queue):
    while True:
        time.sleep(random.random() * 10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return
        positive_seeds, negative_seeds = seeds.adj_seeds()
        year = str(year)
        print(proc_num, "On year", year)
        words = vocab.pos_words(year, "jj")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)
        polarities = polarity_induction_methods.bootstrap(
            embed.get_subembed(
                words.union(positive_seeds).union(negative_seeds)),
            positive_seeds,
            negative_seeds,
            score_method=polarity_induction_methods.random_walk,
            num_boots=50,
            n_procs=20,
            return_all=True,
            beta=0.9,
            nn=25)
        util.write_pickle(polarities, constants.POLARITIES + year + '-coha-adj-boot.pkl')
def run(subreddit, smooth=0, cds=True, normalize=False, neg=1):
    const = get_constants(subreddit)
    file_indices = const['INDICES']
    file_counts = const['COUNTS']
    file_ppmi = const['PPMI']
    file_ppmi_index = const['PPMI_INDEX']

    counts = create_representation('Explicit', file_counts, file_indices, normalize=False)
    old_mat = counts.m
    index = counts.wi
    smooth = old_mat.sum() * smooth

    # getting marginal probs
    row_probs = old_mat.sum(1) + smooth
    col_probs = old_mat.sum(0) + smooth
    if cds:
        col_probs = np.power(col_probs, 0.75)
    row_probs = row_probs / row_probs.sum()
    col_probs = col_probs / col_probs.sum()

    # building PPMI matrix
    ppmi_mat = make_ppmi_mat(old_mat, row_probs, col_probs, smooth, neg=neg, normalize=normalize)
    sparse_io.export_mat_eff(ppmi_mat.row, ppmi_mat.col, ppmi_mat.data, file_ppmi.encode())
    util.write_pickle(index, file_ppmi_index)
def worker(proc_num, queue, iter):
    while True:
        time.sleep(random.random()*10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return
        np.random.seed()
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print(proc_num, "On year", year)
        words = vocab.pos_words(year, "ADJ")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        print(year, len(words))
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)
        print(year, len(words))
        # counts = create_representation("Explicit", constants.COHA_COUNTS + year, normalize=False)
        # ppmi = create_representation("Explicit", constants.COHA_PPMI + year)
        weight = _make_weight(float(year))
        print(year, weight)
        embed = embed.get_subembed(words)
        test_embed = make_synthetic_data(embed, embed, words, weight, seed_offset=iter)
        polarities = evaluate_methods.run_method(positive_seeds, negative_seeds,
                test_embed,
                method=polarity_induction_methods.random_walk,
                beta=0.9, nn=25,
                **evaluate_methods.DEFAULT_ARGUMENTS)
        util.write_pickle(polarities, constants.POLARITIES + year + '-synth-adj-coha-' + str(iter) + '.pkl')
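# A minimal sketch of the dispatcher that the worker() functions above expect: it
# fills a multiprocessing queue with decades and starts one process per worker slot.
# The year range and process count below are assumptions for illustration, not values
# taken from the repository; the two-argument worker variant would drop `iter`.
from multiprocessing import Process, Queue
from queue import Empty  # the Python 2 snippets above would use: from Queue import Empty

def run_parallel(num_procs, iter):
    queue = Queue()
    for year in range(1850, 2010, 10):
        queue.put(year)
    procs = [Process(target=worker, args=(i, queue, iter)) for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()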
def evaluate_twitter_methods():
    np.random.seed(0)

    print "Getting evaluation words and embeddings.."
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("twitter", remove_neutral=True)
    scores = lexicons.load_lexicon("twitter-scores", remove_neutral=True)
    sent140 = lexicons.load_lexicon("140-scores", remove_neutral=False)

    # padding lexicon with neutral from GI
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(gi_neut,
            int((float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.twitter_seeds()
    embed = create_representation("GIGA", constants.TWITTER_EMBEDDINGS,
            set(lexicon.keys()).union(positive_seeds).union(negative_seeds))
    print len((set(positive_seeds).union(negative_seeds)).intersection(embed.iw))
    embed_words = set(embed.iw)
    s140_words = set(sent140.keys())
    eval_words = [word for word in lexicon
            if word in s140_words
            and not word in positive_seeds
            and not word in negative_seeds
            and word in embed_words]

    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    print "Sentiment 140"
    evaluate(sent140, lexicon, eval_words, tau_lexicon=scores)
    print

    print "SentProp"
    polarities = run_method(positive_seeds, negative_seeds,
            embed,
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.densify,
            lr=0.01, regularization_strength=0.5,
            **DEFAULT_ARGUMENTS)
    util.write_pickle(polarities, "twitter-test.pkl")
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)

    print "SentProp"
    polarities = run_method(positive_seeds, negative_seeds,
            embed,
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.random_walk,
            beta=0.9, nn=25,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)
def run(count_path, out_path, smooth=0, cds=True, normalize=False, neg=1):
    counts = create_representation("Explicit", count_path, normalize=False)
    old_mat = counts.m
    index = counts.wi
    smooth = old_mat.sum() * smooth

    # getting marginal probs
    row_probs = old_mat.sum(1) + smooth
    col_probs = old_mat.sum(0) + smooth
    if cds:
        col_probs = np.power(col_probs, 0.75)
    row_probs = row_probs / row_probs.sum()
    col_probs = col_probs / col_probs.sum()

    # building PPMI matrix
    ppmi_mat = make_ppmi_mat(old_mat, row_probs, col_probs, smooth, neg=neg, normalize=normalize)

    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from representations import sparse_io
    sparse_io.export_mat_eff(ppmi_mat.row, ppmi_mat.col, ppmi_mat.data, out_path + ".bin")
    util.write_pickle(index, out_path + "-index.pkl")
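# The two run() helpers above delegate the matrix construction to make_ppmi_mat.
# Below is a minimal sketch (an assumption for illustration, not the repository's
# implementation) of what such a function computes: shifted positive PMI over the
# non-zero entries of a sparse co-occurrence matrix,
#     ppmi(w, c) = max(log[p(w, c) / (p(w) * p(c))] - log(neg), 0).
import numpy as np
from scipy.sparse import coo_matrix

def make_ppmi_mat_sketch(old_mat, row_probs, col_probs, smooth, neg=1):
    coo = coo_matrix(old_mat)
    row_probs = np.asarray(row_probs).ravel()
    col_probs = np.asarray(col_probs).ravel()
    # joint probability of each observed (row, col) pair; additive smoothing is an assumption
    joint = (coo.data + smooth) / (old_mat.sum() + smooth)
    pmi = np.log(joint / (row_probs[coo.row] * col_probs[coo.col]))
    # shift by log(neg) and clip negative values to zero
    data = np.maximum(pmi - np.log(neg), 0.0)
    return coo_matrix((data, (coo.row, coo.col)), shape=coo.shape)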
def evaluate_methods():
    """
    Evaluates different methods on standard English.
    """
    print "Getting evaluation words.."
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if not word in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation("GIGA", constants.GOOGLE_EMBEDDINGS,
            eval_words.union(positive_seeds).union(negative_seeds))
    embed_words = set(common_embed.iw)
    eval_words = eval_words.intersection(embed_words)

    eval_words = [word for word in eval_words
            if not word in positive_seeds
            and not word in negative_seeds]
    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    # print
    # print "WordNet:"
    # evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)
    #
    # print "Densifier:"
    # polarities = run_method(positive_seeds, negative_seeds,
    #         common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
    #         method=polarity_induction_methods.bootstrap, score_method=polarity_induction_methods.densify,
    #         **DEFAULT_ARGUMENTS)
    # evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print "SentProp:"
    polarities = run_method(positive_seeds, negative_seeds,
            common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
            method=polarity_induction_methods.label_propagate_probabilistic,
            #method=polarity_induction_methods.bootstrap,
            beta=0.99, nn=10,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")
def run_sentprop(subreddit, ppmi_svd_dir, socialsent_lexicons_dir, vocab_dir,
                 topn=5000, bstrp=False, nn=25, beta=0.9):

    #program = 'python make_sent_lexicons.py ' + subreddit + " " + ppmi_svd_dir + " " + socialsent_lexicons_dir + " " + vocab_dir
    #os.system(program)

    #stop_words = set(stopwords.words('english'))
    #stop_words.add('<#S#>')  #dummy token

    fname = os.path.join(vocab_dir, subreddit + '.txt')
    with open(fname, 'r') as f:
        words = f.readlines()

    top_words = [w.split()[0] for w in words][:topn]
    pos_seeds, neg_seeds = seeds.twitter_seeds()  #Twitter seed words (from socialsent package)

    vector_file = os.path.join(ppmi_svd_dir, subreddit + '.txt')
    embeddings = create_representation(
        'GIGA', vector_file,
        set(top_words).union(pos_seeds).union(neg_seeds))  # sub_vecs

    if bstrp:
        polarities = bootstrap(embeddings, pos_seeds, neg_seeds,
                               return_all=True, nn=nn, beta=beta,
                               num_boots=50, n_procs=10)  # NEW
        outfile = os.path.join(socialsent_lexicons_dir, subreddit + '.pkl')  # NEW
        util.write_pickle(polarities, outfile)  # NEW
    else:
        polarities = random_walk(embeddings, pos_seeds, neg_seeds,
                                 beta=beta, nn=nn,
                                 num_boots=50, n_procs=10)
        sorted_x = sorted(polarities.items(), key=operator.itemgetter(1))
        outfile = os.path.join(socialsent_lexicons_dir, subreddit + '.txt')
        with open(outfile, 'w') as f:
            tsvin = csv.writer(f, delimiter='\t')
            for word in sorted_x:
                tsvin.writerow(word)
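# A minimal usage sketch for run_sentprop above. The subreddit label and the
# directory paths are placeholders (assumptions), not paths from the repository.
if __name__ == "__main__":
    run_sentprop(
        "askscience",                  # hypothetical subreddit name
        "output/ppmi_svd",             # directory holding <subreddit>.txt SVD vectors
        "output/socialsent_lexicons",  # where the induced lexicon is written
        "output/vocab",                # directory holding <subreddit>.txt vocab counts
        topn=5000, bstrp=False, nn=25, beta=0.9)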
def evaluate_methods():
    """
    Evaluates different methods on standard English.
    """
    print "Getting evaluation words.."
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if not word in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation("GIGA", constants.GOOGLE_EMBEDDINGS,
            eval_words.union(positive_seeds).union(negative_seeds))
    embed_words = set(common_embed.iw)
    eval_words = eval_words.intersection(embed_words)

    eval_words = [word for word in eval_words
            if not word in positive_seeds
            and not word in negative_seeds]
    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    # print
    # print "WordNet:"
    # evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)
    #
    # print "Densifier:"
    # polarities = run_method(positive_seeds, negative_seeds,
    #         common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
    #         method=polarity_induction_methods.bootstrap, score_method=polarity_induction_methods.densify,
    #         **DEFAULT_ARGUMENTS)
    # evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print "SentProp:"
    polarities = run_method(positive_seeds, negative_seeds,
            common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
            method=polarity_induction_methods.label_propagate_probabilistic,
            #method=polarity_induction_methods.bootstrap,
            beta=0.99, nn=10,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")
def evaluate_twitter_methods():
    np.random.seed(0)

    print "Getting evaluation words and embeddings.."
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("twitter", remove_neutral=True)
    scores = lexicons.load_lexicon("twitter-scores", remove_neutral=True)
    sent140 = lexicons.load_lexicon("140-scores", remove_neutral=False)

    # padding lexicon with neutral from GI
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(gi_neut,
            int((float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.twitter_seeds()
    embed = create_representation("GIGA", constants.TWITTER_EMBEDDINGS,
            set(lexicon.keys()).union(positive_seeds).union(negative_seeds))
    print len((set(positive_seeds).union(negative_seeds)).intersection(embed.iw))
    embed_words = set(embed.iw)
    s140_words = set(sent140.keys())
    eval_words = [word for word in lexicon
            if word in s140_words
            and not word in positive_seeds
            and not word in negative_seeds
            and word in embed_words]

    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    print "Sentiment 140"
    evaluate(sent140, lexicon, eval_words, tau_lexicon=scores)
    print

    print "SentProp"
    polarities = run_method(positive_seeds, negative_seeds,
            embed,
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.densify,
            lr=0.01, regularization_strength=0.5,
            **DEFAULT_ARGUMENTS)
    util.write_pickle(polarities, "twitter-test.pkl")
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)

    print "SentProp"
    polarities = run_method(positive_seeds, negative_seeds,
            embed,
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.random_walk,
            beta=0.9, nn=25,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)
def main(subreddit):
    const = get_constants(subreddit)
    word_dict = util.load_pickle(const['DICTS'])
    word_dict.filter_extremes(no_above=const['NO_ABOVE_2'], no_below=const['NO_BELOW'])
    to_keep = sorted(word_dict.dfs, key=lambda w: word_dict.dfs[w], reverse=True)[:5000]
    word_dict.filter_tokens(good_ids=to_keep)

    print("Create representation...")
    sub_vecs = create_representation('SVD', const['VECS'])

    if const["GENDER"]:
        pos_seeds, neg_seeds = seeds.gender_seeds()
    else:
        pos_seeds, neg_seeds = seeds.twitter_seeds()

    pos_seeds = list(set(subredditgen.normalize_text(' '.join(pos_seeds), const['STEMMING'])))
    neg_seeds = list(set(subredditgen.normalize_text(' '.join(neg_seeds), const['STEMMING'])))

    print("Get sub embedding...")
    sub_vecs = sub_vecs.get_subembed(
        set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds))

    print("Bootstrapping...")
    print("using seeds {} {}".format(pos_seeds, neg_seeds))
    pols = polarity_induction_methods.bootstrap(
        sub_vecs, pos_seeds, neg_seeds,
        return_all=True,
        nn=25, beta=0.9,
        boot_size=len(pos_seeds) - 2,
        num_boots=30, n_procs=10,
    )
    util.write_pickle(pols, const['POLARITIES'])
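# A minimal sketch (an assumption, not part of the repository) of how the pickle
# written by main() above could be inspected. With return_all=True the stored
# values may be collections of bootstrap scores, so each entry is reduced to a
# mean before ranking; the exact return structure is assumed here.
import numpy as np
from socialsent import util

def show_extremes(polarity_path, k=10):
    pols = util.load_pickle(polarity_path)
    means = {w: float(np.mean(v)) for w, v in pols.items()}
    ranked = sorted(means.items(), key=lambda kv: kv[1])
    print("most negative:", ranked[:k])
    print("most positive:", ranked[-k:])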
"Can not read vocab words from vocab file:{f}" .format(f=vocab_file_path)) pass if not vocab_words: print( "Could not get vocab words, Moving on to other embeddings.." ) continue else: polarities = None embeddings = None sorted_x = None try: print("Loading embeddings...") embeddings = create_representation( "GIGA", embedding_abs_file_path, set(vocab_words).union( pos_seeds).union(neg_seeds)) eval_words = [ word for word in embeddings.iw if not word in pos_seeds and not word in neg_seeds ] induction_method = "label_propagate_continuous" save_dir = os.path.join( SAVE_POLARITIES_DIR, yelp_category, vocab_n, induction_method) if not (os.path.exists(save_dir) and os.path.isdir(save_dir)): os.makedirs(save_dir)
vector_dir = sys.argv[2]
sent_lexicon_dir = sys.argv[3]
vocab_dir = sys.argv[4]

stop_words = set(stopwords.words('english'))
stop_words.add('<#S#>')

fname = os.path.join(vocab_dir, subreddit + '.txt')
with open(fname, 'r') as f:
    words = f.readlines()

top_5000 = [w.split()[0] for w in words if w not in stop_words][:5000]
pos_seeds, neg_seeds = seeds.twitter_seeds()  #Twitter seed words

vector_file = os.path.join(vector_dir, subreddit + '.txt')
embeddings = create_representation(
    'GIGA', vector_file,
    set(top_5000).union(pos_seeds).union(neg_seeds))

polarities = bootstrap(embeddings, pos_seeds, neg_seeds,
                       return_all=True, nn=25, beta=0.9,
                       num_boots=2, n_procs=10)
print polarities[0]

# polarities = random_walk(embeddings, pos_seeds, neg_seeds, beta=0.9, nn=25,
#                          num_boots=50, n_procs=10)

sorted_x = sorted(polarities.items(), key=operator.itemgetter(1))
from socialsent import seeds
from socialsent import lexicons
from socialsent.polarity_induction_methods import random_walk
from socialsent.evaluate_methods import binary_metrics
from socialsent.representations.representation_factory import create_representation

if __name__ == "__main__":
    print "Evaluating SentProp with 100 dimensional GloVe embeddings"
    print "Evaluating only binary classification performance on General Inquirer lexicon"
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=True)
    pos_seeds, neg_seeds = seeds.hist_seeds()
    embeddings = create_representation("GIGA", "data/example_embeddings/glove.6B.100d.txt",
            set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    eval_words = [word for word in embeddings.iw
            if not word in pos_seeds
            and not word in neg_seeds]

    # Using SentProp with 10 neighbors and beta=0.99
    polarities = random_walk(embeddings, pos_seeds, neg_seeds, beta=0.99, nn=10,
            sym=True, arccos=True)

    acc, auc, avg_per = binary_metrics(polarities, lexicon, eval_words)
    print "Accuracy with best threshold: {:0.2f}".format(acc)
    print "ROC AUC: {:0.2f}".format(auc)
    print "Average precision score: {:0.2f}".format(avg_per)
    word_set.update(tokens_without_sw)
    # if aux == 20:
    #     break
    # aux += 1
    #if word_set.__len__() > 20000:  # 5000 ~ 5 minutes. 20000 is quite heavy
    #    break  # 8000 reviews break my memory

word_list = list(word_set)
model = api.load('glove-wiki-gigaword-50')

# seeds have to exist in the model if densify is used, if they are not in the model an error occurs
# if using densify Keras backend set to Theano
pos_seeds, neg_seeds = seeds.hist_seeds()
#neg_seeds = ["n***a", "bitch", "f****t", "nigger", "asshole", "m**********r", "redneck", "wetback", "retard", "gipsy"]

print('Creating representations')
embeddings = create_representation("GIGA_fast", model, word_list + pos_seeds + neg_seeds)
#embedding_explicit = create_representation("Explicit", args.corpus)

print('Generating socialsent and densify dictionary')
tic = time.time()
polarities_socialsent = random_walk(embeddings, pos_seeds, neg_seeds, beta=0.99, nn=10,
                                    sym=True, arccos=True)
toc = time.time()
print('Time socialsent algorithm: ', toc - tic)

polarities_densify = densify(embeddings, pos_seeds, neg_seeds, beta=0.99, nn=10,
                             sym=True, arccos=True)
tac = time.time()
print('Time densify algorithm: ', tac - toc)

# print('Generating pmi')
#polarities_pmi = pmi(embedding_explicit, pos_seeds, neg_seeds)

# polarities_socialsent = dict(polarities_socialsent)
# polarities_densify = dict(polarities_densify)

# values of polarities are float32 and they are needed to be float64 to be serializable by json
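# The comment above notes that the induced scores come back as numpy float32, which
# the json module cannot serialize directly. A minimal sketch of that conversion
# follows; the output filename is an assumption for illustration.
import json

polarities_socialsent = {w: float(s) for w, s in polarities_socialsent.items()}
polarities_densify = {w: float(s) for w, s in polarities_densify.items()}
with open('polarities.json', 'w') as out:
    json.dump({'socialsent': polarities_socialsent, 'densify': polarities_densify}, out)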
from socialsent import seeds
from socialsent import lexicons
from socialsent.polarity_induction_methods import random_walk
from socialsent.evaluate_methods import binary_metrics
from socialsent.representations.representation_factory import create_representation

if __name__ == "__main__":
    print("Evaluating SentProp with 100 dimensional GloVe embeddings")
    print("Evaluating only binary classification performance on General Inquirer lexicon")
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=True)
    pos_seeds, neg_seeds = seeds.hist_seeds()
    embeddings = create_representation("GIGA", "data/example_embeddings/glove.6B.100d.txt",
            set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    eval_words = [word for word in embeddings.iw
            if not word in pos_seeds
            and not word in neg_seeds]

    # Using SentProp with 10 neighbors and beta=0.99
    polarities = random_walk(embeddings, pos_seeds, neg_seeds, beta=0.99, nn=10,
            sym=True, arccos=True)

    auc, avg_per = binary_metrics(polarities, lexicon, eval_words)
    print("ROC AUC: {:0.2f}".format(auc))
    print("Average precision score: {:0.2f}".format(avg_per))
from collections import defaultdict
from socialsent.representations.representation_factory import create_representation

if __name__ == "__main__":
    seeds_map = defaultdict(list)
    labeled_words = []
    f = open('./socialsent/labeled_words.txt')
    for l in f:
        w, label = l.strip().split('\t')
        seeds_map[int(label)].append(w)
        labeled_words.append(w)

    unlabeled_words = []
    for l in open('./socialsent/unlabeled_words.txt'):
        unlabeled_words.append(l.strip())

    embeddings = create_representation(
        "GIGA", "data/example_embeddings/gensim_model_20.model.txt",
        set(unlabeled_words).union(set(labeled_words)))
    eval_words = [word for word in embeddings.iw if word not in set(labeled_words)]

    # Using SentProp with 10 neighbors and beta=0.99
    #polarities = random_walk(embeddings, seeds_map, beta=0.7, nn=10,
    #        sym=True, arccos=False)
    #point_estimates = dict([(w, polarities[w].most_common()[0][0]) for w in polarities])
    #print "sleep_with", polarities["sleep_with"]
    #print "boner", polarities["boner"]
    #print "finger", polarities["finger"]
    #print "pills", polarities["pills"]
    #polarities = label_propagate_probabilistic(embeddings, seeds_map)
def evaluate_adj_methods():
    """
    Evaluate different methods on standard English, but restrict to words
    that are present in the 1990s portion of historical data.
    """
    print "Getting evaluation words and embeddings.."
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())
    adjs = vocab.pos_words("1990", "ADJ")

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if not word in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.adj_seeds()

    common_embed = create_representation("GIGA", constants.COMMON_EMBEDDINGS,
            eval_words.union(positive_seeds).union(negative_seeds))
    common_words = set(common_embed.iw)
    eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD", constants.COHA_EMBEDDINGS + "2000")
    hist_counts = create_representation("Explicit", constants.COUNTS + "1990", normalize=False)
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    embed_words = [word for word in adjs if word in hist_words and word in common_words]
    eval_words = [word for word in eval_words
            if word in embed_words
            and not word in positive_seeds
            and not word in negative_seeds]

    hist_counts = hist_counts.get_subembed(set(eval_words).union(positive_seeds).union(negative_seeds),
            restrict_context=False)

    print "Evaluating with ", len(eval_words), "out of", len(lexicon)
    print "Embeddings with ", len(embed_words)

    print "PMI"
    polarities = run_method(positive_seeds, negative_seeds,
            hist_counts,
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.pmi,
            boot_size=6,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print

    evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)

    print "Dist with 1990s Fic embeddings"
    polarities = run_method(positive_seeds, negative_seeds,
            hist_embed.get_subembed(set(embed_words).union(negative_seeds).union(positive_seeds)),
            method=polarity_induction_methods.dist,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print

    print "Densifier with 1990s Fic embeddings"
    polarities = run_method(positive_seeds, negative_seeds,
            hist_embed.get_subembed(set(embed_words).union(negative_seeds).union(positive_seeds)),
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.densify,
            boot_size=6,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print

    print "SentProp with 1990s Fic embeddings"
    polarities = run_method(positive_seeds, negative_seeds,
            hist_embed.get_subembed(set(embed_words).union(negative_seeds).union(positive_seeds)),
            method=polarity_induction_methods.bootstrap,
            nn=25, beta=0.9,
            boot_size=6,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print

    print "Velikovich with 1990s Fic embeddings"
    hist_counts.normalize()
    polarities = run_method(positive_seeds, negative_seeds,
            hist_counts,
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.graph_propagate,
            T=3,
            boot_size=6,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print

    print "SentProp with CC"
    polarities = run_method(positive_seeds, negative_seeds,
            common_embed.get_subembed(set(embed_words).union(negative_seeds).union(positive_seeds)),
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.random_walk,
            beta=0.99, nn=10,
            boot_size=6,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print "Densifier with CC"
    polarities = run_method(positive_seeds, negative_seeds,
            common_embed.get_subembed(set(embed_words).union(negative_seeds).union(positive_seeds)),
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.densify,
            boot_size=6,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
def evaluate_finance_methods():
    np.random.seed(0)
    print "Getting evaluation words and embeddings.."
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("finance", remove_neutral=True)

    ### padding in neutrals from GI lexicon
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(gi_neut,
            int((float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.finance_seeds()
    stock_embed = create_representation("SVD", constants.STOCK_EMBEDDINGS)
    stock_counts = create_representation("Explicit", constants.STOCK_COUNTS)
    common_embed = create_representation("GIGA", constants.COMMON_EMBEDDINGS,
            set(lexicon.keys()).union(positive_seeds).union(negative_seeds))

    stock_words = set(stock_embed.iw)
    common_words = set(common_embed)
    eval_words = [word for word in lexicon
            if word in stock_words
            and word in common_words
            and not word in positive_seeds
            and not word in negative_seeds]

    stock_counts = stock_counts.get_subembed(set(eval_words).union(positive_seeds).union(negative_seeds),
            restrict_context=False)

    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    print "Velikovich with 1990s Fic embeddings"
    stock_counts.normalize()
    polarities = run_method(positive_seeds, negative_seeds,
            stock_counts,
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.graph_propagate,
            T=3,
            boot_size=6,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=None)
    print

    print "PMI"
    polarities = run_method(positive_seeds, negative_seeds,
            stock_counts,
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.pmi,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)
    print

    print "SentProp with stock embeddings"
    polarities = run_method(positive_seeds, negative_seeds,
            stock_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
            method=polarity_induction_methods.bootstrap,
            beta=0.9, nn=25,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)

    print "Densifier with stock embeddings"
    polarities = run_method(positive_seeds, negative_seeds,
            stock_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.densify,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)
labeled_words_file = sys.argv[1]
unlabeled_words_file = sys.argv[2]
embeddings_file = sys.argv[3]
output_file_prefix = sys.argv[4]

seeds_map = defaultdict(list)
labeled_words = []
f = open(labeled_words_file)
for l in f:
    w, label = l.strip().split('\t')
    seeds_map[int(label)].append(w)
    labeled_words.append(w)

unlabeled_words = []
for l in open(unlabeled_words_file):
    unlabeled_words.append(l.strip())

embeddings = create_representation("GIGA", embeddings_file,
        set(unlabeled_words).union(set(labeled_words)))
eval_words = [word for word in embeddings.iw if word not in set(labeled_words)]

# Using SentProp with 10 neighbors and beta=0.99
polarities = random_walk(embeddings, seeds_map, beta=0.7, nn=10,
        sym=True, arccos=False)
point_estimates = dict([(w, polarities[w].most_common()[0][0])
        for w in polarities if w in unlabeled_words])
pickle.dump(polarities, open("{}_{}.pkl".format(output_file_prefix, "socialsent"), 'wb'))
df = pd.DataFrame().from_records(point_estimates.items(), columns=['word', 'label'])
df.to_csv("{}_{}.csv".format(output_file_prefix, "socialsent"), sep='\t', encoding='utf-8')

polarities = label_propagate_probabilistic(embeddings, seeds_map)
point_estimates = dict([(w, polarities[w].most_common()[0][0])
        for w in polarities if w in unlabeled_words])
pickle.dump(polarities, open("{}_{}.pkl".format(output_file_prefix, "labelprop"), 'wb'))
df = pd.DataFrame().from_records(point_estimates.items(), columns=['word', 'label'])
df.to_csv("{}_{}.csv".format(output_file_prefix, "labelprop"), sep='\t', encoding='utf-8')
X_test, Y_test = helpers.load_data('data/test_politics.csv')

# First predict polarity using the general purpose lexicon
lexicon = json.load(open('data/lexicons/duoman.json', 'r'))
Y_pred = helpers.pred_function(X_test, lexicon)
accuracy = accuracy_score(Y_test, Y_pred)
print('Accuracy for general lexicon: {}'.format(accuracy))

# Use SentProp with 10 neighbors and beta=0.9
print('Running SentProp..')
pos_seeds, neg_seeds = get_poliseeds()
vocab = helpers.get_vocab('data/vocab.txt')
embedding_file = "data/example_embeddings/politics.txt"
embeddings = create_representation("GIGA", embedding_file, vocab)
polarities = random_walk(embeddings, pos_seeds, neg_seeds, nn=10,
                         sym=True, arccos=True)

# Adapt the general purpose lexicon for domain specific use (with optimal parameters)
print('Running lexicon adaptation..')
new_lexicon = run_lexicon_adaptations(lexicon, polarities, 0.06, 0.58, 0.25)
Y_pred = helpers.pred_function(X_test, new_lexicon)
accuracy = accuracy_score(Y_test, Y_pred)
print('Accuracy for adapted lexicon: {}'.format(accuracy))