def run(count_path, out_path, smooth=0, cds=True, normalize=False, neg=1):
    """
    Builds a (smoothed) PPMI matrix from a raw co-occurrence count matrix and
    writes it to disk together with its vocabulary index.
    """
    counts = create_representation("Explicit", count_path, normalize=False)
    old_mat = counts.m
    index = counts.wi
    smooth = old_mat.sum() * smooth

    # getting marginal probs
    row_probs = old_mat.sum(1) + smooth
    col_probs = old_mat.sum(0) + smooth
    if cds:
        # context-distribution smoothing (raise context counts to the 0.75 power)
        col_probs = np.power(col_probs, 0.75)
    row_probs = row_probs / row_probs.sum()
    col_probs = col_probs / col_probs.sum()

    # building PPMI matrix
    ppmi_mat = make_ppmi_mat(old_mat, row_probs, col_probs, smooth,
                             neg=neg, normalize=normalize)
    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from socialsent3.representations import sparse_io
    sparse_io.export_mat_eff(ppmi_mat.row, ppmi_mat.col, ppmi_mat.data,
                             out_path + ".bin")
    util.write_pickle(index, out_path + "-index.pkl")
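# Usage sketch (not part of the original module): builds a context-distribution-smoothed,
# shifted PPMI matrix from a co-occurrence count file. The paths and smoothing value below
# are hypothetical placeholders, not files shipped with the repo.
if __name__ == "__main__":
    run("data/example_counts/1990", "data/example_ppmi/1990",
        smooth=1e-5, cds=True, neg=1)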
def worker(proc_num, queue):
    while True:
        # stagger workers so they do not all hit the queue and disk at once
        time.sleep(random.random() * 10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return
        positive_seeds, negative_seeds = seeds.adj_seeds()
        year = str(year)
        print(proc_num, "On year", year)
        words = vocab.pos_words(year, "jj")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)
        polarities = polarity_induction_methods.bootstrap(
            embed.get_subembed(words.union(positive_seeds).union(negative_seeds)),
            positive_seeds, negative_seeds,
            score_method=polarity_induction_methods.random_walk,
            num_boots=50, n_procs=20, return_all=True,
            beta=0.9, nn=25)
        util.write_pickle(polarities,
                          constants.POLARITIES + year + '-coha-adj-boot.pkl')
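# Minimal driver sketch for the worker above (not from the original file): it fans the COHA
# decades out over a multiprocessing queue. The decade range, process count, and the helper
# name run_workers_sketch are assumptions made only for illustration.
def run_workers_sketch(n_procs=8):
    from multiprocessing import Process, Queue
    queue = Queue()
    for year in range(1850, 2010, 10):
        queue.put(year)
    procs = [Process(target=worker, args=(i, queue)) for i in range(n_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()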
def worker(proc_num, queue, iter):
    while True:
        time.sleep(random.random() * 10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return
        np.random.seed()
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print(proc_num, "On year", year)
        words = vocab.pos_words(year, "ADJ")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        print(year, len(words))
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)
        print(year, len(words))
        # counts = create_representation("Explicit", constants.COHA_COUNTS + year, normalize=False)
        # ppmi = create_representation("Explicit", constants.COHA_PPMI + year)
        weight = _make_weight(float(year))
        print(year, weight)
        embed = embed.get_subembed(words)
        test_embed = make_synthetic_data(embed, embed, words, weight, seed_offset=iter)
        polarities = evaluate_methods.run_method(
            positive_seeds, negative_seeds, test_embed,
            method=polarity_induction_methods.random_walk,
            beta=0.9, nn=25,
            **evaluate_methods.DEFAULT_ARGUMENTS)
        util.write_pickle(polarities,
                          constants.POLARITIES + year + '-synth-adj-coha-' + str(iter) + '.pkl')
def main(args):
    print('Loading data...')
    train_sents, train_labels = [], []
    with open(args.TRAIN, 'rt') as f:
        lines = f.readlines()
        for l in lines:
            cols = l.split('\t')
            train_labels.append(1 if cols[0] == 'Positive' else 0)
            train_sents.append(cols[1].strip())
    test_sents, test_labels = [], []
    with open(args.TEST, 'rt') as f:
        lines = f.readlines()
        for l in lines:
            cols = l.split('\t')
            test_labels.append(1 if cols[0] == 'Positive' else 0)
            test_sents.append(cols[1].strip())
    pos_seeds, neg_seeds = seeds.review_seeds()

    print('Creating word vectors...')
    embeddings = create_representation("FULL", args.EMBED, 100, limit=30000)

    print('Calculating polarities...')
    polarities = random_walk(embeddings, pos_seeds, neg_seeds,
                             beta=0.99, nn=10, sym=True, arccos=True)

    print('Storing polarities...')
    dict2csv(polarities, path='./data/polarities/default.csv')

    print('Creating training sentence representations...')
    train_reps = []
    for i, sent in enumerate(train_sents):
        print('\t%d/%d' % (i + 1, len(train_sents)), end='\r')
        rep = sent2rep(sent, polarities)
        train_reps.append(rep)
    print()

    print('Creating testing sentence representations...')
    test_reps = []
    for i, sent in enumerate(test_sents):
        print('\t%d/%d' % (i + 1, len(test_sents)), end='\r')
        rep = sent2rep(sent, polarities)
        test_reps.append(rep)
    print()

    km = KMeans(n_clusters=2, verbose=1, max_iter=10000)
    train_preds = km.fit_predict(train_reps)
    test_preds = km.predict(test_reps)

    print('\nMetrics on train set:')
    evaluate(train_labels, train_preds)
    print('\nMetrics on test set:')
    evaluate(test_labels, test_preds)
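# For context, a minimal sketch of what a polarity-based sentence representation could look
# like. The real sent2rep used above is defined elsewhere in this repo, so the feature set
# here (mean / min / max / count of scored tokens) is only an assumption for illustration.
def sent2rep_sketch(sent, polarities):
    import nltk
    import numpy as np
    scores = [polarities[w] for w in nltk.word_tokenize(sent.lower()) if w in polarities]
    if not scores:
        return [0.0, 0.0, 0.0, 0.0]
    return [float(np.mean(scores)), float(np.min(scores)),
            float(np.max(scores)), float(len(scores))]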
def evaluate_methods():
    """
    Evaluates different methods on standard English.
    """
    print("Getting evaluation words..")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if not word in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation(
        "GIGA", constants.GOOGLE_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    embed_words = set(common_embed.iw)
    eval_words = eval_words.intersection(embed_words)

    eval_words = [word for word in eval_words
                  if not word in positive_seeds and not word in negative_seeds]
    print("Evaluating with ", len(eval_words), "out of", len(lexicon))

    print("SentProp:")
    polarities = run_method(
        positive_seeds, negative_seeds,
        common_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.label_propagate_probabilistic,
        # method=polarity_induction_methods.bootstrap,
        beta=0.99, nn=10,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")
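# A rough sketch of the rank-correlation part of the evaluation above (the real evaluate()
# lives elsewhere in this repo): Kendall's tau between induced polarities and the continuous
# Kuperman ratings, restricted to evaluation words present in both dictionaries.
def tau_sketch(polarities, tau_lexicon, eval_words):
    from scipy.stats import kendalltau
    pairs = [(polarities[w], tau_lexicon[w]) for w in eval_words if w in tau_lexicon]
    tau, p_value = kendalltau([p for p, _ in pairs], [t for _, t in pairs])
    return tau, p_value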
from socialsent3 import seeds
from socialsent3 import lexicons
from socialsent3.polarity_induction_methods import random_walk
from socialsent3.evaluate_methods import binary_metrics
from socialsent3.representations.representation_factory import create_representation


if __name__ == "__main__":
    # print("Evaluating SentProp with 100 dimensional GloVe embeddings")
    print("Evaluating SentProp with 300 dimensional fastText embeddings")
    print("Evaluating only binary classification performance on General Inquirer lexicon")
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=True)
    pos_seeds, neg_seeds = seeds.hist_seeds()
    # embeddings = create_representation("GIGA", "socialsent3/data/example_embeddings/glove.6B.100d.txt",
    #                                    set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    embeddings = create_representation("GIGA", "socialsent3/data/example_embeddings/imdb.en.vec",
                                       set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    eval_words = [word for word in embeddings.iw
                  if word not in pos_seeds and word not in neg_seeds]
    # Using SentProp with 10 neighbors and beta=0.99
    polarities = random_walk(embeddings, pos_seeds, neg_seeds,
                             beta=0.99, nn=10, sym=True, arccos=True)
    acc, auc, avg_per = binary_metrics(polarities, lexicon, eval_words)
    print("Accuracy with best threshold: {:0.2f}".format(acc))
    print("ROC AUC: {:0.2f}".format(auc))
    print("Average precision score: {:0.2f}".format(avg_per))
def hyperparam_eval():
    print("Getting evaluation words and embeddings")
    lexicon = lexicons.load_lexicon("bingliu", remove_neutral=False)
    eval_words = set(lexicon.keys())

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation(
        "GIGA", constants.COMMON_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    common_words = set(common_embed.iw)
    eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD", constants.SVD_EMBEDDINGS + "1990")
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    eval_words = [word for word in eval_words
                  if not word in positive_seeds and not word in negative_seeds]

    print("SentProp...")
    for nn in [5, 10, 25, 50]:
        for beta in [0.8, 0.9, 0.95, 0.99]:
            print("Common")
            polarities = run_method(
                positive_seeds, negative_seeds,
                common_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(positive_seeds)),
                method=polarity_induction_methods.random_walk,
                nn=nn, beta=beta,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words)
            print("Hist")
            polarities = run_method(
                positive_seeds, negative_seeds,
                hist_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(positive_seeds)),
                method=polarity_induction_methods.random_walk,
                nn=nn, beta=beta,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words)

    print("Densify...")
    for lr in [0.001, 0.01, 0.1, 0.5]:
        for reg in [0.001, 0.01, 0.1, 0.5]:
            print("LR : ", lr, "Reg: ", reg)
            print("Common")
            polarities = run_method(
                positive_seeds, negative_seeds,
                common_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(positive_seeds)),
                method=polarity_induction_methods.densify,
                lr=lr, regularization_strength=reg,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words, tern=False)
            print("Hist")
            polarities = run_method(
                positive_seeds, negative_seeds,
                hist_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(positive_seeds)),
                method=polarity_induction_methods.densify,
                lr=lr, regularization_strength=reg,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words, tern=False)
def evaluate_twitter_methods():
    np.random.seed(0)

    print("Getting evaluation words and embeddings..")
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("twitter", remove_neutral=True)
    scores = lexicons.load_lexicon("twitter-scores", remove_neutral=True)
    sent140 = lexicons.load_lexicon("140-scores", remove_neutral=False)

    # padding lexicon with neutral from GI
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(
        gi_neut,
        int((float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.twitter_seeds()
    embed = create_representation(
        "GIGA", constants.TWITTER_EMBEDDINGS,
        set(lexicon.keys()).union(positive_seeds).union(negative_seeds))
    print(len((set(positive_seeds).union(negative_seeds)).intersection(embed.iw)))
    embed_words = set(embed.iw)
    s140_words = set(sent140.keys())
    eval_words = [word for word in lexicon
                  if word in s140_words
                  and not word in positive_seeds
                  and not word in negative_seeds
                  and word in embed_words]

    print("Evaluating with ", len(eval_words), "out of", len(lexicon))

    print("Sentiment 140")
    evaluate(sent140, lexicon, eval_words, tau_lexicon=scores)
    print()

    print("Densifier")
    polarities = run_method(positive_seeds, negative_seeds,
                            embed,
                            method=polarity_induction_methods.bootstrap,
                            score_method=polarity_induction_methods.densify,
                            lr=0.01, regularization_strength=0.5,
                            **DEFAULT_ARGUMENTS)
    util.write_pickle(polarities, "twitter-test.pkl")
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)

    print("SentProp")
    polarities = run_method(positive_seeds, negative_seeds,
                            embed,
                            method=polarity_induction_methods.bootstrap,
                            score_method=polarity_induction_methods.random_walk,
                            beta=0.9, nn=25,
                            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)
def evaluate_finance_methods():
    np.random.seed(0)
    print("Getting evaluation words and embeddings..")
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("finance", remove_neutral=True)

    # padding in neutrals from GI lexicon
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(
        gi_neut,
        int((float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.finance_seeds()
    stock_embed = create_representation("SVD", constants.STOCK_EMBEDDINGS)
    stock_counts = create_representation("Explicit", constants.STOCK_COUNTS)
    common_embed = create_representation(
        "GIGA", constants.COMMON_EMBEDDINGS,
        set(lexicon.keys()).union(positive_seeds).union(negative_seeds))

    stock_words = set(stock_embed.iw)
    common_words = set(common_embed.iw)
    eval_words = [word for word in lexicon
                  if word in stock_words
                  and word in common_words
                  and not word in positive_seeds
                  and not word in negative_seeds]

    stock_counts = stock_counts.get_subembed(
        set(eval_words).union(positive_seeds).union(negative_seeds),
        restrict_context=False)

    print("Evaluating with ", len(eval_words), "out of", len(lexicon))

    print("Velikovich with stock counts")
    stock_counts.normalize()
    polarities = run_method(
        positive_seeds, negative_seeds,
        stock_counts,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.graph_propagate,
        T=3, boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=None)
    print()

    print("PMI")
    polarities = run_method(positive_seeds, negative_seeds,
                            stock_counts,
                            method=polarity_induction_methods.bootstrap,
                            score_method=polarity_induction_methods.pmi,
                            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)
    print()

    print("SentProp with stock embeddings")
    polarities = run_method(
        positive_seeds, negative_seeds,
        stock_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        beta=0.9, nn=25,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)

    print("Densifier with stock embeddings")
    polarities = run_method(
        positive_seeds, negative_seeds,
        stock_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)
def evaluate_adj_methods():
    """
    Evaluate different methods on standard English, but restrict to words
    that are present in the 1990s portion of historical data.
    """
    print("Getting evaluation words and embeddings..")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())
    adjs = vocab.pos_words("1990", "ADJ")

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if not word in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.adj_seeds()

    common_embed = create_representation(
        "GIGA", constants.COMMON_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    common_words = set(common_embed.iw)
    eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD", constants.COHA_EMBEDDINGS + "2000")
    hist_counts = create_representation("Explicit", constants.COUNTS + "1990",
                                        normalize=False)
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    embed_words = [word for word in adjs
                   if word in hist_words and word in common_words]
    eval_words = [word for word in eval_words
                  if word in embed_words
                  and not word in positive_seeds
                  and not word in negative_seeds]

    hist_counts = hist_counts.get_subembed(
        set(eval_words).union(positive_seeds).union(negative_seeds),
        restrict_context=False)

    print("Evaluating with ", len(eval_words), "out of", len(lexicon))
    print("Embeddings with ", len(embed_words))

    print("PMI")
    polarities = run_method(positive_seeds, negative_seeds,
                            hist_counts,
                            method=polarity_induction_methods.bootstrap,
                            score_method=polarity_induction_methods.pmi,
                            boot_size=6,
                            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)

    print("Dist with 1990s Fic embeddings")
    polarities = run_method(
        positive_seeds, negative_seeds,
        hist_embed.get_subembed(
            set(embed_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.dist,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("Densifier with 1990s Fic embeddings")
    polarities = run_method(
        positive_seeds, negative_seeds,
        hist_embed.get_subembed(
            set(embed_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("SentProp with 1990s Fic embeddings")
    polarities = run_method(
        positive_seeds, negative_seeds,
        hist_embed.get_subembed(
            set(embed_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        nn=25, beta=0.9, boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("Velikovich with 1990s Fic embeddings")
    hist_counts.normalize()
    polarities = run_method(
        positive_seeds, negative_seeds,
        hist_counts,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.graph_propagate,
        T=3, boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("SentProp with CC")
    polarities = run_method(
        positive_seeds, negative_seeds,
        common_embed.get_subembed(
            set(embed_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        beta=0.99, nn=10, boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print("Densifier with CC")
    polarities = run_method(
        positive_seeds, negative_seeds,
        common_embed.get_subembed(
            set(embed_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
def main(args):
    print('Loading data...')
    train_sents, train_labels = [], []
    with open(args.TRAIN, 'rt') as f:
        lines = f.readlines()
        for l in lines:
            cols = l.split('\t')
            train_labels.append(1 if cols[0] == 'Positive' else 0)
            train_sents.append(cols[1].strip())
    test_sents, test_labels = [], []
    with open(args.TEST, 'rt') as f:
        lines = f.readlines()
        for l in lines:
            cols = l.split('\t')
            test_labels.append(1 if cols[0] == 'Positive' else 0)
            test_sents.append(cols[1].strip())
    pos_seeds, neg_seeds = seeds.review_seeds()

    print('Creating word vectors...')
    embeddings = create_representation("FULL", args.EMBED, 100, limit=50000)

    print('Calculating polarities...')
    polarities = random_walk(embeddings, pos_seeds, neg_seeds,
                             beta=0.99, nn=10, sym=True, arccos=True)

    print('Filtering polarities...')
    polarities = filter_polarities(polarities, args.CUTOFF)

    print('Storing polarities...')
    dict2csv(polarities, path='./data/polarities/filtered.csv')

    word_list = list(polarities.keys())
    train_reps, test_reps = [], []
    if args.ALGO == 'tf-idf':
        tfidf = TfidfVectorizer(vocabulary=word_list, tokenizer=nltk.word_tokenize)
        print('Creating training sentence representations...')
        train_reps = tfidf.fit_transform(train_sents)
        print('Creating testing sentence representations...')
        # transform (not fit_transform) so test reps use the idf weights learned on train
        test_reps = tfidf.transform(test_sents)
    else:
        print('Creating training sentence representations...')
        for i, sent in enumerate(train_sents):
            print('\t%d/%d' % (i + 1, len(train_sents)), end='\r')
            rep = sent2rep(sent, word_list=word_list)
            train_reps.append(rep)
        print()
        print('Creating testing sentence representations...')
        for i, sent in enumerate(test_sents):
            print('\t%d/%d' % (i + 1, len(test_sents)), end='\r')
            rep = sent2rep(sent, word_list=word_list)
            test_reps.append(rep)
        print()

    if args.LSA != 0:
        print('Transforming w/ LSA...')
        svd = TruncatedSVD(args.LSA)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        train_reps = lsa.fit_transform(train_reps)
        # project test reps with the LSA pipeline fitted on the training reps
        test_reps = lsa.transform(test_reps)

    km = KMeans(n_clusters=2, verbose=1, max_iter=10000)
    train_preds = km.fit_predict(train_reps)
    test_preds = km.predict(test_reps)

    print('\nMetrics on train set:')
    evaluate(train_labels, train_preds)
    print('\nMetrics on test set:')
    evaluate(test_labels, test_preds)
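# KMeans cluster ids are arbitrary (cluster 0 is not necessarily "negative"), so an evaluate()
# along the lines used above typically needs to align clusters with labels first. A minimal
# sketch of that alignment, assuming binary labels and predictions; the helper name is
# hypothetical and not part of the original module.
def align_clusters_sketch(labels, preds):
    import numpy as np
    labels, preds = np.asarray(labels), np.asarray(preds)
    acc_as_is = (labels == preds).mean()
    acc_flipped = (labels == 1 - preds).mean()
    # keep the cluster labeling that agrees best with the gold labels
    return preds if acc_as_is >= acc_flipped else 1 - preds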