def main():
    """Regress each word's rate of polarity change on its initial polarity
    and show an interactive scatter plot (click a point to see its series)."""
    timeseries = build_timeseries(years=constants.YEARS)
    # plot_timeseries(timeseries, "awesome")
    ws = []
    p_vs_dp = []
    ordering = util.load_pickle(constants.DATA + 'word_ordering.pkl')
    for w, polarities in timeseries.items():
        # Restrict to the 50,000 most frequent words.
        if ordering.index(w) < 50000:
            ws.append(w)
            # (mean polarity over the first five decades, slope of the series)
            p_vs_dp.append((np.mean(polarities[:5]), slope(polarities)))
    xs, ys = zip(*p_vs_dp)
    X = np.array(xs).reshape(-1, 1)
    pred = LinearRegression().fit(X, ys).predict(X)
    print("R2 score", r2_score(ys, pred))

    def onpick(event):
        # Plot the full time series for the first word under the click.
        for i in event.ind:
            w = ws[i]
            print(w, p_vs_dp[i])
            plot_timeseries(timeseries, w)
            break

    figure = plt.figure()
    plt.scatter(xs, ys, marker='o', linewidths=0, picker=True, alpha=1)
    plt.xlabel('polarity in 1800')
    plt.ylabel('rate of polarity increase')
    figure.canvas.mpl_connect('pick_event', onpick)
    plt.show()
def build_boot_ztimeseries(suffix="-test", years=constants.YEARS):
    """For each word, build the time series of its bootstrap-mean z-scored
    polarity, plus the bootstrap standard deviation at each year."""
    mean_timeseries = defaultdict(list)
    stderr_timeseries = defaultdict(list)
    for year in years:
        polarities_list = util.load_pickle(
            constants.POLARITIES + year + suffix + '.pkl')
        # Per-bootstrap-sample mean and std, used to z-score each sample.
        means = [np.mean(list(polarities.values())) for polarities in polarities_list]
        stds = [np.std(list(polarities.values())) for polarities in polarities_list]
        zscore = lambda i, val: (val - means[i]) / stds[i]
        for w in polarities_list[0]:
            mean_timeseries[w].append(np.mean(
                [zscore(i, polarities[w]) for i, polarities in enumerate(polarities_list)]))
            stderr_timeseries[w].append(np.std(
                [zscore(i, polarities[w]) for i, polarities in enumerate(polarities_list)]))
    # Drop words that are not present in all decades.
    for w in list(mean_timeseries.keys()):
        if len(mean_timeseries[w]) < 5:
            del mean_timeseries[w]
            del stderr_timeseries[w]
    return mean_timeseries, stderr_timeseries
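# A minimal usage sketch (not from the original source): assumes matplotlib is
# installed and the polarity pickles referenced by constants.POLARITIES exist.
# The word "soviet" is a made-up example; any key of the returned dict works.
def _example_plot_boot_zseries(word="soviet"):
    import matplotlib.pyplot as plt
    means, errs = build_boot_ztimeseries(suffix="-test")
    years = [int(y) for y in constants.YEARS]
    plt.errorbar(years, means[word], yerr=errs[word], fmt='-o')
    plt.xlabel('year')
    plt.ylabel('z-scored polarity')
    plt.title(word)
    plt.show()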
def pos_tags(year):
    """Return a mapping from words to POS tags for the given year.
    The tag AMB means the tag is ambiguous."""
    year = str(year)
    return util.load_pickle(constants.POS + year + "-pos.pkl")
def get_boot_meanseriess(suffix="-test", years=constants.YEARS):
    """Return one mean-polarity time series per bootstrap sample,
    keyed by the sample index."""
    mean_timeseriess = defaultdict(list)
    for year in years:
        polarities_list = util.load_pickle(
            constants.POLARITIES + year + suffix + '.pkl')
        for i, polarities in enumerate(polarities_list):
            mean_timeseriess[i].append(np.mean(list(polarities.values())))
    return mean_timeseriess
def load_vocabulary(mat, path):
    """Load the row (iw) and column (ic) vocabularies for a matrix,
    preferring a matrix-local index over the directory-wide one."""
    local_index = path.split(".")[0] + "-index.pkl"
    if os.path.isfile(local_index):
        path = local_index
    else:
        print("Could not find local index. Attempting to load directory-wide index...")
        path = "/".join(path.split("/")[:-1]) + "/index.pkl"
    index = util.load_pickle(path)
    vocab = sorted(index, key=lambda word: index[word])
    iw = vocab[:mat.shape[0]]
    ic = vocab[:mat.shape[1]]
    return iw, ic
def main(subreddit):
    const = get_constants(subreddit)
    if os.path.exists(const['CORPUS']):
        print("Loading preexisting corpus...")
        corpus = util.load_pickle(const['CORPUS'])
    else:
        print("Getting and writing dictionary...")
        with open(const['OUTPUTS'], "r") as f:
            num_lines = sum(1 for line in f)
        with open(const['OUTPUTS'], "r") as f:
            dicts = (json.loads(comment) for comment in tqdm(f, total=num_lines))
            if const["INTERVAL"] is not None:
                # Split the corpus into one sub-corpus per score interval.
                corpuses = [[] for interval in const["ALL_INTERVALS"]]
                for comment in dicts:
                    i = get_interval_idx(comment["score"])
                    corpuses[i].append(normalize_text(comment["body"], const['STEMMING']))
                for i, interval in enumerate(const["ALL_INTERVALS"]):
                    util.write_pickle(corpuses[i], get_interval_fname(subreddit, interval))
                corpus = corpuses[0]
            else:
                corpus = [normalize_text(comment["body"], const['STEMMING'])
                          for comment in dicts]
    gdict = Dictionary(corpus)
    gdict.filter_extremes(no_above=const['NO_ABOVE_1'], no_below=const['NO_BELOW'])
    gdict.compactify()
    util.write_pickle(gdict.token2id, const['INDICES'])
    util.write_pickle(gdict, const['DICTS'])
    print("Generating word co-occurrences...")
    cooccurgen.run(word_gen(corpus, gdict, subreddit, len(corpus)),
                   gdict.token2id, 4, const['COUNTS'])
    print("Generating PPMI vectors...")
    ppmigen.run(subreddit, cds=True)
    print("Generating SVD vectors...")
    makelowdim.run(const['INDICES'], const['PPMI'], const['VECS'])
def get_boot_meanseries(suffix="-test", years=constants.YEARS):
    """Return the mean and standard deviation, across bootstrap samples,
    of the corpus-wide mean polarity in each year."""
    mean_timeseries = []
    stderr_timeseries = []
    for year in years:
        polarities_list = util.load_pickle(
            constants.POLARITIES + year + suffix + '.pkl')
        year_means = [np.mean(list(polarities.values()))
                      for polarities in polarities_list]
        mean_timeseries.append(np.mean(year_means))
        stderr_timeseries.append(np.std(year_means))
    return mean_timeseries, stderr_timeseries
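# A minimal usage sketch (not from the original source): print each year's
# corpus-wide mean polarity with its bootstrap spread, assuming the polarity
# pickles referenced by constants.POLARITIES exist.
def _example_print_boot_meanseries():
    means, errs = get_boot_meanseries(suffix="-test")
    for year, m, e in zip(constants.YEARS, means, errs):
        print("{}: {:.3f} +/- {:.3f}".format(year, m, e))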
def build_dictseries(raw=True, suffix="-test", years=constants.YEARS):
    """Build a word -> {year -> polarity} mapping. If raw is False, each
    polarity is replaced by its within-year rank, normalized to [0, 1)."""
    timeseries = defaultdict(lambda: defaultdict(lambda: float('nan')))
    for year in years:
        polarities = util.load_pickle(
            constants.POLARITIES + year + suffix + '.pkl')
        for i, (w, p) in enumerate(sorted(polarities.items(), key=itemgetter(1))):
            if not raw:
                polarities[w] = i / float(len(polarities))
            else:
                polarities[w] = p
        for w, p in polarities.items():
            timeseries[w][int(year)] = p
    return timeseries
def main(subreddit):
    out_path = OUT.format(subreddit)
    util.mkdir(out_path)
    print("Getting and writing dictionary...")
    gdict = util.load_pickle(DICTS.format(subreddit))
    gdict.filter_extremes(no_above=0.5, no_below=100)
    gdict.compactify()
    util.write_pickle(gdict.token2id, out_path + "index.pkl")
    print("Generating word co-occurrences...")
    cooccurgen.run(word_gen(COMMENTS.format(subreddit), gdict),
                   gdict.token2id, 4, out_path + "counts.bin")
    print("Generating PPMI vectors...")
    ppmigen.run(out_path + "counts.bin", out_path + "ppmi", cds=True)
    print("Generating SVD vectors...")
    makelowdim.run(out_path + "ppmi.bin", out_path + "vecs")
def build_boot_timeseries(suffix="-test", years=constants.YEARS):
    """For each word, build the time series of its bootstrap-mean polarity,
    plus the spread of the bootstrap samples at each year."""
    mean_timeseries = defaultdict(list)
    stderr_timeseries = defaultdict(list)
    for year in years:
        polarities_list = util.load_pickle(
            constants.POLARITIES + year + suffix + '.pkl')
        n = float(len(polarities_list))
        for w in polarities_list[0]:
            mean_timeseries[w].append(
                np.mean([polarities[w] for polarities in polarities_list]))
            stderr_timeseries[w].append(
                np.std([polarities[w] for polarities in polarities_list]) / n)
    # Drop words that are not present in all decades.
    for w in list(mean_timeseries.keys()):
        if len(mean_timeseries[w]) < 5:
            del mean_timeseries[w]
            del stderr_timeseries[w]
    return mean_timeseries, stderr_timeseries
def build_timeseries(raw=False, suffix="-test", years=constants.YEARS):
    """Build a word -> [polarity per year] mapping, keeping only words
    present in all decades. If raw is False, each polarity is replaced by
    its within-year rank, normalized to [0, 1)."""
    timeseries = defaultdict(list)
    for year in years:
        polarities = util.load_pickle(
            constants.POLARITIES + year + suffix + '.pkl')
        for i, (w, p) in enumerate(sorted(polarities.items(), key=itemgetter(1))):
            if not raw:
                polarities[w] = i / float(len(polarities))
            else:
                polarities[w] = p
        for w, p in polarities.items():
            timeseries[w].append(p)
    for w in list(timeseries.keys()):
        if len(timeseries[w]) < 5:
            del timeseries[w]
    return timeseries
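# A toy sketch of the rank-normalization step above (the scores are made up):
# sorting a year's polarities and replacing each score with rank / N maps the
# scores onto [0, 1) quantiles, making years with different score scales
# comparable.
def _example_rank_normalize():
    polarities = {'good': 0.9, 'bad': -0.7, 'table': 0.1}
    for i, (w, p) in enumerate(sorted(polarities.items(), key=itemgetter(1))):
        polarities[w] = i / float(len(polarities))
    print(polarities)  # {'bad': 0.0, 'table': 0.333..., 'good': 0.666...}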
def __init__(self, path, normalize=True, eig=0.0, **kwargs):
    ut = np.load(path + '-u.npy')
    s = np.load(path + '-s.npy')
    vocabfile = path + '-vocab.pkl'
    self.iw = load_pickle(vocabfile)
    self.wi = {w: i for i, w in enumerate(self.iw)}
    # Weight the left singular vectors by the singular values raised to the
    # eig exponent: 0.0 leaves them unweighted, 1.0 recovers U * S.
    if eig == 0.0:
        self.m = ut
    elif eig == 1.0:
        self.m = s * ut
    else:
        self.m = np.power(s, eig) * ut
    self.dim = self.m.shape[1]
    if normalize:
        self.normalize()
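# A minimal sketch of the eig weighting (not from the original source; the
# arrays are made up): eig interpolates between unweighted left singular
# vectors (eig=0.0) and the classical U * S projection (eig=1.0).
def _example_eig_weighting():
    ut = np.array([[0.5, 0.5],
                   [1.0, 0.0]])  # made-up left singular vectors, one row per word
    s = np.array([4.0, 1.0])     # made-up singular values
    for eig in (0.0, 0.5, 1.0):
        m = ut if eig == 0.0 else np.power(s, eig) * ut
        print(eig, m[0])  # row 0: [0.5 0.5] -> [1.  0.5] -> [2.  0.5]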
def main(subreddit):
    const = get_constants(subreddit)
    word_dict = util.load_pickle(const['DICTS'])
    word_dict.filter_extremes(no_above=const['NO_ABOVE_2'], no_below=const['NO_BELOW'])
    # Keep only the 5,000 words with the highest document frequency.
    to_keep = sorted(word_dict.dfs, key=lambda w: word_dict.dfs[w], reverse=True)[:5000]
    word_dict.filter_tokens(good_ids=to_keep)
    print("Create representation...")
    sub_vecs = create_representation('SVD', const['VECS'])
    if const["GENDER"]:
        pos_seeds, neg_seeds = seeds.gender_seeds()
    else:
        pos_seeds, neg_seeds = seeds.twitter_seeds()
    # Normalize the seed words the same way the corpus was normalized.
    pos_seeds = list(set(subredditgen.normalize_text(' '.join(pos_seeds), const['STEMMING'])))
    neg_seeds = list(set(subredditgen.normalize_text(' '.join(neg_seeds), const['STEMMING'])))
    print("Get sub embedding...")
    sub_vecs = sub_vecs.get_subembed(
        set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds))
    print("Bootstrapping...")
    print("using seeds {} {}".format(pos_seeds, neg_seeds))
    pols = polarity_induction_methods.bootstrap(
        sub_vecs, pos_seeds, neg_seeds,
        return_all=True,
        nn=25, beta=0.9,
        boot_size=len(pos_seeds) - 2,
        num_boots=30,
        n_procs=10,
    )
    util.write_pickle(pols, const['POLARITIES'])
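# A minimal sketch (not from the original source) of aggregating the bootstrap
# output: with return_all=True, `pols` appears to be one word-to-polarity dict
# per bootstrap sample, the same shape the build_boot_* loaders above consume.
# The dicts below are made up.
def _example_aggregate_bootstraps():
    import numpy as np
    pols = [{'good': 0.8, 'bad': -0.6}, {'good': 0.7, 'bad': -0.5}]
    summary = {w: (np.mean([p[w] for p in pols]), np.std([p[w] for p in pols]))
               for w in pols[0]}
    print(summary['good'])  # approx (0.75, 0.05)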
def build_boot_zdictseries(suffix="-test", years=constants.YEARS):
    """Like build_boot_ztimeseries, but keyed as word -> {year -> value}
    instead of word -> list, and without dropping any words."""
    mean_timeseries = defaultdict(lambda: defaultdict(lambda: float('nan')))
    stderr_timeseries = defaultdict(lambda: defaultdict(lambda: float('nan')))
    for year in years:
        polarities_list = util.load_pickle(
            constants.POLARITIES + year + suffix + '.pkl')
        means = [np.mean(list(polarities.values())) for polarities in polarities_list]
        stds = [np.std(list(polarities.values())) for polarities in polarities_list]
        zscore = lambda i, val: (val - means[i]) / stds[i]
        for w in polarities_list[0]:
            mean_timeseries[w][year] = np.mean(
                [zscore(i, polarities[w]) for i, polarities in enumerate(polarities_list)])
            stderr_timeseries[w][year] = np.std(
                [zscore(i, polarities[w]) for i, polarities in enumerate(polarities_list)])
    return mean_timeseries, stderr_timeseries
import numpy as np
from sklearn.utils.extmath import randomized_svd

from socialsent import util
from socialsent.representations.explicit import Explicit


def run(in_file, out_path, dim=300, keep_words=None):
    """Factor a PPMI matrix with truncated SVD and save U, S, V and the vocabulary."""
    base_embed = Explicit.load(in_file, normalize=False)
    if keep_words is not None:
        base_embed = base_embed.get_subembed(keep_words)
    u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
    np.save(out_path + "-u.npy", u)
    np.save(out_path + "-v.npy", v)
    np.save(out_path + "-s.npy", s)
    util.write_pickle(base_embed.iw, out_path + "-vocab.pkl")


if __name__ == '__main__':
    print("Getting keep words...")
    counts = util.load_pickle("/dfs/scratch0/COHA/cooccurs/lemma/1990-counts.pkl")
    keep_words = [word for word in counts if counts[word] >= 100]
    print("Running SVD...")
    run("/dfs/scratch0/COHA/cooccurs/lemma/testb-0-ppmi.bin.bin",
        "/dfs/scratch0/COHA/cooccurs/lemma/testb-0-svd",
        keep_words=keep_words)
    indices.extend(range(window_size + 1, 2 * window_size + 1))
    for i in indices:
        if i >= len(context):
            break
        pair_counts[(target, context[i])] += 1
    return pair_counts


class COHAWordGen(object):
    """Iterate over the words of every other COHA file (controlled by off),
    yielding only words that appear in the index."""

    def __init__(self, off, index):
        self.data_dir = "/dfs/scratch0/COHA/COHA_word_lemma_pos/1990/"
        self.off = off
        self.index = index

    def __iter__(self):
        for j, fname in enumerate(os.listdir(self.data_dir)):
            if j % 2 == self.off:
                continue
            print(fname)
            for i, line in enumerate(open(os.path.join(self.data_dir, fname))):
                word = line.split()[1].lower()
                if word in self.index:
                    yield word


if __name__ == "__main__":
    index = util.load_pickle("/dfs/scratch0/COHA/cooccurs/lemma/4/index.pkl")
    word_gen = COHAWordGen(0, index)
    run(word_gen, index, 4, "/dfs/scratch0/COHA/cooccurs/lemma/testb-0-counts.bin")
    # word_gen = COHAWordGen(1, index)
    # run(word_gen, index, 4, "/dfs/scratch0/COHA/cooccurs/lemma/test-1-counts.bin")
def load_vocabulary(mat, path):
    index = util.load_pickle(path)
    vocab = sorted(index, key=lambda word: index[word])
    iw = vocab[:mat.shape[0]]
    ic = vocab[:mat.shape[1]]
    return iw, ic
def words_above_freq(year, freq):
    freqs = util.load_pickle(constants.COHA_FREQS.format(year))
    return set(word for word in freqs if freqs[word] > freq)
def top_words(year, rank):
    year = int(year)
    freqs = util.load_pickle(constants.FREQS)
    return set(sorted(freqs, key=lambda w: freqs[w][year], reverse=True)[:rank])
@classmethod
def load(cls, path, normalize=True, add_context=True, **kwargs):
    """Load word vectors (optionally summed with the context vectors) from
    the -w.npy / -c.npy / -vocab.pkl files at path."""
    mat = np.load(path + "-w.npy")
    if add_context:
        mat += np.load(path + "-c.npy")
    iw = load_pickle(path + "-vocab.pkl")
    return cls(mat, iw, normalize)
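# A minimal sketch (not from the original source) of the word + context
# addition performed by load(): the -w and -c matrices are aligned
# (vocab, dim) arrays, so the sum is elementwise per word. The arrays below
# are made up.
def _example_add_context():
    w = np.array([[1.0, 0.0], [0.0, 1.0]])  # made-up word vectors
    c = np.array([[0.5, 0.5], [0.5, 0.5]])  # made-up context vectors
    print(w + c)  # [[1.5 0.5] [0.5 1.5]]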