Example #1
def main():
    timeseries = build_timeseries(years=constants.YEARS)
    #plot_timeseries(timeseries, "awesome")

    ws = []
    p_vs_dp = []
    ordering = util.load_pickle(constants.DATA + 'word_ordering.pkl')
    for w, polarities in timeseries.iteritems():
        if ordering.index(w) < 50000:
            ws.append(w)
            p_vs_dp.append((np.mean(polarities[:5]), slope(polarities)))

    xs, ys = zip(*p_vs_dp)
    pred = LinearRegression().fit(np.matrix(xs).T, ys).predict(np.matrix(xs).T)
    print "R2 score", r2_score(ys, pred)

    def onpick(event):
        for i in event.ind:
            w = ws[i]
            print w, p_vs_dp[i]
            plot_timeseries(timeseries, w)
            break

    figure = plt.figure()
    plt.scatter(xs, ys, marker='o', linewidths=0, picker=True, alpha=1)
    plt.xlabel('polarity in 1800')
    plt.ylabel('rate of polarity increase')
    figure.canvas.mpl_connect('pick_event', onpick)
    plt.show()
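The helpers slope() and plot_timeseries() are not shown in this excerpt. A minimal sketch of what slope() plausibly computes (the least-squares slope of a polarity series over its index), assuming evenly spaced time points:

import numpy as np

def slope(values):
    # Hypothetical helper: fit a degree-1 polynomial and return its slope.
    xs = np.arange(len(values))
    return np.polyfit(xs, values, 1)[0]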
Example #2
def build_boot_ztimeseries(suffix="-test", years=constants.YEARS):
    mean_timeseries = defaultdict(list)
    stderr_timeseries = defaultdict(list)
    for year in years:
        polarities_list = util.load_pickle(constants.POLARITIES + year +
                                           suffix + '.pkl')
        means = [
            np.mean(polarities.values()) for polarities in polarities_list
        ]
        stds = [np.std(polarities.values()) for polarities in polarities_list]
        zscore = lambda i, val: (val - means[i]) / stds[i]
        for w in polarities_list[0]:
            mean_timeseries[w].append(
                np.mean([
                    zscore(i, polarities[w])
                    for i, polarities in enumerate(polarities_list)
                ]))
            stderr_timeseries[w].append(
                np.std([
                    zscore(i, polarities[w])
                    for i, polarities in enumerate(polarities_list)
                ]))
    for w in mean_timeseries.keys():
        if len(mean_timeseries[w]) < 5:
            # print w + " is not present in all decades"
            del mean_timeseries[w]
            del stderr_timeseries[w]
    return mean_timeseries, stderr_timeseries
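A hedged usage sketch for the function above, assuming the per-decade polarity pickles it loads exist and the chosen word occurs in every decade:

# Hypothetical usage: inspect the z-scored bootstrap polarity series for one word.
means, stderrs = build_boot_ztimeseries(suffix="-test")
word = "awesome"
print(means[word])
print(stderrs[word])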
Example #3
def pos_tags(year):
    """
    Returns mapping from words to POS tags for year.
    AMB means tag is ambiguous.
    """
    year = str(year)
    pos_tags = util.load_pickle(constants.POS + year + "-pos.pkl")
    return pos_tags
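A hedged usage sketch, assuming the per-year POS pickles referenced by constants.POS exist:

# Hypothetical usage: look up the POS tag recorded for a word in 1850.
tags = pos_tags(1850)
print(tags.get("run"))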
Example #4
def pos_tags(year):
    """
    Returns mapping from words to POS tags for year.
    AMB means tag is ambiguous.
    """
    year = str(year)
    pos_tags = util.load_pickle(constants.POS + year + "-pos.pkl")
    return pos_tags
Example #5
def get_boot_meanseriess(suffix="-test", years=constants.YEARS):
    mean_timeseriess = defaultdict(list)
    for year in years:
        polarities_list = util.load_pickle(constants.POLARITIES + year +
                                           suffix + '.pkl')
        for i, polarities in enumerate(polarities_list):
            mean_timeseriess[i].append(np.mean(polarities.values()))
    return mean_timeseriess
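A hedged usage sketch, assuming the same bootstrap polarity pickles as above:

# Hypothetical usage: one yearly mean-polarity series per bootstrap sample.
series_by_boot = get_boot_meanseriess(suffix="-test")
first_boot = series_by_boot[0]  # list with one mean per year for bootstrap sample 0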
Example #6
def load_vocabulary(mat, path):
    if os.path.isfile(path.split(".")[0] + "-index.pkl"):
        path = path.split(".")[0] + "-index.pkl"
    else:
        print "Could not find local index. Attempting to load directory wide index..."
        path = "/".join(path.split("/")[:-1]) + "/index.pkl"
    index = util.load_pickle(path)
    vocab = sorted(index, key = lambda word : index[word])
    iw = vocab[:mat.shape[0]]
    ic = vocab[:mat.shape[1]]
    return iw, ic
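A hedged usage sketch: the row and column vocabularies are simply the pickled word index sorted by id and truncated to the matrix dimensions, assuming the index covers at least as many words as the matrix has rows and columns. Path and matrix names here are illustrative only:

# Hypothetical usage with a previously built co-occurrence matrix `counts`:
iw, ic = load_vocabulary(counts, "/path/to/counts.bin")
assert len(iw) == counts.shape[0] and len(ic) == counts.shape[1]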
Example #7
def main(subreddit):
    const = get_constants(subreddit)

    if os.path.exists(const['CORPUS']):
        print("Loading preexisting corpus...")
        corpus = util.load_pickle(const['CORPUS'])
    else:
        print("Getting and writing dictionary...")

        with open(const['OUTPUTS'], "r") as f:
            num_lines = sum(1 for line in f)


        with open(const['OUTPUTS'], "r") as f:
            dicts = (json.loads(comment) for comment in tqdm(f, total=num_lines))

            if const["INTERVAL"] is not None:
                corpuses = [[] for interval in const["ALL_INTERVALS"]]

                for comment in dicts:
                    i = get_interval_idx(comment["score"])
                    corpuses[i].append(normalize_text(comment["body"], const['STEMMING']))

                for i, interval in enumerate(const["ALL_INTERVALS"]):
                    util.write_pickle(corpuses[i], get_interval_fname(subreddit, interval))

                corpus = corpuses[0]
            else:
                corpus = [normalize_text(comment["body"], const['STEMMING']) for comment in dicts]

                
    gdict = Dictionary(corpus)

    gdict.filter_extremes(no_above=const['NO_ABOVE_1'], no_below=const['NO_BELOW'])
    gdict.compactify()


    util.write_pickle(gdict.token2id, const['INDICES'])
    util.write_pickle(gdict, const['DICTS'])


    print("Generating word co-occurrences...")
    cooccurgen.run(
       word_gen(corpus, gdict, subreddit, len(corpus)),
       gdict.token2id,
       4,
       const['COUNTS']
    )
    print("Generating PPMI vectors...")
    ppmigen.run(subreddit, cds=True)
    print("Generating SVD vectors...")
    makelowdim.run(const['INDICES'], const['PPMI'], const['VECS'])
Example #8
def load_vocabulary(mat, path):
    if os.path.isfile(path.split(".")[0] + "-index.pkl"):
        path = path.split(".")[0] + "-index.pkl"
    else:
        print("Could not find local index. Attempting to load directory wide index...")
        path = "/".join(path.split("/")[:-1]) + "/index.pkl"
    index = util.load_pickle(path)
    vocab = sorted(index, key = lambda word : index[word])
    iw = vocab[:mat.shape[0]]
    ic = vocab[:mat.shape[1]]
    return iw, ic
Example #9
def get_boot_meanseries(suffix="-test", years=constants.YEARS):
    mean_timeseries = []
    stderr_timeseries = []
    for year in years:
        polarities_list = util.load_pickle(constants.POLARITIES + year +
                                           suffix + '.pkl')
        year_means = []
        for polarities in polarities_list:
            year_means.append(np.mean(polarities.values()))
        mean_timeseries.append(np.mean(year_means))
        stderr_timeseries.append(np.std(year_means))
    return mean_timeseries, stderr_timeseries
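A hedged usage sketch, assuming constants.YEARS holds year strings and matplotlib is available:

# Hypothetical usage: plot the corpus-wide mean polarity with bootstrap error bars.
import matplotlib.pyplot as plt
means, stderrs = get_boot_meanseries(suffix="-test")
years = [int(y) for y in constants.YEARS]
plt.errorbar(years, means, yerr=stderrs)
plt.xlabel('year')
plt.ylabel('mean polarity')
plt.show()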
Example #10
def build_dictseries(raw=True, suffix="-test", years=constants.YEARS):
    timeseries = defaultdict(lambda: defaultdict(lambda: float('nan')))
    for year in years:
        polarities = util.load_pickle(constants.POLARITIES + year + suffix +
                                      '.pkl')
        for i, (w, p) in enumerate(sorted(polarities.items(), key=itemgetter(1))):
            if not raw:
                polarities[w] = i / float(len(polarities))
            else:
                polarities[w] = p

        for w, p in polarities.iteritems():
            timeseries[w][int(year)] = p
    return timeseries
Example #11
def main(subreddit):
    out_path = OUT.format(subreddit)
    util.mkdir(out_path)

    print "Getting and writing dictionary..."
    gdict = util.load_pickle(DICTS.format(subreddit))
    gdict.filter_extremes(no_above=0.5, no_below=100)
    gdict.compactify()
    util.write_pickle(gdict.token2id, out_path + "index.pkl")

    print "Generating word co-occurrences..."
    cooccurgen.run(word_gen(COMMENTS.format(subreddit), gdict), gdict.token2id, 4, out_path + "counts.bin")
    print "Generating PPMI vectors..."
    ppmigen.run(out_path + "counts.bin", out_path + "ppmi", cds=True)
    print "Generating SVD vectors..."
    makelowdim.run(out_path + "ppmi.bin", out_path + "vecs")
Example #12
def build_boot_timeseries(suffix="-test", years=constants.YEARS):
    mean_timeseries = defaultdict(list)
    stderr_timeseries = defaultdict(list)
    for year in years:
        polarities_list = util.load_pickle(constants.POLARITIES + year +
                                           suffix + '.pkl')
        for w in polarities_list[0]:
            n = float(len(polarities_list))
            mean_timeseries[w].append(
                np.mean([polarities[w] for polarities in polarities_list]))
            stderr_timeseries[w].append(
                np.std([polarities[w] for polarities in polarities_list]) / n)
    for w in mean_timeseries.keys():
        if len(mean_timeseries[w]) < 5:
            del mean_timeseries[w]
            del stderr_timeseries[w]
    return mean_timeseries, stderr_timeseries
Example #13
def build_timeseries(raw=False, suffix="-test", years=constants.YEARS):
    timeseries = defaultdict(list)
    for year in years:
        polarities = util.load_pickle(constants.POLARITIES + year + suffix +
                                      '.pkl')
        for i, (w, p) in enumerate(sorted(polarities.items(), key=itemgetter(1))):
            if not raw:
                polarities[w] = i / float(len(polarities))
            else:
                polarities[w] = p

        for w, p in polarities.iteritems():
            timeseries[w].append(p)
    for w in timeseries.keys():
        if len(timeseries[w]) < 5:
            del timeseries[w]
    return timeseries
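A hedged usage sketch, assuming the per-year polarity pickles exist; with raw=False each value is the word's percentile rank within its year:

# Hypothetical usage: rank trajectory of a single word across the years.
timeseries = build_timeseries(raw=False, suffix="-test")
print(timeseries.get("awesome", []))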
Example #14
    def __init__(self, path, normalize=True, eig=0.0, **kwargs):
        ut = np.load(path + '-u.npy')
        s = np.load(path + '-s.npy')
        vocabfile = path + '-vocab.pkl'
        self.iw = load_pickle(vocabfile)
        self.wi = {w:i for i, w in enumerate(self.iw)}
 
        if eig == 0.0:
            self.m = ut
        elif eig == 1.0:
            self.m = s * ut
        else:
            self.m = np.power(s, eig) * ut

        self.dim = self.m.shape[1]

        if normalize:
            self.normalize()
Example #15
    def __init__(self, path, normalize=True, eig=0.0, **kwargs):
        ut = np.load(path + '-u.npy')
        s = np.load(path + '-s.npy')
        vocabfile = path + '-vocab.pkl'
        self.iw = load_pickle(vocabfile)
        self.wi = {w: i for i, w in enumerate(self.iw)}

        if eig == 0.0:
            self.m = ut
        elif eig == 1.0:
            self.m = s * ut
        else:
            self.m = np.power(s, eig) * ut

        self.dim = self.m.shape[1]

        if normalize:
            self.normalize()
Example #16
def main(subreddit):
    const = get_constants(subreddit)

    word_dict = util.load_pickle(const['DICTS'])
    word_dict.filter_extremes(no_above=const['NO_ABOVE_2'],
                              no_below=const['NO_BELOW'])
    to_keep = sorted(word_dict.dfs,
                     key=lambda w: word_dict.dfs[w],
                     reverse=True)[:5000]
    word_dict.filter_tokens(good_ids=to_keep)

    print("Create representation...")
    sub_vecs = create_representation('SVD', const['VECS'])
    if const["GENDER"]:
        pos_seeds, neg_seeds = seeds.gender_seeds()
    else:
        pos_seeds, neg_seeds = seeds.twitter_seeds()

    pos_seeds = list(
        set(subredditgen.normalize_text(' '.join(pos_seeds),
                                        const['STEMMING'])))
    neg_seeds = list(
        set(subredditgen.normalize_text(' '.join(neg_seeds),
                                        const['STEMMING'])))

    print("Get sub embedding...")
    sub_vecs = sub_vecs.get_subembed(
        set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds))

    print("Bootstrapping...")
    print("using seeds {} {}".format(pos_seeds, neg_seeds))
    pols = polarity_induction_methods.bootstrap(
        sub_vecs,
        pos_seeds,
        neg_seeds,
        return_all=True,
        nn=25,
        beta=0.9,
        boot_size=len(pos_seeds) - 2,
        num_boots=30,
        n_procs=10,
    )

    util.write_pickle(pols, const['POLARITIES'])
Example #17
def build_boot_zdictseries(suffix="-test", years=constants.YEARS):
    mean_timeseries = defaultdict(lambda: defaultdict(lambda: float('nan')))
    stderr_timeseries = defaultdict(lambda: defaultdict(lambda: float('nan')))
    for year in years:
        polarities_list = util.load_pickle(constants.POLARITIES + year +
                                           suffix + '.pkl')
        means = [
            np.mean(polarities.values()) for polarities in polarities_list
        ]
        stds = [np.std(polarities.values()) for polarities in polarities_list]
        zscore = lambda i, val: (val - means[i]) / stds[i]
        for w in polarities_list[0]:
            mean_timeseries[w][year] = (np.mean([
                zscore(i, polarities[w])
                for i, polarities in enumerate(polarities_list)
            ]))
            stderr_timeseries[w][year] = (np.std([
                zscore(i, polarities[w])
                for i, polarities in enumerate(polarities_list)
            ]))
    return mean_timeseries, stderr_timeseries
Example #18
import numpy as np

from sklearn.utils.extmath import randomized_svd
from socialsent import util
from socialsent.representations.explicit import Explicit


def run(in_file, out_path, dim=300, keep_words=None):
    base_embed = Explicit.load(in_file, normalize=False)
    if keep_words is not None:
        base_embed = base_embed.get_subembed(keep_words)
    u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
    np.save(out_path + "-u.npy", u)
    np.save(out_path + "-v.npy", v)
    np.save(out_path + "-s.npy", s)
    util.write_pickle(base_embed.iw, out_path + "-vocab.pkl")


if __name__ == '__main__':
    print "Getting keep words..."
    counts = util.load_pickle(
        "/dfs/scratch0/COHA/cooccurs/lemma/1990-counts.pkl")
    keep_words = [word for word in counts if counts[word] >= 100]
    print "Running SVD.."
    run("/dfs/scratch0/COHA/cooccurs/lemma/testb-0-ppmi.bin.bin",
        "/dfs/scratch0/COHA/cooccurs/lemma/testb-0-svd",
        keep_words=keep_words)
Example #19
    # Excerpt: the opening of the enclosing co-occurrence counting function is not shown.
    indices.extend(range(window_size + 1, 2 * window_size + 1))
    for i in indices:
        if i >= len(context):
            break
        pair_counts[(target, context[i])] += 1
    return pair_counts

class COHAWordGen(object):

    def __init__(self, off, index):
        self.data_dir = "/dfs/scratch0/COHA/COHA_word_lemma_pos/1990/"
        self.off = off
        self.index = index

    def __iter__(self):
        for j, fname in enumerate(os.listdir(self.data_dir)):
            if j % 2 == self.off:
                continue
            print fname
            for i, line in enumerate(open(os.path.join(self.data_dir, fname))):
                word = line.split()[1].lower()
                if word in self.index:
                    yield word

if __name__ == "__main__":
    index = util.load_pickle("/dfs/scratch0/COHA/cooccurs/lemma/4/index.pkl")
    word_gen = COHAWordGen(0, index)
    run(word_gen, index, 4, "/dfs/scratch0/COHA/cooccurs/lemma/testb-0-counts.bin")
#    word_gen = COHAWordGen(1, index)
#    run(word_gen, index, 4, "/dfs/scratch0/COHA/cooccurs/lemma/test-1-counts.bin")
Example #20
def load_vocabulary(mat, path):
    index = util.load_pickle(path)
    vocab = sorted(index, key=lambda word: index[word])
    iw = vocab[:mat.shape[0]]
    ic = vocab[:mat.shape[1]]
    return iw, ic
Example #21
def words_above_freq(year, freq):
    freqs = util.load_pickle(constants.COHA_FREQS.format(year)) 
    return set([word for word in freqs if freqs[word] > freq])
Example #22
def top_words(year, rank):
    year = int(year)
    freqs = util.load_pickle(constants.FREQS) 
    return set(sorted(freqs, key = lambda w : freqs[w][year], reverse=True)[0:rank])
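A hedged usage sketch combining the two helpers above; the year, rank, and threshold values are illustrative only, and the threshold is on whatever scale constants.COHA_FREQS stores:

# Hypothetical usage: top-ranked 1900 words that also clear a frequency threshold.
vocab = top_words(1900, 10000) & words_above_freq(1900, 1e-5)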
Example #23
def words_above_freq(year, freq):
    freqs = util.load_pickle(constants.COHA_FREQS.format(year))
    return set([word for word in freqs if freqs[word] > freq])
Example #24
def top_words(year, rank):
    year = int(year)
    freqs = util.load_pickle(constants.FREQS)
    return set(
        sorted(freqs, key=lambda w: freqs[w][year], reverse=True)[0:rank])
Example #25
import numpy as np

from sklearn.utils.extmath import randomized_svd
from socialsent import util
from socialsent.representations.explicit import Explicit

def run(in_file, out_path, dim=300, keep_words=None):
    base_embed = Explicit.load(in_file, normalize=False)
    if keep_words is not None:
        base_embed = base_embed.get_subembed(keep_words)
    u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
    np.save(out_path + "-u.npy", u)
    np.save(out_path + "-v.npy", v)
    np.save(out_path + "-s.npy", s)
    util.write_pickle(base_embed.iw, out_path + "-vocab.pkl")

if __name__ == '__main__':
    print "Getting keep words..."
    counts = util.load_pickle("/dfs/scratch0/COHA/cooccurs/lemma/1990-counts.pkl") 
    keep_words = [word for word in counts if counts[word] >= 100]
    print "Running SVD.."
    run("/dfs/scratch0/COHA/cooccurs/lemma/testb-0-ppmi.bin.bin", "/dfs/scratch0/COHA/cooccurs/lemma/testb-0-svd", keep_words=keep_words)
    
Example #26
    def load(cls, path, normalize=True, add_context=True, **kwargs):
        # Excerpt: a @classmethod of an embedding class; the decorator and the
        # class definition are not shown here.
        mat = np.load(path + "-w.npy")
        if add_context:
            mat += np.load(path + "-c.npy")
        iw = load_pickle(path + "-vocab.pkl")
        return cls(mat, iw, normalize)
Example #27
    # Excerpt: the opening of the enclosing co-occurrence counting function is not shown.
    indices.extend(range(window_size + 1, 2 * window_size + 1))
    for i in indices:
        if i >= len(context):
            break
        pair_counts[(target, context[i])] += 1
    return pair_counts

class COHAWordGen(object):

    def __init__(self, off, index):
        self.data_dir = "/dfs/scratch0/COHA/COHA_word_lemma_pos/1990/"
        self.off = off
        self.index = index

    def __iter__(self):
        for j, fname in enumerate(os.listdir(self.data_dir)):
            if j % 2 == self.off:
                continue
            print(fname)
            for i, line in enumerate(open(os.path.join(self.data_dir, fname))):
                word = line.split()[1].lower()
                if word in self.index:
                    yield word

if __name__ == "__main__":
    index = util.load_pickle("/dfs/scratch0/COHA/cooccurs/lemma/4/index.pkl")
    word_gen = COHAWordGen(0, index)
    run(word_gen, index, 4, "/dfs/scratch0/COHA/cooccurs/lemma/testb-0-counts.bin")
#    word_gen = COHAWordGen(1, index)
#    run(word_gen, index, 4, "/dfs/scratch0/COHA/cooccurs/lemma/test-1-counts.bin")
Example #28
    def load(cls, path, normalize=True, add_context=True, **kwargs):
        # Excerpt: a @classmethod of an embedding class; the decorator and the
        # class definition are not shown here.
        mat = np.load(path + "-w.npy")
        if add_context:
            mat += np.load(path + "-c.npy")
        iw = load_pickle(path + "-vocab.pkl")
        return cls(mat, iw, normalize)