Example #1
import collections
from queue import Empty

# Project-specific helpers assumed importable here: Explicit, load_pickle,
# write_pickle, get_index, and export_mat_from_dict.


def worker(proc_num, queue, out_dir, in_dir):
    while True:
        try:
            decade = queue.get(block=False)
        except Empty:
            break

        print("Processing decade", decade)
        counts = collections.defaultdict(int)
        for year in range(10):
            embed = Explicit.load(in_dir + str(decade + year) + ".bin", normalize=False)
            if year == 0:
                merged_index = embed.wi
            year_list = load_pickle(in_dir + str(decade + year) + "-list.pkl")
            mat = embed.m.tocoo()
            for i in range(len(mat.data)):
                if mat.data[i] == 0:
                    continue
                # Remap row/column indices into the decade-wide merged index.
                new_row = get_index(merged_index, year_list, mat.row[i])
                new_col = get_index(merged_index, year_list, mat.col[i])
                counts[(new_row, new_col)] += mat.data[i]
            print("Done year", decade + year)
        export_mat_from_dict(counts, decade, out_dir)
        write_pickle(merged_index, out_dir + str(decade) + "-index.pkl")
        write_pickle(list(merged_index), out_dir + str(decade) + "-list.pkl")
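Each worker drains a shared queue of decade start-years and merges the ten yearly co-occurrence matrices of that decade into a single count dictionary. A minimal driver sketch, assuming the worker above; the launch code is not part of the source, and run_parallel is a hypothetical name:

import multiprocessing


def run_parallel(num_procs, out_dir, in_dir, decades):
    queue = multiprocessing.Queue()
    for decade in decades:
        queue.put(decade)
    procs = [multiprocessing.Process(target=worker, args=(i, queue, out_dir, in_dir))
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

# e.g. run_parallel(4, "merged/", "yearly/", range(1900, 2000, 10))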
Example #2
import os
from queue import Empty

import numpy as np

# Project-specific pieces assumed importable here: ioutils, Explicit,
# CachedFreqDist, and the module-level constant SAMPLE_MAX.


def worker(proc_num, queue, out_dir, in_dir, count_dir, valid_words,
           num_words, min_count, sample=1e-5):
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            break
        print(proc_num, "Getting counts and matrix year", year)
        embed = Explicit.load(in_dir + str(year) + ".bin", normalize=False)
        year_words = valid_words[year][:num_words]
        count_words = set(ioutils.words_above_count(count_dir, year, min_count))
        freq = CachedFreqDist(ioutils.load_pickle(count_dir + str(year) + "-counts.pkl"))
        use_words = list(count_words.intersection(year_words))
        embed = embed.get_subembed(use_words, restrict_context=True)
        sample_corr = min(SAMPLE_MAX / freq.N(), 1.0)
        print("Sample correction..", sample_corr)
        embed.m = embed.m * sample_corr
        mat = embed.m.tocoo()
        print(proc_num, "Outputting pairs for year", year)
        with open(out_dir + str(year) + ".tmp.txt", "w") as fp:
            for i in range(len(mat.data)):
                if i % 10000 == 0:
                    print("Done", i, "of", len(mat.data))
                word = embed.iw[mat.row[i]]
                context = embed.ic[mat.col[i]]
                if sample != 0:
                    # word2vec-style subsampling: keep each side of the pair
                    # with probability sqrt(sample / frequency), capped at 1.
                    prop_keep = min(np.sqrt(sample / freq.freq(word)), 1.0)
                    prop_keep *= min(np.sqrt(sample / freq.freq(context)), 1.0)
                else:
                    prop_keep = 1.0
                line = word + " " + context + "\n"
                # Emit the pair once per downsampled co-occurrence count.
                for j in range(int(mat.data[i] * prop_keep)):
                    fp.write(line)
        mat = mat.tocsr()
        print(proc_num, "Outputting vocab for year", year)
        with open(out_dir + str(year) + ".vocab", "w") as fp:
            for word in year_words:
                if word not in count_words:
                    print(word, 1, file=fp)
                else:
                    print(word, int(mat[embed.wi[word], :].sum()), file=fp)
        # Shuffle the pair file so downstream training sees a random order.
        cmd = "shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt"
        print(cmd)
        os.system(cmd)
        os.remove(out_dir + str(year) + ".tmp.txt")
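The prop_keep factor above is word2vec-style subsampling: each side of a (word, context) pair survives with probability min(sqrt(sample / frequency), 1), so very frequent words are heavily thinned while rare words pass through untouched. A small worked example with assumed frequencies:

import numpy as np

sample = 1e-5
freq_common, freq_rare = 1e-2, 1e-6  # assumed relative frequencies

print(min(np.sqrt(sample / freq_common), 1.0))  # ~0.032: keep ~3% of pairs
print(min(np.sqrt(sample / freq_rare), 1.0))    # 1.0: rare words always kept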
Example #4

import collections

# Explicit is assumed importable; the source shows only this method, without
# its enclosing class.
def __init__(self, input_dir, years, top_freq=None, normalize=True):
    self.embeds = collections.OrderedDict()
    for year in years:
        self.embeds[year] = Explicit.load(input_dir + "/" + str(year) + ".bin",
                                          normalize=normalize)
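Assuming the method above belongs to a class exposing it (call it SequentialEmbedding; the name is hypothetical, as the source omits the class), usage would look like:

# Load one embedding per decade and inspect each year's vocabulary size.
seq = SequentialEmbedding("embeddings", range(1900, 2000, 10))
for year, embed in seq.embeds.items():
    print(year, len(embed.iw))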
Example #5
import sys

import numpy as np
from sklearn.decomposition import TruncatedSVD

from googlengram import util
from vecanalysis.representations.explicit import Explicit

INPUT_DIR = "/dfs/scratch0/google_ngrams/5grams_ppmi_lsmooth_fixed/"
OUTPUT_DIR = "/dfs/scratch0/google_ngrams/vecs-svd/"
INPUT_PATH = INPUT_DIR + "{year}.bin"
OUTPUT_PATH = OUTPUT_DIR + "{year}-300vecs"

if __name__ == "__main__":
    year = sys.argv[1]
    print("Loading embeddings for year", year)
    words = util.load_pickle("/dfs/scratch0/google_ngrams/info/interestingwords.pkl")
    base_embed = Explicit.load(INPUT_PATH.format(year=year), restricted_context=words)
    print("SVD for year", year)
    # Truncated SVD keeps the top 300 singular directions of the sparse
    # PPMI matrix, yielding dense 300-dimensional word vectors.
    pca = TruncatedSVD(n_components=300)
    new_mat = pca.fit_transform(base_embed.m)
    print("Saving year", year)
    np.save(OUTPUT_PATH.format(year=year) + ".npy", new_mat)
    # Use a context manager so the vocab file is flushed and closed.
    with open(OUTPUT_PATH.format(year=year) + ".vocab", "w") as vocab_outfp:
        vocab_outfp.write(" ".join(base_embed.iw))
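A quick sketch of reading these artifacts back; it assumes the OUTPUT_PATH template above and relies on the vocab file preserving the row order of the saved matrix, which the script guarantees:

import numpy as np

year = "1900"  # any year the script has processed
vecs = np.load(OUTPUT_PATH.format(year=year) + ".npy")  # shape (len(vocab), 300)
with open(OUTPUT_PATH.format(year=year) + ".vocab") as fp:
    vocab = fp.read().split(" ")
word_to_vec = dict(zip(vocab, vecs))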