def perform_pca(embedding_file, pca_dim):
    freqs, words, _, _, _, A = read_embeddings(embedding_file)
    say('performing PCA to reduce dimensions from {} to {}'.format(
        A.shape[1], pca_dim))
    pca_trans, _, _ = pca_svd(A)
    A_pca = pca_trans[:, :pca_dim]
    write_embeddings(freqs, words, A_pca,
                     embedding_file + '.pca' + str(pca_dim))

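# Minimal sketch (an assumption, not the original implementation) of what pca_svd is expected
# to do given the call above: mean-center the embedding matrix and return its projection onto
# the principal directions, so that the first pca_dim columns are the top principal components.
import numpy as np

def pca_svd(A):
    A_centered = A - np.mean(A, axis=0)                         # center each dimension
    U, s, Vt = np.linalg.svd(A_centered, full_matrices=False)
    projected = np.dot(A_centered, Vt.T)                        # equivalently U * s, columns sorted by variance
    return projected, s, Vt
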
def rec(self, string, newline=True):
    if newline:
        print >> self.logf, string
        self.logf.flush()
    else:
        # trailing comma suppresses the newline; write to the log file in both branches
        print >> self.logf, string,
        self.logf.flush()
    say(string)

def write_U(self):
    say('Storing row-normalized U at: %s' % (self.dirname + '/Ur'))
    # sort word indices by frequency, most frequent first
    sorted_indices = [pair[0] for pair in
                      sorted([(i, self.countX[i]) for i in self.wordmap],
                             key=lambda x: x[1], reverse=True)]
    with open(self.dirname + '/Ur', 'wb') as f:
        for i in sorted_indices:
            write_row(f, self.countX[i], self.wordmap[i], self.U[i, :])

def decide_vocab(unigrams, cutoff, vocab_size, want):
    assert unigrams and os.path.isfile(unigrams)
    # exactly one of cutoff / vocab_size must be given
    assert (cutoff is None) != (vocab_size is None)
    say('Reading unigrams')
    vocab = {}
    num_words = 0
    total_sum = 0.
    mysum = 0.
    wantname = ''

    if want:
        wanted_words = {}
        with open(want) as f:
            for line in f:
                toks = line.split()
                if len(toks) == 0:
                    continue
                wanted_words[toks[0]] = True
        wantname = '.' + os.path.splitext(os.path.basename(want))[0]
        num_wanted = 0

    with open(unigrams) as f:
        for line in f:
            num_words += 1
            toks = line.split()
            if len(toks) != 2:
                continue
            word = toks[0]
            count = int(toks[1])
            total_sum += count
            # skip words below the cutoff or beyond the vocab size unless explicitly wanted
            if ((cutoff is not None) and (count <= cutoff)) or \
               ((vocab_size is not None) and len(vocab) == vocab_size):
                if not (want and word in wanted_words):
                    continue
            vocab[word] = count
            mysum += count
            if want and word in wanted_words:
                num_wanted += 1

    if cutoff is not None:
        say('Cutoff %i: keep %i out of %i words (%5.2f%% unigram mass)' %
            (cutoff, len(vocab), num_words, mysum / total_sum * 100))
        outfname = os.path.splitext(unigrams)[0] + '.cutoff' + str(cutoff) + wantname

    if vocab_size is not None:
        say('Vocab %i: keep %i out of %i words (%5.2f%% unigram mass)' %
            (vocab_size, len(vocab), num_words, mysum / total_sum * 100))
        outfname = os.path.splitext(unigrams)[0] + '.vocab' + str(vocab_size) + wantname

    if want:
        say(' - Have %i out of %i wanted words' % (num_wanted, len(wanted_words)))

    return vocab, outfname

def count_ngrams(corpus, n_vals=None):
    assert os.path.isfile(corpus)
    if n_vals is None:
        answer = raw_input('Type in the values of n (e.g., "1 3"): ')
        n_vals = [int(n) for n in answer.split()]

    num_tok = 0
    ngrams = [Counter() for n in n_vals]
    # one sliding window per value of n, padded with buffer symbols
    queues = [deque([_buffer_ for _ in range(n - 1)], n) for n in n_vals]

    with open(corpus) as f:
        while True:
            lines = f.readlines(10000000)  # read lines in large chunks
            if not lines:
                break
            for line in lines:
                toks = line.split()
                for tok in toks:
                    num_tok += 1
                    if num_tok % 1000 == 0:
                        inline_print('Processed %i tokens' % num_tok)
                    for i in range(len(n_vals)):
                        queues[i].append(tok)
                        ngrams[i][tuple(queues[i])] += 1

    # flush each window with buffer symbols so trailing n-grams are counted
    for i in range(len(n_vals)):
        for _ in range(n_vals[i] - 1):
            queues[i].append(_buffer_)
            ngrams[i][tuple(queues[i])] += 1

    say('\nTotal {} tokens'.format(num_tok))
    files = [os.path.splitext(corpus)[0] + '.' + str(n) + 'grams' for n in n_vals]
    for i in range(len(n_vals)):
        say('Sorting {} {}grams and writing to: {}'.format(
            len(ngrams[i]), n_vals[i], files[i]))
        sorted_ngrams = sorted(ngrams[i].items(), key=lambda x: x[1], reverse=True)
        with open(files[i], 'wb') as outf:
            for ngram, count in sorted_ngrams:
                for tok in ngram:
                    print >> outf, tok,
                print >> outf, count

def call_matlab(stat, m, kappa):
    assert m is not None and kappa is not None
    outdirname = 'output/{}.m{}.kappa{}.matlab.out'.format(
        complete_path(stat)[:-1].rsplit('/', 1)[1], m, kappa)
    if not os.path.exists(outdirname):
        os.makedirs(outdirname)
    commandstr = matlab + ' -nojvm -nodisplay -nosplash -r ' + \
                 '"approx_cca(\'' + stat + '\',' + str(m) + ',' + str(kappa) + \
                 ',\'' + outdirname + '\')"'
    os.system(commandstr)

    say('Postprocessing to sort rows by frequency...')
    wordmap = read_wordmap(os.path.join(stat, 'wordmap'))
    freqmap = read_freqmap(os.path.join(stat, 'X'))
    sorted_indices = [pair[0] for pair in
                      sorted([(i, freqmap[i]) for i in wordmap],
                             key=lambda x: x[1], reverse=True)]
    lines = open(os.path.join(outdirname, 'Ur')).readlines()
    with open(os.path.join(outdirname, 'Ur'), 'wb') as outf:
        for i in sorted_indices:
            write_row(outf, freqmap[i], wordmap[i], lines[i].split())
    return outdirname

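# Illustrative sketch (assumed values, not from the original code) of the command string
# assembled above; `matlab` is assumed to hold the path to the MATLAB executable.
matlab = '/usr/local/bin/matlab'                  # assumed location
stat = 'corpus.cutoff100.window3'                 # assumed statistics directory
m, kappa = 500, 100
outdirname = 'output/corpus.cutoff100.window3.m500.kappa100.matlab.out'
commandstr = matlab + ' -nojvm -nodisplay -nosplash -r ' + \
             '"approx_cca(\'' + stat + '\',' + str(m) + ',' + str(kappa) + \
             ',\'' + outdirname + '\')"'
# commandstr ->
#   /usr/local/bin/matlab -nojvm -nodisplay -nosplash -r
#   "approx_cca('corpus.cutoff100.window3',500,100,'output/corpus.cutoff100.window3.m500.kappa100.matlab.out')"
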
def __init__(self, n_d, words, embs=None, fix_emb=True,
             bos='<s>', eos='</s>', oov='<oov>', pad='<pad>'):
    '''
    Note: initialization of the extra tokens: [ bos, eos, oov, pad ]
    '''
    word2id = {}
    if embs is not None:
        embwords, embvecs = embs
        for word in embwords:
            assert word not in word2id, "Duplicate words in pre-trained embeddings"
            word2id[word] = len(word2id)
        say("{} pre-trained word embeddings loaded.\n".format(len(word2id)))
        if n_d != len(embvecs[0]):
            say("[WARNING] n_d ({}) != word vector size ({}). Use {} for embeddings.\n".format(
                n_d, len(embvecs[0]), len(embvecs[0])))
            n_d = len(embvecs[0])

    # if not fix_emb:
    for word in words:
        if word not in word2id:
            word2id[word] = len(word2id)

    extra_tokens = [bos, eos, oov, pad]
    for tok in extra_tokens:
        if tok not in word2id:
            word2id[tok] = len(word2id)

    say("{} embedded words in total\n".format(len(word2id)))
    self.word2id = word2id
    self.n_V, self.n_d = len(word2id), n_d
    self.oovid = word2id[oov]
    self.padid = word2id[pad]
    self.embedding = nn.Embedding(self.n_V, n_d)
    if embs is not None:
        weight = self.embedding.weight
        weight.data[:len(embwords)].copy_(torch.from_numpy(embvecs))
        say("embedding shape: {}\n".format(weight.size()))

    if fix_emb:
        self.embedding.weight.requires_grad = False

def __init__(self, n_d, words, embs=None, fix_emb=True,
             bos='<s>', eos='</s>', oov='<oov>', pad='<pad>'):
    '''
    if not fix_emb: use only the words that appear in the training data
    '''
    word2id = {}
    if embs is not None:
        embwords, embvecs = embs
        if fix_emb:
            # only fill word2id with pre-trained words when the embeddings are fixed
            for word in embwords:
                word2id[word] = len(word2id)
        say("{} pre-trained word embeddings loaded.\n".format(len(embwords)))
        if n_d != len(embvecs[0]):
            say("[WARNING] n_d ({}) != word vector size ({}). Use {} for embeddings.\n".format(
                n_d, len(embvecs[0]), len(embvecs[0])))
            n_d = len(embvecs[0])

    if words is not None:
        # fix_emb should be False in this case
        assert not fix_emb
        for word in words:
            if word not in word2id:
                word2id[word] = len(word2id)

    extra_tokens = [bos, eos, oov, pad]
    for tok in extra_tokens:
        if tok not in word2id:
            word2id[tok] = len(word2id)

    say("{} embedded words in total\n".format(len(word2id)))
    self.word2id = word2id
    self.n_V, self.n_d = len(word2id), n_d
    self.oovid = word2id[oov]
    self.padid = word2id[pad]
    self.embedding = nn.Embedding(self.n_V, n_d)
    self.embedding.weight.data.uniform_(-0.25, 0.25)
    self.embedding.weight.data[self.padid].zero_()
    self.embedding.weight.data[self.oovid].zero_()

    if embs is not None:
        weight = self.embedding.weight
        tor_embvecs = torch.from_numpy(embvecs)
        if fix_emb:
            weight.data[:len(embwords)].copy_(tor_embvecs)
        else:
            for word, wid in word2id.items():
                if word in embwords:
                    # initialize with the pre-trained word embedding
                    weight.data[wid].copy_(tor_embvecs[embwords.index(word)])

    if fix_emb:
        self.embedding.weight.requires_grad = False

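# Self-contained sketch (an illustrative assumption, not part of the original class) of the
# vocabulary-indexing convention the constructor above relies on: pre-trained words occupy
# the first indices (so weight.data[:len(embwords)] lines up with them), followed by any
# training-data words, followed by the extra tokens.
def build_word2id(embwords, words=None, extra_tokens=('<s>', '</s>', '<oov>', '<pad>')):
    word2id = {}
    for word in embwords:
        if word not in word2id:
            word2id[word] = len(word2id)
    for word in (words or []):
        if word not in word2id:
            word2id[word] = len(word2id)
    for tok in extra_tokens:
        if tok not in word2id:
            word2id[tok] = len(word2id)
    return word2id

# build_word2id(['the', 'cat']) maps 'the'->0, 'cat'->1, '<s>'->2, '</s>'->3, '<oov>'->4, '<pad>'->5
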
def set_params(self, m, kappa):
    say('m: {}'.format(m))
    say('kappa: {}'.format(kappa))
    self.m = m
    self.kappa = kappa

def write_sv(self):
    say('\nStoring singular values at: %s' % (self.dirname + '/sv'))
    with open(self.dirname + '/sv', 'wb') as outf:
        for i in range(len(self.sv)):
            print >> outf, self.sv[i]

def extract_stat(corpus, vocab, stat, window):
    stat += '.window' + str(window)
    assert os.path.isfile(corpus)
    XYcount = Counter()
    Xcount = Counter()
    Ycount = Counter()

    def inc_stats(q):
        center = (window - 1) // 2  # position of the current token
        if q[center] == _buffer_:
            return
        token = q[center] if q[center] in vocab else _rare_
        Xcount[token] += 1
        for i in range(window):
            if i != center:
                if q[i] == _buffer_:
                    continue
                friend = q[i] if q[i] in vocab else _rare_
                rel_position = i - center
                # mark the relative position of the context word, e.g., <+1> or <-2>
                position_marker = ('<+' + str(rel_position) + '>' if rel_position > 0
                                   else '<' + str(rel_position) + '>')
                friend += position_marker
                XYcount[(token, friend)] += 1
                Ycount[friend] += 1

    num_tok = 0
    q = deque([_buffer_ for _ in range(window - 1)], window)
    with open(corpus) as f:
        while True:
            lines = f.readlines(10000000)  # read lines in large chunks
            if not lines:
                break
            for line in lines:
                toks = line.split()
                for tok in toks:
                    num_tok += 1
                    if num_tok % 1000 == 0:
                        inline_print('Processed %i tokens' % num_tok)
                    q.append(tok)
                    inc_stats(q)
    inline_print('\n')

    # flush the window with buffer symbols so the last tokens get full contexts
    for _ in range(window - 1):
        q.append(_buffer_)
        inc_stats(q)

    say('Creating directory {}'.format(stat))
    if not os.path.exists(stat):
        os.makedirs(stat)

    xi, yi = {}, {}
    xhead, yhead = 1, 1  # indices start from 1 for matlab
    with open(stat + '/X', 'wb') as Xfile:
        for token in Xcount:
            if token not in xi:
                xi[token] = xhead
                xhead += 1
            print >> Xfile, xi[token], Xcount[token]

    with open(stat + '/wordmap', 'wb') as wordmapfile:
        for token in xi:
            print >> wordmapfile, xi[token], token

    with open(stat + '/Y', 'wb') as Yfile:
        for friend in Ycount:
            if friend not in yi:
                yi[friend] = yhead
                yhead += 1
            print >> Yfile, yi[friend], Ycount[friend]

    with open(stat + '/XY', 'wb') as XYfile:
        for (token, friend) in XYcount:
            print >> XYfile, xi[token], yi[friend], XYcount[(token, friend)]

    return XYcount, Xcount, Ycount, stat

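# Small self-contained sketch (illustrative, not from the original code) of how the sliding
# window above turns a token stream into position-marked (token, context) pairs; _buffer_
# is assumed to be a padding symbol that never occurs in the corpus.
from collections import deque

def demo_window_pairs(tokens, window=3, _buffer_='<!>'):
    pairs = []
    q = deque([_buffer_] * (window - 1), window)
    for tok in tokens + [_buffer_] * (window - 1):
        q.append(tok)
        center = (window - 1) // 2
        if q[center] == _buffer_:
            continue
        for i in range(window):
            if i == center or q[i] == _buffer_:
                continue
            rel = i - center
            marker = '<+%d>' % rel if rel > 0 else '<%d>' % rel
            pairs.append((q[center], q[i] + marker))
    return pairs

# demo_window_pairs('the dog barks'.split())
# -> [('the', 'dog<+1>'), ('dog', 'the<-1>'), ('dog', 'barks<+1>'), ('barks', 'dog<-1>')]
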
def get_stat(self, stat):
    self.stat = complete_path(stat)
    XYstats = self.stat + 'XY'
    Xstats = self.stat + 'X'
    Ystats = self.stat + 'Y'
    assert os.path.isfile(XYstats) and os.path.isfile(Xstats) and os.path.isfile(Ystats)
    say('XYstats: {}'.format(XYstats))
    say('Xstats: {}'.format(Xstats))
    say('Ystats: {}'.format(Ystats))

    self.wordmap = {}
    wordmapf = self.stat + 'wordmap'
    with open(wordmapf) as f:
        for line in f:
            toks = line.split()
            self.wordmap[int(toks[0]) - 1] = toks[1]  # shift 1-based matlab indices to 0-based

    # reuse cached counts if they have already been pickled
    pickle_file = self.stat + 'pickle'
    if os.path.isfile(pickle_file):
        with open(pickle_file, 'rb') as f:
            self.countXY, self.countX, self.countY, self.num_samples = cPickle.load(f)
        return

    self.countXY = Counter()
    self.countX = Counter()
    self.countY = Counter()
    self.num_samples = 0.

    num_lines = wc_l(XYstats)
    linenum = 0
    with open(XYstats) as f:
        for line in f:
            linenum += 1
            toks = line.split()
            x, y, count = int(toks[0]) - 1, int(toks[1]) - 1, int(toks[2])
            self.countXY[x, y] = count
            if linenum % 1000 == 0:
                inline_print('Processing line %i of %i' % (linenum, num_lines))

    with open(Xstats) as f:
        for line in f:
            toks = line.split()
            x, count = int(toks[0]) - 1, int(toks[1])
            self.countX[x] = count
            self.num_samples += count

    with open(Ystats) as f:
        for line in f:
            toks = line.split()
            y, count = int(toks[0]) - 1, int(toks[1])
            self.countY[y] = count

    inline_print('\nConstructing matrices\n')
    self.countXY = csc_matrix((self.countXY.values(), zip(*self.countXY.keys())),
                              shape=(len(self.countX), len(self.countY)))
    self.countX = array([self.countX[i] for i in range(len(self.countX))])
    self.countY = array([self.countY[i] for i in range(len(self.countY))])

    with open(pickle_file, 'wb') as outf:
        cPickle.dump((self.countXY, self.countX, self.countY, self.num_samples),
                     outf, protocol=cPickle.HIGHEST_PROTOCOL)

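# Minimal sketch (illustrative, not from the original code) of the sparse-matrix construction
# pattern used above: a Counter keyed by (row, col) pairs is unpacked into
# (data, (row_indices, col_indices)) for scipy's COO-style constructor. Python 2 is assumed,
# where .values()/.keys()/zip() return aligned lists.
from collections import Counter
from scipy.sparse import csc_matrix

counts = Counter({(0, 1): 3, (2, 0): 5})
data = counts.values()             # counts, aligned with counts.keys()
rows_cols = zip(*counts.keys())    # [(row indices...), (col indices...)]
M = csc_matrix((data, rows_cols), shape=(3, 2))
# M.toarray() -> [[0, 3], [0, 0], [5, 0]]
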
def extract_stat(corpus, vocab, stat, window, hash_width=32):
    stat += '.window' + str(window) + '.hashbits' + str(hash_width)
    assert os.path.isfile(corpus)
    XYcount = Counter()
    Xcount = Counter()
    Ycount = Counter()
    CollisionCount = defaultdict(set)

    def inc_stats(q):
        center = (window - 1) // 2  # position of the current token
        if q[center] == _buffer_:
            return
        token = q[center] if q[center] in vocab else _rare_
        Xcount[token] += 1
        # build a single position-marked context signature from the window, then hash it
        friend = ''
        for i in range(window):
            if i != center:
                if q[i] != _buffer_:
                    friend += q[i] if q[i] in vocab else _rare_
                    rel_pos = i - center
                    pos_marker = ('<+' + str(rel_pos) + '>' if rel_pos > 0
                                  else '<' + str(rel_pos) + '>')
                    friend += pos_marker
        friend_hashv = fnv_hash(friend, hash_width)
        CollisionCount[friend_hashv].add(friend)
        XYcount[(token, friend_hashv)] += 1
        Ycount[friend_hashv] += 1

    num_tok = 0
    q = deque([_buffer_ for _ in range(window - 1)], window)
    with open(corpus) as f:
        while True:
            lines = f.readlines(10000000)  # read lines in large chunks
            if not lines:
                break
            for line in lines:
                toks = line.split()
                for tok in toks:
                    num_tok += 1
                    if num_tok % 1000 == 0:
                        inline_print('Processed %i tokens' % num_tok)
                    q.append(tok)
                    inc_stats(q)
    inline_print('\n')

    # flush the window with buffer symbols so the last tokens get full contexts
    for _ in range(window - 1):
        q.append(_buffer_)
        inc_stats(q)

    # count hash values shared by more than one distinct context signature
    collisions = 0
    for key, value in CollisionCount.iteritems():
        if len(value) > 1:
            collisions += len(value)
    say('Collisions: {}'.format(collisions))

    say('Creating directory {}'.format(stat))
    if not os.path.exists(stat):
        os.makedirs(stat)

    xi, yi = {}, {}
    xhead, yhead = 1, 1  # indices start from 1 for matlab
    with open(stat + '/X', 'wb') as Xfile:
        for token in Xcount:
            if token not in xi:
                xi[token] = xhead
                xhead += 1
            print >> Xfile, xi[token], Xcount[token]

    with open(stat + '/wordmap', 'wb') as wordmapfile:
        for token in xi:
            print >> wordmapfile, xi[token], token

    with open(stat + '/Y', 'wb') as Yfile:
        for friend in Ycount:
            if friend not in yi:
                yi[friend] = yhead
                yhead += 1
            print >> Yfile, yi[friend], Ycount[friend]

    with open(stat + '/XY', 'wb') as XYfile:
        for (token, friend) in XYcount:
            print >> XYfile, xi[token], yi[friend], XYcount[(token, friend)]

    return XYcount, Xcount, Ycount, stat

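# Minimal sketch (an assumption, not the original implementation) of the fnv_hash used above:
# a 32-bit FNV-1a hash of the context string, truncated to hash_width bits by masking.
def fnv_hash(string, width):
    h = 2166136261                        # 32-bit FNV offset basis
    for ch in string:
        h ^= ord(ch)
        h = (h * 16777619) & 0xffffffff   # multiply by the 32-bit FNV prime, keep 32 bits
    return h & ((1 << width) - 1)         # keep the low `width` bits
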
def count_unigrams(corpus):
    unigrams = os.path.splitext(corpus)[0] + '.1grams'
    if not os.path.isfile(unigrams):
        count_ngrams(corpus, n_vals=[1])
    else:
        say('{} exists'.format(unigrams))
    return unigrams
