def main(): args = docopt(""" Usage: counts2pmi.py <counts> """) counts_path = args['<counts>'] words = Counter() contexts = Counter() with open(counts_path) as f: for line in f: if line.__contains__('\t'): count, word, context = line.strip('\n').split('\t') else: count, word, context = line.strip().split() count = int(count.strip()) words[word] += count contexts[context] += count words = sorted(words.items(), key=lambda wfreq: wfreq[1], reverse=True) contexts = sorted(contexts.items(), key=lambda wfreq: wfreq[1], reverse=True) save_count_vocabulary(counts_path + '.words.vocab', words) save_count_vocabulary(counts_path + '.contexts.vocab', contexts)
def main(): args = docopt(""" Usage: corpus2vocab.py [options] <corpus> <output> Options: --ngram NUM Vocabulary includes grams of 1st to nth order [default: 1] --memory_size NUM Memory size available [default: 8.0] --min_count NUM Ignore word below a threshold [default: 10] --max_length NUM Ignore word whose length exceeds a threshold [default: 50] """) print ("*********************************") print ("corpus2vocab") ngram = int(args['--ngram']) memory_size = float(args['--memory_size']) / 2 * 1000**3 #memory size is divided by 2 since we have to read both word and context vocabulary into memory in pairs2vocab step min_count = int(args['--min_count']) max_length = int(args['--max_length']) vocab = {} # vocabulary (stored by dictionary) reduce_thr = 1 # remove low-frequency words when memory is insufficient memory_size_used = 0 # size of memory used by keys & values in dictionary (not include dictionary itself) with open(args['<corpus>']) as f: tokens_num = 0 for line in f: sys.stdout.write("\r" + str(int(tokens_num/1000**2)) + "M tokens processed.") sys.stdout.flush() tokens = line.strip().split() tokens_num += len(tokens) for pos in range(len(tokens)): for gram in range(1, ngram+1): token = getNgram(tokens, pos, gram) if token is None : continue if len(token) > max_length: continue if token not in vocab : memory_size_used += getsizeof(token) vocab[token] = 1 if memory_size_used + getsizeof(vocab) > memory_size * 0.8: #reduce vocabulary when memory is insufficient reduce_thr += 1 vocab_size = len(vocab) vocab = {w: c for w, c in six.iteritems(vocab) if c >= reduce_thr} memory_size_used *= float(len(vocab)) / vocab_size #estimate the size of memory used else: vocab[token] += 1 vocab = {w: c for w, c in six.iteritems(vocab) if c >= min_count} #remove low-frequency words by pre-specified threshold, using six for bridging the gap between python 2 and 3 vocab = sorted(six.iteritems(vocab), key=lambda item: item[1], reverse=True) #sort vocabulary by frequency in descending order save_count_vocabulary(args['<output>'], vocab) print ("number of tokens: " + str(tokens_num)) print ("vocab size: " + str(len(vocab))) print ("low-frequency threshold: " + str(min_count if min_count > reduce_thr else reduce_thr)) print ("corpus2vocab finished")
def main(): args = docopt(""" Usage: corpus2vocab.py [options] <corpus> <output> Options: --ngram NUM Vocabulary includes grams of 1st to nth order [default: 1] --min_count NUM Ignore words below a threshold [default: 10] """) print("**********************") print("corpus2vocab") ngram = int(args['--ngram']) min_count = int(args['--min_count']) vocab = {} # vocabulary (stored by dictionary) with open(args['<corpus>']) as f: tokens_num = 0 print(str(int(tokens_num / 1000**2)) + "M tokens processed.") for line in f: print("\x1b[1A" + str(int(tokens_num / 1000**2)) + "M tokens processed.") #ANSI tokens = line.strip().split() tokens_num += len(tokens) for pos in range(len(tokens)): for gram in range(1, ngram + 1): token = getNgram(tokens, pos, gram) if token is None: continue if token not in vocab: vocab[token] = 1 else: vocab[token] += 1 vocab = { w: c for w, c in six.iteritems(vocab) if c >= min_count } #remove low-frequency words by pre-specified threshold, using six for bridging the gap between python 2 and 3 vocab = sorted( six.iteritems(vocab), key=lambda item: item[1], reverse=True) #sort vocabulary by frequency in descending order save_count_vocabulary(args['<output>'], vocab) print("number of tokens: " + str(tokens_num)) print("vocab size: " + str(len(vocab))) print("corpus2vocab finished")
def _counts2Vocab(self):
    counts_path = self.count_pair_file
    words = Counter()
    contexts = Counter()
    with open(counts_path) as f:
        for line in f:
            count, word, context = line.strip().split()
            count = int(count)
            words[word] += count
            contexts[context] += count

    words_items = sorted(words.items(), key=lambda x: x[1], reverse=True)
    contexts_items = sorted(contexts.items(), key=lambda x: x[1], reverse=True)
    save_count_vocabulary(counts_path + '.words.vocab', words_items)
    save_count_vocabulary(counts_path + '.contexts.vocab', contexts_items)
    self.words = words
    self.contexts = contexts

def main(): args = docopt(""" Usage: pairs2vocab.py <pairs> <words> <contexts> """) print("**********************") print("pairs2vocab") words_path = args['<words>'] contexts_path = args['<contexts>'] words = {} #center word vocabulary contexts = {} #context vocabulary with open(args['<pairs>']) as f: pairs_num = 0 for line in f: pairs_num += 1 if pairs_num % 1000**2 == 0: sys.stdout.write("\r" + str(int(pairs_num / 1000**2)) + "M pairs processed.") sys.stdout.flush() pair = line.strip().split() if pair[0] not in words: words[pair[0]] = 1 else: words[pair[0]] += 1 if pair[1] not in contexts: contexts[pair[1]] = 1 else: contexts[pair[1]] += 1 words = sorted(six.iteritems(words), key=lambda item: item[1], reverse=True) contexts = sorted(six.iteritems(contexts), key=lambda item: item[1], reverse=True) save_count_vocabulary(words_path, words) save_count_vocabulary(contexts_path, contexts) print("words size: " + str(len(words))) print("contexts size: " + str(len(contexts))) print("number of pairs: " + str(pairs_num)) print("pairs2vocab finished")
import six
from queue import Empty  # Python 2 spelled this `from Queue import Empty`


def worker(proc_num, queue, out_dir, in_dir):
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            break
        # The original mixed Python 2 print statements with print() calls;
        # unified here as Python 3 print functions.
        print(proc_num, "pairs2vocab for year", year)
        words_path = out_dir + str(year) + "-w.vocab"
        contexts_path = out_dir + str(year) + "-c.vocab"
        words = {}     # center word vocabulary
        contexts = {}  # context vocabulary
        print(proc_num, "Processing pairs for year", year)
        with open(in_dir + str(year) + ".txt") as f:
            pairs_num = 0
            for line in f:
                pairs_num += 1
                if pairs_num % 1000**2 == 0:
                    print(str(int(pairs_num / 1000**2)) + "M pairs processed.")
                pair = line.strip().split()
                if pair[0] not in words:
                    words[pair[0]] = 1
                else:
                    words[pair[0]] += 1
                if pair[1] not in contexts:
                    contexts[pair[1]] = 1
                else:
                    contexts[pair[1]] += 1

        words = sorted(six.iteritems(words), key=lambda item: item[1], reverse=True)
        contexts = sorted(six.iteritems(contexts), key=lambda item: item[1], reverse=True)
        save_count_vocabulary(words_path, words)
        save_count_vocabulary(contexts_path, contexts)
        print("words size: " + str(len(words)))
        print("contexts size: " + str(len(contexts)))
        print("number of pairs: " + str(pairs_num))
        print("pairs2vocab finished")

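# A minimal driver sketch for the worker above; this is an assumption, not
# part of the source. It fills a multiprocessing queue with years and spawns
# one process per worker slot. run_parallel and its parameters are
# hypothetical names.
from multiprocessing import Process, Queue


def run_parallel(num_procs, years, out_dir, in_dir):
    queue = Queue()
    for year in years:
        queue.put(year)
    procs = [Process(target=worker, args=(i, queue, out_dir, in_dir))
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
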
def counts2vocab(counts_path): """ Usage: counts2pmi.py <counts> """ words = Counter() contexts = Counter() with open(counts_path) as f: for line in f: word, context, count = line.strip().split() count = int(count) words[word] += count contexts[context] += count words = sorted(words.items(), key=lambda (x, y): y, reverse=True) contexts = sorted(contexts.items(), key=lambda (x, y): y, reverse=True) save_count_vocabulary(counts_path + '.words.vocab', words) save_count_vocabulary(counts_path + '.contexts.vocab', contexts)
from collections import Counter


def counts2vocab(counts_path):
    words = Counter()
    contexts = Counter()
    with open(counts_path) as f:
        for line in f:
            try:
                count, word, context = line.strip().split()
                count = int(count)
                words[word] += count
                contexts[context] += count
            except ValueError as er:
                # Report the offending line before re-raising.
                print(line.strip())
                raise er

    # `lambda (x, y): y` is Python 2 only; index the pair for Python 3.
    words = sorted(words.items(), key=lambda item: item[1], reverse=True)
    contexts = sorted(contexts.items(), key=lambda item: item[1], reverse=True)
    save_count_vocabulary(counts_path + '.words.vocab', words)
    save_count_vocabulary(counts_path + '.contexts.vocab', contexts)

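# Optional variant: operator.itemgetter(1) from the standard library is an
# idiomatic replacement for `lambda item: item[1]` in the sorts above.
from operator import itemgetter

pairs = [('of', 98), ('rare', 3), ('the', 120)]
print(sorted(pairs, key=itemgetter(1), reverse=True))
# [('the', 120), ('of', 98), ('rare', 3)]
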
def main(): args = docopt(""" Usage: pairs2vocab.py <pairs> <words> <contexts> """) print "**********************" print "pairs2vocab" words_path = args['<words>'] contexts_path = args['<contexts>'] words = {} #center word vocabulary contexts = {} #context vocabulary with open(args['<pairs>']) as f: pairs_num = 0 print str(pairs_num / 1000**2) + "M pairs processed." for line in f: pairs_num += 1 if pairs_num % 1000**2 == 0: print "\x1b[1A" + str( pairs_num / 1000**2) + "M pairs processed." pair = line.strip().split() if pair[0] not in words: words[pair[0]] = 1 else: words[pair[0]] += 1 if pair[1] not in contexts: contexts[pair[1]] = 1 else: contexts[pair[1]] += 1 words = sorted(words.iteritems(), key=lambda item: item[1], reverse=True) contexts = sorted(contexts.iteritems(), key=lambda item: item[1], reverse=True) save_count_vocabulary(words_path, words) save_count_vocabulary(contexts_path, contexts) print "words size: " + str(len(words)) print "contexts size: " + str(len(contexts)) print "number of pairs: " + str(pairs_num) print "pairs2vocab finished"
def main(): args = docopt(""" Usage: counts2pmi.py <counts> """) counts_path = args['<counts>'] words = Counter() contexts = Counter() relations = Counter() with gzip.open(counts_path) as f: for line in f: split = line.decode('utf-8').strip().split() if len(split) == 4: count, word, context, relation = split else: count, word, context = split relation = None count = int(count) words[word] += count contexts[context] += count relations[relation] += count words = sorted(list(words.items()), key=lambda x_y: x_y[1], reverse=True) contexts = sorted(list(contexts.items()), key=lambda x_y1: x_y1[1], reverse=True) relations = sorted(list(relations.items()), key=lambda x_y2: x_y2[1], reverse=True) save_count_vocabulary(counts_path + '.words.vocab', words) save_count_vocabulary(counts_path + '.contexts.vocab', contexts) save_count_vocabulary(counts_path + '.relations.vocab', relations)
def main(): args = docopt(""" Usage: counts2pmi.py <counts> """) counts_path = args['<counts>'] words = Counter() contexts = Counter() with open(counts_path) as f: for line in f: count, word, context = line.strip().split() count = int(count) words[word] += count contexts[context] += count words = sorted(words.items(), key=takeSecond, reverse=True) contexts = sorted(contexts.items(), key=takeSecond, reverse=True) save_count_vocabulary(counts_path + '.words.vocab', words) save_count_vocabulary(counts_path + '.contexts.vocab', contexts)
def main(): args = docopt(""" Usage: counts2pmi.py <counts> """) counts_path = args['<counts>'] words = Counter() contexts = Counter() with open(counts_path) as f: for line in f: count, word, context = line.strip().split() count = int(count) words[word] += count contexts[context] += count words = sorted(words.items(), key=lambda (x, y): y, reverse=True) contexts = sorted(contexts.items(), key=lambda (x, y): y, reverse=True) save_count_vocabulary(counts_path + '.words.vocab', words) save_count_vocabulary(counts_path + '.contexts.vocab', contexts)
def main(): args = docopt(""" Usage: pairs2vocab.py <pairs> <words> <contexts> """) print ("**********************") print ("pairs2vocab") words_path = args['<words>'] contexts_path = args['<contexts>'] words = {} #center word vocabulary contexts = {} #context vocabulary with open(args['<pairs>']) as f: pairs_num = 0 for line in f: pairs_num += 1 if pairs_num % 1000**2 == 0: sys.stdout.write("\r" + str(int(pairs_num/1000**2)) + "M pairs processed.") sys.stdout.flush() pair = line.strip().split() if pair[0] not in words : words[pair[0]] = 1 else: words[pair[0]] += 1 if pair[1] not in contexts : contexts[pair[1]] = 1 else: contexts[pair[1]] += 1 words = sorted(six.iteritems(words), key=lambda item: item[1], reverse=True) contexts = sorted(six.iteritems(contexts), key=lambda item: item[1], reverse=True) save_count_vocabulary(words_path, words) save_count_vocabulary(contexts_path, contexts) print ("words size: " + str(len(words))) print ("contexts size: " + str(len(contexts))) print ("number of pairs: " + str(pairs_num)) print ("pairs2vocab finished")
def main(): args = docopt(""" Usage: corpus2vocab.py [options] <corpus> <output> Options: --ngram NUM Vocabulary includes grams of 1st to nth order [default: 1] --min_count NUM Ignore words below a threshold [default: 10] """) print ("**********************") print ("corpus2vocab") ngram = int(args['--ngram']) min_count = int(args['--min_count']) vocab = {} # vocabulary (stored by dictionary) with open(args['<corpus>']) as f: tokens_num = 0 for line in f: sys.stdout.write("\r" + str(int(tokens_num/1000**2)) + "M tokens processed.") #ANSI tokens = line.strip().split() tokens_num += len(tokens) for pos in range(len(tokens)): for gram in range(1, ngram+1): token = getNgram(tokens, pos, gram) if token is None : continue if token not in vocab : vocab[token] = 1 else: vocab[token] += 1 vocab = {w: c for w, c in six.iteritems(vocab) if c >= min_count} #remove low-frequency words by pre-specified threshold, using six for bridging the gap between python 2 and 3 vocab = sorted(six.iteritems(vocab), key=lambda item: item[1], reverse=True) #sort vocabulary by frequency in descending order save_count_vocabulary(args['<output>'], vocab) print ("number of tokens: " + str(tokens_num)) print ("vocab size: " + str(len(vocab))) print ("corpus2vocab finished")
def main(): args = docopt(""" Usage: corpus2vocab.py [options] <corpus> <output> Options: --ngram NUM Vocabulary includes grams of 1st to nth order [default: 1] --memory_size NUM Memory size available [default: 8.0] --min_count NUM Ignore word below a threshold [default: 10] --max_length NUM Ignore word whose length exceeds a threshold [default: 50] """) print("*********************************") print("corpus2vocab") ngram = int(args['--ngram']) memory_size = float( args['--memory_size'] ) / 2 * 1000**3 #memory size is divided by 2 since we have to read both word and context vocabulary into memory in pairs2vocab step min_count = int(args['--min_count']) max_length = int(args['--max_length']) vocab = {} # vocabulary (stored by dictionary) reduce_thr = 1 # remove low-frequency words when memory is insufficient memory_size_used = 0 # size of memory used by keys & values in dictionary (not include dictionary itself) with open(args['<corpus>']) as f: tokens_num = 0 for line in f: sys.stdout.write("\r" + str(int(tokens_num / 1000**2)) + "M tokens processed.") sys.stdout.flush() tokens = line.strip().split() tokens_num += len(tokens) for pos in range(len(tokens)): for gram in range(1, ngram + 1): token = getNgram(tokens, pos, gram) if token is None: continue if len(token) > max_length: continue if token not in vocab: memory_size_used += getsizeof(token) vocab[token] = 1 if memory_size_used + getsizeof( vocab ) > memory_size * 0.8: #reduce vocabulary when memory is insufficient reduce_thr += 1 vocab_size = len(vocab) vocab = { w: c for w, c in six.iteritems(vocab) if c >= reduce_thr } memory_size_used *= float( len(vocab) ) / vocab_size #estimate the size of memory used else: vocab[token] += 1 vocab = { w: c for w, c in six.iteritems(vocab) if c >= min_count } #remove low-frequency words by pre-specified threshold, using six for bridging the gap between python 2 and 3 vocab = sorted( six.iteritems(vocab), key=lambda item: item[1], reverse=True) #sort vocabulary by frequency in descending order save_count_vocabulary(args['<output>'], vocab) print("number of tokens: " + str(tokens_num)) print("vocab size: " + str(len(vocab))) print("low-frequency threshold: " + str(min_count if min_count > reduce_thr else reduce_thr)) print("corpus2vocab finished")