Example #1
def main():
    args = docopt("""
    Usage:
        counts2pmi.py <counts>
    """)

    counts_path = args['<counts>']

    words = Counter()
    contexts = Counter()
    with open(counts_path) as f:
        for line in f:
            if '\t' in line:
                count, word, context = line.strip('\n').split('\t')
            else:
                count, word, context = line.strip().split()
            count = int(count.strip())
            words[word] += count
            contexts[context] += count

    words = sorted(words.items(), key=lambda wfreq: wfreq[1], reverse=True)
    contexts = sorted(contexts.items(),
                      key=lambda wfreq: wfreq[1],
                      reverse=True)

    save_count_vocabulary(counts_path + '.words.vocab', words)
    save_count_vocabulary(counts_path + '.contexts.vocab', contexts)
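
The snippets on this page are excerpts and leave out their module-level imports as well as the save_count_vocabulary helper they all call. The block below is a minimal sketch of what they assume; the import list and the helper's exact output format (one "token count" pair per line) are assumptions rather than code taken from the originals.

# Imports assumed by the excerpts on this page (assumed, not shown in the originals).
import gzip
import sys
from collections import Counter
from sys import getsizeof

import six                 # Python 2/3 bridge, used for six.iteritems
from docopt import docopt  # parses the Usage strings in the main() functions


def save_count_vocabulary(path, vocab):
    # Hypothetical sketch of the helper every example calls: write one
    # "token count" pair per line. The real helper lives elsewhere in these
    # projects, so the field order and separator here are assumptions.
    with open(path, 'w') as f:
        for token, count in vocab:
            f.write(token + ' ' + str(count) + '\n')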
Example #2
def main():
    args = docopt("""
    Usage:
        corpus2vocab.py [options] <corpus> <output>
    
    Options:
        --ngram NUM              Vocabulary includes grams of 1st to nth order [default: 1]
        --memory_size NUM        Memory size available [default: 8.0]
        --min_count NUM          Ignore word below a threshold [default: 10]
        --max_length NUM         Ignore word whose length exceeds a threshold [default: 50]
    """)

    print ("*********************************")
    print ("corpus2vocab")
    ngram = int(args['--ngram'])
    memory_size = float(args['--memory_size']) / 2 * 1000**3 #memory size is divided by 2 since we have to read both word and context vocabulary into memory in pairs2vocab step
    min_count = int(args['--min_count'])
    max_length = int(args['--max_length'])
    vocab = {} # vocabulary (stored by dictionary)
    reduce_thr = 1 # remove low-frequency words when memory is insufficient
    memory_size_used = 0 # size of memory used by keys & values in dictionary (not include dictionary itself) 

    with open(args['<corpus>']) as f:
        tokens_num = 0
        for line in f:
            sys.stdout.write("\r" + str(int(tokens_num/1000**2)) + "M tokens processed.")
            sys.stdout.flush()
            tokens = line.strip().split()
            tokens_num += len(tokens)
            for pos in range(len(tokens)):            
                for gram in range(1, ngram+1):
                    token = getNgram(tokens, pos, gram)
                    if token is None :
                        continue
                    if len(token) > max_length:
                        continue
                    if token not in vocab :
                        memory_size_used += getsizeof(token)
                        vocab[token] = 1
                        if memory_size_used + getsizeof(vocab) > memory_size * 0.8: #reduce vocabulary when memory is insufficient
                            reduce_thr += 1
                            vocab_size = len(vocab)
                            vocab = {w: c for w, c in six.iteritems(vocab) if c >= reduce_thr}
                            memory_size_used *= float(len(vocab)) / vocab_size #estimate the size of memory used
                    else:
                        vocab[token] += 1

    vocab = {w: c for w, c in six.iteritems(vocab) if c >= min_count} #remove low-frequency words by pre-specified threshold, using six for bridging the gap between python 2 and 3
    vocab = sorted(six.iteritems(vocab), key=lambda item: item[1], reverse=True) #sort vocabulary by frequency in descending order
    save_count_vocabulary(args['<output>'], vocab)
    print ("number of tokens: " + str(tokens_num))
    print ("vocab size: " + str(len(vocab)))
    print ("low-frequency threshold: " + str(min_count if min_count > reduce_thr else reduce_thr))
    print ("corpus2vocab finished")
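
The corpus2vocab examples also rely on a getNgram helper that is not part of the excerpt. The sketch below shows the assumed behaviour: return the single token at pos for gram 1, otherwise the gram consecutive tokens starting at pos joined into one key, and None when the window runs past the end of the line. The '@$' joiner is an assumption and not confirmed by the excerpts.

def getNgram(tokens, pos, gram):
    # Hypothetical sketch of the n-gram helper used by the corpus2vocab examples.
    if pos < 0 or pos + gram > len(tokens):
        return None           # window does not fit in this sentence
    if gram == 1:
        return tokens[pos]    # plain unigram
    return "@$".join(tokens[pos:pos + gram])  # joiner string is an assumption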
Example #3
def main():
    args = docopt("""
    Usage:
        corpus2vocab.py [options] <corpus> <output>
    
    Options:
        --ngram NUM              Vocabulary includes grams of 1st to nth order [default: 1]
        --min_count NUM          Ignore words below a threshold [default: 10]
    """)

    print("**********************")
    print("corpus2vocab")
    ngram = int(args['--ngram'])
    min_count = int(args['--min_count'])
    vocab = {}  # vocabulary (stored by dictionary)

    with open(args['<corpus>']) as f:
        tokens_num = 0
        print(str(int(tokens_num / 1000**2)) + "M tokens processed.")
        for line in f:
            print("\x1b[1A" + str(int(tokens_num / 1000**2)) +
                  "M tokens processed.")  #ANSI
            tokens = line.strip().split()
            tokens_num += len(tokens)
            for pos in range(len(tokens)):
                for gram in range(1, ngram + 1):
                    token = getNgram(tokens, pos, gram)
                    if token is None:
                        continue
                    if token not in vocab:
                        vocab[token] = 1
                    else:
                        vocab[token] += 1

    vocab = {
        w: c
        for w, c in six.iteritems(vocab) if c >= min_count
    }  #remove low-frequency words by pre-specified threshold, using six for bridging the gap between python 2 and 3
    vocab = sorted(
        six.iteritems(vocab), key=lambda item: item[1],
        reverse=True)  #sort vocabulary by frequency in descending order
    save_count_vocabulary(args['<output>'], vocab)
    print("number of tokens: " + str(tokens_num))
    print("vocab size: " + str(len(vocab)))
    print("corpus2vocab finished")
Example #4
    def _counts2Vocab(self):
        counts_path = self.count_pair_file
        words = Counter()
        contexts = Counter()
        with open(counts_path) as f:
            for line in f:
                count, word, context = line.strip().split()
                count = int(count)
                words[word] += count
                contexts[context] += count

        words_items = sorted(words.items(), key=lambda x: x[1], reverse=True)
        contexts_items = sorted(contexts.items(), key=lambda x: x[1], reverse=True)

        save_count_vocabulary(counts_path + '.words.vocab', words_items)
        save_count_vocabulary(counts_path + '.contexts.vocab', contexts_items)
        self.words = words
        self.contexts = contexts
Example #5
def main():
    args = docopt("""
    Usage:
        pairs2vocab.py <pairs> <words> <contexts>
    """)

    print("**********************")
    print("pairs2vocab")
    words_path = args['<words>']
    contexts_path = args['<contexts>']

    words = {}  #center word vocabulary
    contexts = {}  #context vocabulary
    with open(args['<pairs>']) as f:
        pairs_num = 0
        for line in f:
            pairs_num += 1
            if pairs_num % 1000**2 == 0:
                sys.stdout.write("\r" + str(int(pairs_num / 1000**2)) +
                                 "M pairs processed.")
                sys.stdout.flush()
            pair = line.strip().split()
            if pair[0] not in words:
                words[pair[0]] = 1
            else:
                words[pair[0]] += 1
            if pair[1] not in contexts:
                contexts[pair[1]] = 1
            else:
                contexts[pair[1]] += 1

    words = sorted(six.iteritems(words),
                   key=lambda item: item[1],
                   reverse=True)
    contexts = sorted(six.iteritems(contexts),
                      key=lambda item: item[1],
                      reverse=True)

    save_count_vocabulary(words_path, words)
    save_count_vocabulary(contexts_path, contexts)
    print("words size: " + str(len(words)))
    print("contexts size: " + str(len(contexts)))
    print("number of pairs: " + str(pairs_num))
    print("pairs2vocab finished")
Example #6
def worker(proc_num, queue, out_dir, in_dir):
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            break

        print(proc_num, "pairs2vocab for year", year)
        words_path = out_dir + str(year) + "-w.vocab"
        contexts_path = out_dir + str(year) + "-c.vocab"

        words = {}  #center word vocabulary
        contexts = {}  #context vocabulary

        print(proc_num, "Processing pairs for year", year)
        with open(in_dir + str(year) + ".txt") as f:
            pairs_num = 0
            for line in f:
                pairs_num += 1
                if pairs_num % 1000**2 == 0:
                    print(str(int(pairs_num / 1000**2)) + "M pairs processed.")
                pair = line.strip().split()
                if pair[0] not in words:
                    words[pair[0]] = 1
                else:
                    words[pair[0]] += 1
                if pair[1] not in contexts:
                    contexts[pair[1]] = 1
                else:
                    contexts[pair[1]] += 1

        words = sorted(six.iteritems(words),
                       key=lambda item: item[1],
                       reverse=True)
        contexts = sorted(six.iteritems(contexts),
                          key=lambda item: item[1],
                          reverse=True)

        save_count_vocabulary(words_path, words)
        save_count_vocabulary(contexts_path, contexts)
        print("words size: " + str(len(words)))
        print("contexts size: " + str(len(contexts)))
        print("number of pairs: " + str(pairs_num))
        print("pairs2vocab finished")
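
The worker above drains a shared queue of years until queue.get raises Empty, so it is meant to be fanned out over several processes. Below is a minimal driver sketch under that assumption; the function name run_parallel, its arguments, and the use of multiprocessing are illustrative rather than code from the original project, and the worker itself is assumed to have from queue import Empty in scope.

from multiprocessing import Process, Queue  # Queue.get(block=False) raises queue.Empty


def run_parallel(num_procs, out_dir, in_dir, years):
    # Hypothetical driver: fill a shared queue with years, then let each
    # worker process drain it until Empty is raised.
    queue = Queue()
    for year in years:
        queue.put(year)
    procs = [Process(target=worker, args=(i, queue, out_dir, in_dir))
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()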
Example #7
def counts2vocab(counts_path):
    """
    Usage:
        counts2pmi.py <counts>
    """

    words = Counter()
    contexts = Counter()
    with open(counts_path) as f:
        for line in f:
            word, context, count = line.strip().split()
            count = int(count)
            words[word] += count
            contexts[context] += count

    words = sorted(words.items(), key=lambda item: item[1], reverse=True)
    contexts = sorted(contexts.items(), key=lambda item: item[1], reverse=True)
    save_count_vocabulary(counts_path + '.words.vocab', words)
    save_count_vocabulary(counts_path + '.contexts.vocab', contexts)
Example #8
def counts2vocab(counts_path):
    words = Counter()
    contexts = Counter()
    with open(counts_path) as f:
        for line in f:
            try:
                count, word, context = line.strip().split()
                count = int(count)
                words[word] += count
                contexts[context] += count
            except ValueError as er:
                print(line.strip())
                raise er

    words = sorted(words.items(), key=lambda item: item[1], reverse=True)
    contexts = sorted(contexts.items(), key=lambda item: item[1], reverse=True)

    save_count_vocabulary(counts_path + '.words.vocab', words)
    save_count_vocabulary(counts_path + '.contexts.vocab', contexts)
Example #9
def main():
    args = docopt("""
    Usage:
        pairs2vocab.py <pairs> <words> <contexts>
    """)

    print("**********************")
    print("pairs2vocab")
    words_path = args['<words>']
    contexts_path = args['<contexts>']

    words = {}  #center word vocabulary
    contexts = {}  #context vocabulary
    with open(args['<pairs>']) as f:
        pairs_num = 0
        print(str(int(pairs_num / 1000**2)) + "M pairs processed.")
        for line in f:
            pairs_num += 1
            if pairs_num % 1000**2 == 0:
                print("\x1b[1A" + str(int(pairs_num / 1000**2)) +
                      "M pairs processed.")
            pair = line.strip().split()
            if pair[0] not in words:
                words[pair[0]] = 1
            else:
                words[pair[0]] += 1
            if pair[1] not in contexts:
                contexts[pair[1]] = 1
            else:
                contexts[pair[1]] += 1

    words = sorted(words.items(), key=lambda item: item[1], reverse=True)
    contexts = sorted(contexts.items(),
                      key=lambda item: item[1],
                      reverse=True)

    save_count_vocabulary(words_path, words)
    save_count_vocabulary(contexts_path, contexts)
    print("words size: " + str(len(words)))
    print("contexts size: " + str(len(contexts)))
    print("number of pairs: " + str(pairs_num))
    print("pairs2vocab finished")
Example #10
def main():
    args = docopt("""
    Usage:
        counts2pmi.py <counts>
    """)

    counts_path = args['<counts>']

    words = Counter()
    contexts = Counter()
    relations = Counter()
    with gzip.open(counts_path) as f:
        for line in f:
            split = line.decode('utf-8').strip().split()
            if len(split) == 4:
                count, word, context, relation = split
            else:
                count, word, context = split
                relation = None
            count = int(count)
            words[word] += count
            contexts[context] += count
            relations[relation] += count

    words = sorted(words.items(), key=lambda item: item[1], reverse=True)
    contexts = sorted(contexts.items(),
                      key=lambda item: item[1],
                      reverse=True)
    relations = sorted(relations.items(),
                       key=lambda item: item[1],
                       reverse=True)

    save_count_vocabulary(counts_path + '.words.vocab', words)
    save_count_vocabulary(counts_path + '.contexts.vocab', contexts)
    save_count_vocabulary(counts_path + '.relations.vocab', relations)
Example #11
def main():
    args = docopt("""
    Usage:
        counts2pmi.py <counts>
    """)
    
    counts_path = args['<counts>']

    words = Counter()
    contexts = Counter()
    with open(counts_path) as f:
        for line in f:
            count, word, context = line.strip().split()
            count = int(count)
            words[word] += count
            contexts[context] += count

    words = sorted(words.items(), key=takeSecond, reverse=True)
    contexts = sorted(contexts.items(), key=takeSecond, reverse=True)

    save_count_vocabulary(counts_path + '.words.vocab', words)
    save_count_vocabulary(counts_path + '.contexts.vocab', contexts)
Example #12
def main():
    args = docopt("""
    Usage:
        counts2pmi.py <counts>
    """)
    
    counts_path = args['<counts>']

    words = Counter()
    contexts = Counter()
    with open(counts_path) as f:
        for line in f:
            count, word, context = line.strip().split()
            count = int(count)
            words[word] += count
            contexts[context] += count

    words = sorted(words.items(), key=lambda item: item[1], reverse=True)
    contexts = sorted(contexts.items(), key=lambda item: item[1], reverse=True)

    save_count_vocabulary(counts_path + '.words.vocab', words)
    save_count_vocabulary(counts_path + '.contexts.vocab', contexts)
Example #13
def main():
    args = docopt("""
    Usage:
        pairs2vocab.py <pairs> <words> <contexts>
    """)
    
    print ("**********************")
    print ("pairs2vocab")
    words_path = args['<words>']
    contexts_path = args['<contexts>']

    words = {} #center word vocabulary
    contexts = {} #context vocabulary
    with open(args['<pairs>']) as f:
        pairs_num = 0
        for line in f:
            pairs_num += 1
            if pairs_num % 1000**2 == 0:
                sys.stdout.write("\r" + str(int(pairs_num/1000**2)) + "M pairs processed.")
                sys.stdout.flush()
            pair = line.strip().split()
            if pair[0] not in words :
                words[pair[0]] = 1
            else:
                words[pair[0]] += 1      
            if pair[1] not in contexts :
                contexts[pair[1]] = 1
            else:
                contexts[pair[1]] += 1

    words = sorted(six.iteritems(words), key=lambda item: item[1], reverse=True)
    contexts = sorted(six.iteritems(contexts), key=lambda item: item[1], reverse=True)

    save_count_vocabulary(words_path, words)
    save_count_vocabulary(contexts_path, contexts)   
    print ("words size: " + str(len(words)))
    print ("contexts size: " + str(len(contexts)))
    print ("number of pairs: " + str(pairs_num))
    print ("pairs2vocab finished")
Example #14
def main():
    args = docopt("""
    Usage:
        corpus2vocab.py [options] <corpus> <output>
    
    Options:
        --ngram NUM              Vocabulary includes grams of 1st to nth order [default: 1]
        --min_count NUM          Ignore words below a threshold [default: 10]
    """)

    print ("**********************")
    print ("corpus2vocab")
    ngram = int(args['--ngram'])
    min_count = int(args['--min_count'])
    vocab = {} # vocabulary (stored by dictionary)

    with open(args['<corpus>']) as f:
        tokens_num = 0
        for line in f:
            sys.stdout.write("\r" + str(int(tokens_num/1000**2)) + "M tokens processed.") #ANSI
            tokens = line.strip().split()
            tokens_num += len(tokens)
            for pos in range(len(tokens)):            
                for gram in range(1, ngram+1):
                    token = getNgram(tokens, pos, gram)
                    if token is None :
                        continue
                    if token not in vocab :
                        vocab[token] = 1
                    else:
                        vocab[token] += 1

    vocab = {w: c for w, c in six.iteritems(vocab) if c >= min_count} #remove low-frequency words by pre-specified threshold, using six for bridging the gap between python 2 and 3
    vocab = sorted(six.iteritems(vocab), key=lambda item: item[1], reverse=True) #sort vocabulary by frequency in descending order
    save_count_vocabulary(args['<output>'], vocab)
    print ("number of tokens: " + str(tokens_num))
    print ("vocab size: " + str(len(vocab)))
    print ("corpus2vocab finished")
Example #15
def main():
    args = docopt("""
    Usage:
        corpus2vocab.py [options] <corpus> <output>
    
    Options:
        --ngram NUM              Vocabulary includes grams of 1st to nth order [default: 1]
        --memory_size NUM        Memory size available [default: 8.0]
        --min_count NUM          Ignore word below a threshold [default: 10]
        --max_length NUM         Ignore word whose length exceeds a threshold [default: 50]
    """)

    print("*********************************")
    print("corpus2vocab")
    ngram = int(args['--ngram'])
    memory_size = float(
        args['--memory_size']
    ) / 2 * 1000**3  #memory size is divided by 2 since we have to read both word and context vocabulary into memory in pairs2vocab step
    min_count = int(args['--min_count'])
    max_length = int(args['--max_length'])
    vocab = {}  # vocabulary (stored by dictionary)
    reduce_thr = 1  # remove low-frequency words when memory is insufficient
    memory_size_used = 0  # size of memory used by keys & values in dictionary (not include dictionary itself)

    with open(args['<corpus>']) as f:
        tokens_num = 0
        for line in f:
            sys.stdout.write("\r" + str(int(tokens_num / 1000**2)) +
                             "M tokens processed.")
            sys.stdout.flush()
            tokens = line.strip().split()
            tokens_num += len(tokens)
            for pos in range(len(tokens)):
                for gram in range(1, ngram + 1):
                    token = getNgram(tokens, pos, gram)
                    if token is None:
                        continue
                    if len(token) > max_length:
                        continue
                    if token not in vocab:
                        memory_size_used += getsizeof(token)
                        vocab[token] = 1
                        if memory_size_used + getsizeof(
                                vocab
                        ) > memory_size * 0.8:  #reduce vocabulary when memory is insufficient
                            reduce_thr += 1
                            vocab_size = len(vocab)
                            vocab = {
                                w: c
                                for w, c in six.iteritems(vocab)
                                if c >= reduce_thr
                            }
                            memory_size_used *= float(
                                len(vocab)
                            ) / vocab_size  #estimate the size of memory used
                    else:
                        vocab[token] += 1

    vocab = {
        w: c
        for w, c in six.iteritems(vocab) if c >= min_count
    }  #remove low-frequency words by pre-specified threshold, using six for bridging the gap between python 2 and 3
    vocab = sorted(
        six.iteritems(vocab), key=lambda item: item[1],
        reverse=True)  #sort vocabulary by frequency in descending order
    save_count_vocabulary(args['<output>'], vocab)
    print("number of tokens: " + str(tokens_num))
    print("vocab size: " + str(len(vocab)))
    print("low-frequency threshold: " +
          str(min_count if min_count > reduce_thr else reduce_thr))
    print("corpus2vocab finished")