# Read corpus and extract key phrases.
def worker(text):
    """Run the NLP pipeline on one document and collect its phrases.

    Kept at module level so it can be handed to ``Pool.imap``.

    Args:
        text: Raw text of a single corpus document.

    Returns:
        A ``(phrases, num_words)`` tuple: the ``.text`` of every entry in
        ``doc._.phrases`` (in the order the pipeline yields them) and
        ``len(doc)``.
    """
    doc = nlp(text)
    phrases = [phrase.text for phrase in doc._.phrases]
    return phrases, len(doc)


total_words = 0
vocab = collections.Counter()
# Loop-invariant; a value <= 0 means "no per-document limit".
max_phrases = options.maxphrases_per_doc

# The pool gets its own name: the original bound it to `p` and then reused
# `p` as the inner loop variable, losing the only reference to the pool.
pool = Pool(options.nproc)
try:
    for phrases, num_words in tqdm(pool.imap(worker, corpus)):
        # Note: this count includes punctuation as well as words
        # (presumably len(doc) counts every token -- confirm against nlp).
        total_words += num_words

        # Examine the leading phrases of the document (presumably ordered
        # by rank -- confirm against the phrase extractor in use).
        seen = 0
        for phrase in phrases:
            # Skip single-word phrases.
            if len(phrase.split()) == 1:
                continue
            vocab[phrase] += 1
            seen += 1
            if max_phrases > 0 and seen == max_phrases:
                break
finally:
    # On the success path every result has already been consumed, so
    # terminating is safe; on error it stops outstanding work instead of
    # leaving worker processes behind.
    pool.terminate()
    pool.join()

for k in sorted(vocab):
    print('{} {}'.format(k, vocab[k]))

print('corpus-size={} total-words={} total-vocab={}'.format(
    len(corpus), total_words, len(vocab)))