def add_vocab(self, sentences):
    """Fold the unigram/bigram counts of `sentences` into this phrase detector.

    The counts are produced by `self.learn_vocab`. `self.corpus_word_count`
    grows by the word total that call reports. When `self.vocab` is still
    empty the freshly collected dict is adopted as-is; otherwise its counts
    are added to `self.vocab`, which is pruned once if it outgrows
    `self.max_vocab_size`.

    """
    # The new counts are tallied in their own dict first. That needs more RAM
    # than counting straight into `self.vocab`, but it lets the new sentences
    # build up sufficient counts before they compete with the (large) counts
    # accumulated by earlier learn_vocab runs.
    fresh_min_reduce, fresh_counts, n_words = self.learn_vocab(
        sentences, self.max_vocab_size, self.delimiter, self.progress_per, self.common_terms)
    self.corpus_word_count += n_words

    if len(self.vocab) == 0:
        # common case: nothing to merge into, so avoid doubling a gigantic dict
        logger.info("using %i counts as vocab in %s", len(fresh_counts), self)
        self.vocab = fresh_counts
        return

    logger.info("merging %i counts into %s", len(fresh_counts), self)
    self.min_reduce = max(self.min_reduce, fresh_min_reduce)
    for token, freq in iteritems(fresh_counts):
        self.vocab[token] += freq
    if len(self.vocab) > self.max_vocab_size:
        utils.prune_vocab(self.vocab, self.min_reduce)
        self.min_reduce += 1
    logger.info("merged %s", self)
def add_vocab(self, sentences):
    """Fold the unigram/bigram counts of `sentences` into this phrase detector.

    The counts are produced by `self.learn_vocab`. When `self.vocab` is still
    empty the freshly collected dict is adopted as-is; otherwise its counts
    are added to `self.vocab`, which is pruned once if it outgrows
    `self.max_vocab_size`.

    """
    # The new counts are tallied in their own dict first. That needs more RAM
    # than counting straight into `self.vocab`, but it lets the new sentences
    # build up sufficient counts before they compete with the (large) counts
    # accumulated by earlier learn_vocab runs.
    fresh_min_reduce, fresh_counts = self.learn_vocab(
        sentences, self.max_vocab_size, self.delimiter, self.progress_per)

    if len(self.vocab) == 0:
        # common case: nothing to merge into, so avoid doubling a gigantic dict
        logger.info("using %i counts as vocab in %s", len(fresh_counts), self)
        self.vocab = fresh_counts
        return

    logger.info("merging %i counts into %s", len(fresh_counts), self)
    self.min_reduce = max(self.min_reduce, fresh_min_reduce)
    for token, freq in iteritems(fresh_counts):
        self.vocab[token] += freq
    if len(self.vocab) > self.max_vocab_size:
        utils.prune_vocab(self.vocab, self.min_reduce)
        self.min_reduce += 1
    logger.info("merged %s", self)
def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
    """Collect unigram/bigram counts from the `sentences` iterable.

    Parameters
    ----------
    sentences : iterable of iterable
        Each sentence is a sequence of tokens; every token is passed through
        `utils.any2utf8` before counting.
    max_vocab_size : int
        Once the number of distinct entries exceeds this value after a
        sentence, `utils.prune_vocab` is called on the counts.
    delimiter : bytes, optional
        Joins the two tokens of a bigram into a single key.
    progress_per : int, optional
        A progress line is logged every `progress_per` sentences.

    Returns
    -------
    (int, defaultdict of int)
        `min_reduce` (starts at 1, incremented after every pruning pass) and
        the unigram/bigram counts.

    """
    sentence_no = -1
    total_words = 0
    logger.info("collecting all words and their counts")
    vocab = defaultdict(int)
    min_reduce = 1
    for sentence_no, sentence in enumerate(sentences):
        if sentence_no % progress_per == 0:
            # lazy %-args: the message is only formatted if the record is emitted
            logger.info(
                "PROGRESS: at sentence #%i, processed %i words and %i word types",
                sentence_no, total_words, len(vocab),
            )
        sentence = [utils.any2utf8(w) for w in sentence]
        for bigram in zip(sentence, sentence[1:]):
            vocab[bigram[0]] += 1
            vocab[delimiter.join(bigram)] += 1

        if sentence:  # add last word skipped by previous loop
            vocab[sentence[-1]] += 1

        # Count every token. Counting once per bigram missed the last word
        # of each sentence and under-reported the corpus size in the logs.
        total_words += len(sentence)

        if len(vocab) > max_vocab_size:
            utils.prune_vocab(vocab, min_reduce)
            min_reduce += 1

    logger.info(
        "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
        len(vocab), total_words, sentence_no + 1,
    )
    return min_reduce, vocab
def scan_vocab(self, documents, progress_per=10000, trim_rule=None):
    """Count the words of `documents` and register every document tag.

    Parameters
    ----------
    documents : iterable
        Each item must expose `.words` (a sized iterable of tokens) and
        `.tags` (an iterable of tags).
    progress_per : int, optional
        A progress line (including a words-per-second rate) is logged every
        `progress_per` documents.
    trim_rule : optional
        Forwarded unchanged to `utils.prune_vocab`.

    The raw counts are stored in `self.raw_vocab` and the number of scanned
    documents in `self.corpus_count`. Each tag is reported to
    `self.docvecs.note_doctag` together with the document index and length.

    """
    logger.info("collecting all words and their counts")
    last_doc_no = -1
    words_seen = 0
    prune_threshold = 1
    tick_time = default_timer() - 0.00001  # guard against next sample being identical
    tick_words = 0
    counts = defaultdict(int)
    for last_doc_no, doc in enumerate(documents):
        if last_doc_no % progress_per == 0:
            elapsed = default_timer() - tick_time
            rate = (words_seen - tick_words) / elapsed
            logger.info(
                "PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags",
                last_doc_no, words_seen, rate, len(counts), len(self.docvecs))
            # start a new measuring interval
            tick_time = default_timer()
            tick_words = words_seen

        doc_len = len(doc.words)
        for tag in doc.tags:
            self.docvecs.note_doctag(tag, last_doc_no, doc_len)

        for token in doc.words:
            counts[token] += 1
        words_seen += doc_len

        # a falsy max_vocab_size disables pruning altogether
        if self.max_vocab_size and len(counts) > self.max_vocab_size:
            utils.prune_vocab(counts, prune_threshold, trim_rule=trim_rule)
            prune_threshold += 1

    n_docs = last_doc_no + 1
    logger.info(
        "collected %i word types and %i unique tags from a corpus of %i examples and %i words",
        len(counts), len(self.docvecs), n_docs, words_seen)
    self.corpus_count = n_docs
    self.raw_vocab = counts
def learn_vocab(sentences, max_vocab_size, delimiter=b'_'):
    """Collect unigram/bigram counts from the `sentences` iterable.

    Parameters
    ----------
    sentences : iterable of iterable
        Each sentence is a sequence of tokens; every token is passed through
        `utils.any2utf8` before counting.
    max_vocab_size : int
        Once the number of distinct entries exceeds this value after a
        sentence, `utils.prune_vocab` is called on the counts.
    delimiter : bytes, optional
        Joins the two tokens of a bigram into a single key.

    Returns
    -------
    (int, defaultdict of int)
        `min_reduce` (starts at 1, incremented after every pruning pass) and
        the unigram/bigram counts.

    """
    sentence_no = -1
    total_words = 0
    logger.info("collecting all words and their counts")
    vocab = defaultdict(int)
    min_reduce = 1
    for sentence_no, sentence in enumerate(sentences):
        if sentence_no % 10000 == 0:
            # lazy %-args: the message is only formatted if the record is emitted
            logger.info(
                "PROGRESS: at sentence #%i, processed %i words and %i word types",
                sentence_no, total_words, len(vocab),
            )
        sentence = [utils.any2utf8(w) for w in sentence]
        for bigram in zip(sentence, sentence[1:]):
            vocab[bigram[0]] += 1
            vocab[delimiter.join(bigram)] += 1

        if sentence:  # add last word skipped by previous loop
            vocab[sentence[-1]] += 1

        # Count every token. Counting once per bigram missed the last word
        # of each sentence and under-reported the corpus size in the logs.
        total_words += len(sentence)

        if len(vocab) > max_vocab_size:
            utils.prune_vocab(vocab, min_reduce)
            min_reduce += 1

    logger.info(
        "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
        len(vocab), total_words, sentence_no + 1,
    )
    return min_reduce, vocab
def _learn_vocab(sentences, max_vocab_size, delimiter, connector_words, progress_per):
    """Tally unigram and bigram counts over the `sentences` iterable.

    Words found in `connector_words` are never counted on their own. They are
    only remembered so that they can sit between two non-connector words
    inside a joined phrase key.

    Returns
    -------
    (int, defaultdict of int, int)
        `min_reduce` (starts at 1, incremented after every pruning pass), the
        collected counts, and the total number of words seen.

    """
    last_sentence_no = -1
    word_total = 0
    prune_threshold = 1
    counts = defaultdict(int)
    logger.info("collecting all words and their counts")
    for last_sentence_no, sentence in enumerate(sentences):
        if last_sentence_no % progress_per == 0:
            logger.info(
                "PROGRESS: at sentence #%i, processed %i words and %i word types",
                last_sentence_no, word_total, len(counts),
            )
        anchor = None  # most recent non-connector word, if any
        bridge = []  # connector words seen since `anchor`
        for token in sentence:
            word_total += 1
            if token in connector_words:
                # connectors only matter once a phrase has been opened
                if anchor is not None:
                    bridge.append(token)
                continue
            counts[token] += 1
            if anchor is not None:
                phrase_key = delimiter.join(itertools.chain([anchor], bridge, [token]))
                counts[phrase_key] += 1
            # this word closes one phrase AND opens the next one
            anchor = token
            bridge = []

        if len(counts) > max_vocab_size:
            utils.prune_vocab(counts, prune_threshold)
            prune_threshold += 1

    logger.info(
        "collected %i token types (unigram + bigrams) from a corpus of %i words and %i sentences",
        len(counts), word_total, last_sentence_no + 1,
    )
    return prune_threshold, counts, word_total
def add_vocab(self, sentences):
    """Update model parameters with new `sentences`.

    Parameters
    ----------
    sentences : iterable of list of str
        Text corpus to update this model's parameters from.

    Example
    -------
    .. sourcecode:: pycon

        >>> from gensim.test.utils import datapath
        >>> from gensim.models.word2vec import Text8Corpus
        >>> from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
        >>>
        >>> # Train a phrase detector from a text corpus.
        >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
        >>> phrases = Phrases(sentences, connector_words=ENGLISH_CONNECTOR_WORDS)  # train model
        >>> assert len(phrases.vocab) == 37
        >>>
        >>> more_sentences = [
        ...     [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'],
        ...     [u'machine', u'learning', u'can', u'be', u'new', u'york', u'sometimes'],
        ... ]
        >>>
        >>> phrases.add_vocab(more_sentences)  # add new sentences to model
        >>> assert len(phrases.vocab) == 60

    """
    # The new counts are tallied in their own dict first. That needs more RAM
    # than counting straight into `self.vocab`, but it lets the new sentences
    # build up sufficient counts before they compete with the (large) counts
    # accumulated by earlier learn_vocab runs.
    fresh_min_reduce, fresh_counts, n_words = self._learn_vocab(
        sentences,
        max_vocab_size=self.max_vocab_size,
        delimiter=self.delimiter,
        progress_per=self.progress_per,
        connector_words=self.connector_words,
    )
    self.corpus_word_count += n_words

    if not self.vocab:
        # Common case: the current vocab is empty, so adopt the new dict
        # directly instead of holding two copies in memory.
        self.vocab = fresh_counts
    else:
        logger.info("merging %i counts into %s", len(fresh_counts), self)
        self.min_reduce = max(self.min_reduce, fresh_min_reduce)
        for token, freq in fresh_counts.items():
            self.vocab[token] = self.vocab.get(token, 0) + freq
        if len(self.vocab) > self.max_vocab_size:
            utils.prune_vocab(self.vocab, self.min_reduce)
            self.min_reduce += 1
    logger.info("merged %s", self)
def scan_vocab(self, documents, progress_per=10000, trim_rule=None, update=False):
    """Count the words of `documents` and register every tag and label.

    Parameters
    ----------
    documents : iterable
        Each item must expose `.words` (a sized iterable of tokens), `.tags`
        and `.labels` (iterables).
    progress_per : int, optional
        A progress line (including a words-per-second rate) is logged every
        `progress_per` documents.
    trim_rule : optional
        Forwarded unchanged to `utils.prune_vocab`.
    update : bool, optional
        Accepted for interface compatibility; not read by this method.

    The raw counts are stored in `self.raw_vocab` and the number of scanned
    documents in `self.corpus_count`. Tags are reported to
    `self.docvecs.note_doctag` and labels to `self.labelvecs.note_doctag`,
    each with the document index and length.

    """
    logger.info("collecting all words and their counts")
    document_no = -1
    total_words = 0
    min_reduce = 1
    interval_start = default_timer() - 0.00001  # guard against next sample being identical
    interval_count = 0
    checked_string_types = 0
    vocab = defaultdict(int)
    for document_no, document in enumerate(documents):
        # Only the first document is inspected: a plain string as `words`
        # would be iterated character by character below.
        if not checked_string_types:
            if isinstance(document.words, string_types):
                logger.warning(
                    "Each 'words' should be a list of words (usually unicode strings). "
                    "First 'words' here is instead plain %s.",
                    type(document.words),
                )
            checked_string_types += 1

        if document_no % progress_per == 0:
            interval_rate = (total_words - interval_count) / (default_timer() - interval_start)
            logger.info(
                "PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags, %i labels",
                document_no, total_words, interval_rate, len(vocab),
                len(self.docvecs), len(self.labelvecs),
            )
            # start a new measuring interval
            interval_start = default_timer()
            interval_count = total_words

        document_length = len(document.words)

        for tag in document.tags:
            self.docvecs.note_doctag(tag, document_no, document_length)

        for label in document.labels:
            self.labelvecs.note_doctag(label, document_no, document_length)

        for word in document.words:
            vocab[word] += 1
        total_words += len(document.words)

        # a falsy max_vocab_size disables pruning altogether
        if self.max_vocab_size and len(vocab) > self.max_vocab_size:
            utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
            min_reduce += 1

    logger.info(
        "collected %i word types and %i unique tags and %i unique labels from a corpus of %i examples and %i words",
        len(vocab), len(self.docvecs), len(self.labelvecs), document_no + 1, total_words,
    )
    self.corpus_count = document_no + 1
    self.raw_vocab = vocab
def add_vocab(self, sentences):
    """Update model with new `sentences`.

    Parameters
    ----------
    sentences : iterable of list of str
        Text corpus.

    Example
    -------
    .. sourcecode:: pycon

        >>> from gensim.test.utils import datapath
        >>> from gensim.models.word2vec import Text8Corpus
        >>> from gensim.models.phrases import Phrases
        >>> # Create corpus and use it for phrase detector
        >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
        >>> phrases = Phrases(sentences)  # train model
        >>> assert len(phrases.vocab) == 37
        >>>
        >>> more_sentences = [
        ...     [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'],
        ...     [u'machine', u'learning', u'can', u'be', u'new', u'york', u'sometimes']
        ... ]
        >>>
        >>> phrases.add_vocab(more_sentences)  # add new sentences to model
        >>> assert len(phrases.vocab) == 60

    """
    # The new counts are tallied in their own dict first. That needs more RAM
    # than counting straight into `self.vocab`, but it lets the new sentences
    # build up sufficient counts before they compete with the (large) counts
    # accumulated by earlier learn_vocab runs.
    fresh_min_reduce, fresh_counts, n_words = self.learn_vocab(
        sentences, self.max_vocab_size, self.delimiter, self.progress_per, self.common_terms)
    self.corpus_word_count += n_words

    if len(self.vocab) == 0:
        # common case: nothing to merge into, so avoid doubling a gigantic dict
        logger.info("using %i counts as vocab in %s", len(fresh_counts), self)
        self.vocab = fresh_counts
        return

    logger.info("merging %i counts into %s", len(fresh_counts), self)
    self.min_reduce = max(self.min_reduce, fresh_min_reduce)
    for token, freq in iteritems(fresh_counts):
        self.vocab[token] += freq
    if len(self.vocab) > self.max_vocab_size:
        utils.prune_vocab(self.vocab, self.min_reduce)
        self.min_reduce += 1
    logger.info("merged %s", self)
def add_vocab(self, sentences):
    """Update model with new `sentences`.

    Parameters
    ----------
    sentences : iterable of list of str
        Text corpus.

    Example
    -------
    .. sourcecode:: pycon

        >>> from gensim.test.utils import datapath
        >>> from gensim.models.word2vec import Text8Corpus
        >>> from gensim.models.phrases import Phrases
        >>> # Create corpus and use it for phrase detector
        >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
        >>> phrases = Phrases(sentences)  # train model
        >>> assert len(phrases.vocab) == 37
        >>>
        >>> more_sentences = [
        ...     [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'],
        ...     [u'machine', u'learning', u'can', u'be', u'new', u'york', u'sometimes']
        ... ]
        >>>
        >>> phrases.add_vocab(more_sentences)  # add new sentences to model
        >>> assert len(phrases.vocab) == 60

    """
    # A scratch dict receives the counts of `sentences` before anything is
    # merged. Holding it costs extra RAM compared with counting directly into
    # `self.vocab`, but the new sentences get a fair chance to gather
    # sufficient counts instead of being pruned against the (large) counts
    # accumulated by previous learn_vocab runs.
    incoming_min_reduce, incoming, word_total = self.learn_vocab(
        sentences, self.max_vocab_size, self.delimiter, self.progress_per, self.common_terms)
    self.corpus_word_count += word_total

    nothing_learned_yet = len(self.vocab) == 0
    if nothing_learned_yet:
        # common case: adopt the dict directly rather than copying it
        logger.info("using %i counts as vocab in %s", len(incoming), self)
        self.vocab = incoming
    else:
        logger.info("merging %i counts into %s", len(incoming), self)
        self.min_reduce = max(self.min_reduce, incoming_min_reduce)
        for key, occurrences in iteritems(incoming):
            self.vocab[key] += occurrences
        if len(self.vocab) > self.max_vocab_size:
            utils.prune_vocab(self.vocab, self.min_reduce)
            self.min_reduce += 1
        logger.info("merged %s", self)
def scan_vocab(self, sentences, progress_per=10000):
    """Make a first pass over `sentences` and count every word.

    Each sentence is iterated as a sequence of `(word, topic)` pairs; only
    the word is counted, the topic is ignored here. The counts are stored in
    `self.raw_vocab` and the number of sentences in `self.corpus_count`.

    """
    logger.info("collecting all words and their counts")
    last_sentence_no = -1
    pruned_total = 0  # grows by whatever utils.prune_vocab returns
    prune_threshold = 1
    counts = defaultdict(int)
    for last_sentence_no, sentence in enumerate(sentences):
        if last_sentence_no % progress_per == 0:
            logger.info(
                "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
                last_sentence_no, sum(itervalues(counts)) + pruned_total, len(counts),
            )
        # every element of a sentence carries a topic next to the word
        for token, _topic in sentence:
            counts[token] += 1

        # a falsy max_vocab_size disables pruning altogether
        if self.max_vocab_size and len(counts) > self.max_vocab_size:
            pruned_total += utils.prune_vocab(counts, prune_threshold)
            prune_threshold += 1

    raw_word_total = pruned_total + sum(itervalues(counts))
    n_sentences = last_sentence_no + 1
    logger.info(
        "collected %i word types from a corpus of %i raw words and %i sentences",
        len(counts), raw_word_total, n_sentences,
    )
    self.corpus_count = n_sentences
    self.raw_vocab = counts
def scan_vocab(self, sentences, progress_per=10000):
    """Make a first pass over `sentences` and count every word.

    The counts are stored in `self.raw_vocab` and the number of sentences in
    `self.corpus_count`. The logged raw word total is the sum of the
    surviving counts plus whatever `utils.prune_vocab` returned on each
    pruning pass.

    """
    logger.info("collecting all words and their counts")
    last_sentence_no = -1
    pruned_total = 0  # grows by whatever utils.prune_vocab returns
    prune_threshold = 1
    counts = defaultdict(int)
    for last_sentence_no, sentence in enumerate(sentences):
        if last_sentence_no % progress_per == 0:
            logger.info(
                "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
                last_sentence_no, sum(itervalues(counts)) + pruned_total, len(counts))
        for token in sentence:
            counts[token] += 1

        # a falsy max_vocab_size disables pruning altogether
        if self.max_vocab_size and len(counts) > self.max_vocab_size:
            pruned_total += utils.prune_vocab(counts, prune_threshold)
            prune_threshold += 1

    raw_word_total = pruned_total + sum(itervalues(counts))
    n_sentences = last_sentence_no + 1
    logger.info(
        "collected %i word types from a corpus of %i raw words and %i sentences",
        len(counts), raw_word_total, n_sentences)
    self.corpus_count = n_sentences
    self.raw_vocab = counts
def scan_vocab(self, documents, docvecs, progress_per=10000, trim_rule=None):
    """Count the words of `documents` and register every document tag.

    Parameters
    ----------
    documents : iterable
        Each item must expose `.words` (a sized iterable of tokens) and
        `.tags` (an iterable of tags).
    docvecs : object
        Passed to `self.note_doctag` for every tag; its `.count` attribute is
        reported in the log messages.
    progress_per : int, optional
        A progress line (including a words-per-second rate) is logged every
        `progress_per` documents.
    trim_rule : optional
        Forwarded unchanged to `utils.prune_vocab`.

    Returns
    -------
    (int, int)
        Total number of words seen and number of documents scanned. The raw
        counts themselves are stored in `self.raw_vocab`.

    """
    logger.info("collecting all words and their counts")
    last_doc_no = -1
    words_seen = 0
    prune_threshold = 1
    tick_time = default_timer() - 0.00001  # guard against next sample being identical
    tick_words = 0
    first_doc_checked = 0
    counts = defaultdict(int)
    for last_doc_no, doc in enumerate(documents):
        # Only the first document is inspected for the "plain string" mistake.
        if not first_doc_checked:
            if isinstance(doc.words, string_types):
                logger.warning(
                    "Each 'words' should be a list of words (usually unicode strings). "
                    "First 'words' here is instead plain %s.",
                    type(doc.words)
                )
            first_doc_checked += 1

        if last_doc_no % progress_per == 0:
            elapsed = default_timer() - tick_time
            rate = (words_seen - tick_words) / elapsed
            logger.info(
                "PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags",
                last_doc_no, words_seen, rate, len(counts), docvecs.count
            )
            # start a new measuring interval
            tick_time = default_timer()
            tick_words = words_seen

        doc_len = len(doc.words)
        for tag in doc.tags:
            self.note_doctag(tag, last_doc_no, doc_len, docvecs)

        for token in doc.words:
            counts[token] += 1
        words_seen += doc_len

        # a falsy max_vocab_size disables pruning altogether
        if self.max_vocab_size and len(counts) > self.max_vocab_size:
            utils.prune_vocab(counts, prune_threshold, trim_rule=trim_rule)
            prune_threshold += 1

    n_docs = last_doc_no + 1
    logger.info(
        "collected %i word types and %i unique tags from a corpus of %i examples and %i words",
        len(counts), docvecs.count, n_docs, words_seen
    )
    self.raw_vocab = counts
    return words_seen, n_docs
def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000, common_terms=frozenset()):
    """Tally unigram/bigram counts over the `sentences` iterable.

    Tokens are passed through `utils.any2utf8` first. Tokens found in
    `common_terms` are never counted on their own; they are only remembered
    so that they can sit between two uncommon tokens inside a joined key.

    Returns
    -------
    (int, defaultdict of int, int)
        `min_reduce` (starts at 1, incremented after every pruning pass), the
        collected counts, and the total number of tokens seen.

    """
    last_sentence_no = -1
    word_total = 0
    logger.info("collecting all words and their counts")
    counts = defaultdict(int)
    prune_threshold = 1
    for last_sentence_no, sentence in enumerate(sentences):
        if last_sentence_no % progress_per == 0:
            logger.info(
                "PROGRESS: at sentence #%i, processed %i words and %i word types",
                last_sentence_no, word_total, len(counts),
            )
        encoded = [utils.any2utf8(w) for w in sentence]
        anchor = None  # most recent uncommon token, if any
        bridge = []  # common terms seen since `anchor`
        for token in encoded:
            word_total += 1
            if token in common_terms:
                # common terms only matter once a phrase has been opened
                if anchor is not None:
                    bridge.append(token)
                continue
            counts[token] += 1
            if anchor is not None:
                counts[delimiter.join(it.chain([anchor], bridge, [token]))] += 1
            anchor = token
            bridge = []

        if len(counts) > max_vocab_size:
            utils.prune_vocab(counts, prune_threshold)
            prune_threshold += 1

    logger.info(
        "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
        len(counts), word_total, last_sentence_no + 1)
    return prune_threshold, counts, word_total
def add_vocab(self, sentences):
    """Fold the unigram/bigram counts of `sentences` into this phrase detector.

    The counts are produced by `self.learn_vocab` and always added to
    `self.vocab`, which is pruned once afterwards if it outgrows
    `self.max_vocab_size`.

    """
    # The new counts are tallied in their own dict first. That needs more RAM
    # than counting straight into `self.vocab`, but it lets the new sentences
    # build up sufficient counts before they compete with the (large) counts
    # accumulated by earlier learn_vocab runs.
    fresh_min_reduce, fresh_counts = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter)

    logger.info("merging %i counts into %s", len(fresh_counts), self)
    self.min_reduce = max(self.min_reduce, fresh_min_reduce)
    for token, freq in iteritems(fresh_counts):
        self.vocab[token] += freq

    vocab_too_large = len(self.vocab) > self.max_vocab_size
    if vocab_too_large:
        utils.prune_vocab(self.vocab, self.min_reduce)
        self.min_reduce += 1
    logger.info("merged %s", self)
def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000, common_terms=frozenset()):
    """Tally unigram/bigram counts over the `sentences` iterable.

    Tokens are passed through `utils.any2utf8` first. Tokens found in
    `common_terms` are never counted on their own; they are only remembered
    so that they can sit between two uncommon tokens inside a joined key.

    Returns
    -------
    (int, defaultdict of int, int)
        `min_reduce` (starts at 1, incremented after every pruning pass), the
        collected counts, and the total number of tokens seen.

    """
    seen_sentence_no = -1
    n_tokens = 0
    logger.info("collecting all words and their counts")
    tally = defaultdict(int)
    threshold = 1
    for seen_sentence_no, sentence in enumerate(sentences):
        if seen_sentence_no % progress_per == 0:
            logger.info(
                "PROGRESS: at sentence #%i, processed %i words and %i word types",
                seen_sentence_no, n_tokens, len(tally),
            )
        encoded = [utils.any2utf8(w) for w in sentence]
        phrase_start = None  # most recent uncommon token, if any
        pending = []  # common terms seen since `phrase_start`
        for token in encoded:
            n_tokens += 1
            is_common = token in common_terms
            if is_common and phrase_start is not None:
                pending.append(token)
            if is_common:
                continue
            tally[token] += 1
            if phrase_start is not None:
                pieces = it.chain([phrase_start], pending, [token])
                tally[delimiter.join(pieces)] += 1
            phrase_start, pending = token, []

        if len(tally) > max_vocab_size:
            utils.prune_vocab(tally, threshold)
            threshold += 1

    logger.info(
        "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
        len(tally), n_tokens, seen_sentence_no + 1
    )
    return threshold, tally, n_tokens
def scan_vocab(self, train_pairs, progress_per=100):
    """Scan `train_pairs` once and build the raw vocabularies and index tables.

    Parameters
    ----------
    train_pairs : iterable of (utter, myast)
        `utter` must provide `split()`; `myast` must provide
        `randomized_partial_trees(...)`, whose items unpack into
        `(terminals, _, parent, children)`.
        NOTE(review): presumably `utter` is a natural-language string and
        `myast` an AST wrapper -- confirm against the caller.
    progress_per : int, optional
        A progress line is logged every `progress_per` pairs.

    Attributes set on `self`: `corpus_count`, `raw_vocab_l`, `index2word_i`,
    `vocab_i`, `index2word_k`, `vocab_k`, `vocab_r`, `index2word_r`,
    `r_idx_offset` and `total_r_entries`.

    """
    logger.info("collecting all words and their counts")
    train_pair_no = -1
    total_ptrees = 0
    min_reduce = 1
    vocab_l_raw = defaultdict(int)  # token -> count, from utter.split()
    vocab_i = set()  # distinct simplified parents
    vocab_k = set()  # distinct simplified terminals
    vocab_r = defaultdict(lambda: defaultdict(int))  # parent -> children -> count
    for train_pair_no, (utter, myast) in enumerate(train_pairs):
        if train_pair_no % progress_per == 0:
            logger.info("PROGRESS: at train_pair #%i. Processed %i partial trees.", train_pair_no, total_ptrees)
        l_tokens = utter.split()
        for w in l_tokens:
            vocab_l_raw[w] += 1
        if self.max_vocab_size and len(vocab_l_raw) > self.max_vocab_size:
            # Pruning only applies to NL queries
            gsutils.prune_vocab(vocab_l_raw, min_reduce)
            min_reduce += 1
        for ptree in myast.randomized_partial_trees(
                self.random, self.memsize_k, self.memsize_i):
            terminals, _, parent, children = ptree
            # Replace every node by its getSimple() form before it is used as a key.
            terminals = map(lambda x: x.getSimple(), terminals)
            parent = parent.getSimple()
            if isinstance(children, (list, tuple)):
                # a sequence of children becomes a tuple so it can be a dict key
                children = tuple(map(lambda x: x.getSimple(), children))
            else:
                children = children.getSimple()
            vocab_k |= set(terminals)
            vocab_i.add(parent)
            vocab_r[parent][children] += 1
            total_ptrees += 1
    self.corpus_count = train_pair_no + 1  # this is needed somewhere else
    self.raw_vocab_l = vocab_l_raw
    self.index2word_i = sorted(vocab_i)
    self.index2word_i.insert(0, None)  # 0th position is for the padding token
    self.vocab_i = dict((y,x) for (x,y) in enumerate(self.index2word_i))
    self.index2word_k = sorted(vocab_k)
    self.index2word_k.insert(0, None)  # 0th position is for the padding token
    self.vocab_k = dict((y,x) for (x,y) in enumerate(self.index2word_k))
    self.vocab_r = {}
    self.index2word_r = {}
    self.r_idx_offset = {}
    self.total_r_entries = 0
    for parent in sorted(vocab_r.keys()):
        # Offset of this parent's table = number of entries in all tables before it.
        self.r_idx_offset[parent] = self.total_r_entries
        self.vocab_r[parent] = {}
        self.index2word_r[parent] = []
        # vocab_r[parent] may contain a mixture of SimpleAstNodes and tuples. To
        # sort them properly, we need to sort them separately.
        single_childrens = sorted(x for x in vocab_r[parent].keys() if isinstance(x, SimpleAstNode))
        tuple_childrens = sorted(x for x in vocab_r[parent].keys() if isinstance(x, tuple))
        combined_sorted_children = single_childrens + tuple_childrens
        # Fails if some key is neither a SimpleAstNode nor a tuple.
        assert len(combined_sorted_children) == len(vocab_r[parent])
        for children in combined_sorted_children:
            # index = position of this entry inside the parent's own table
            self.vocab_r[parent][children] = Vocab(
                count=vocab_r[parent][children],
                index=len(self.index2word_r[parent]))
            self.index2word_r[parent].append(children)
        self.total_r_entries += len(self.index2word_r[parent])
    """ After this point, vocab_i and vocab_k are complete. The only things left are: - scale and finalize vocab_r (because of min_count, downsampling, sorting, etc.). - build a cum_table for negative sampling of vocab_r. """
    logger.info("Collected %i NL word types", len(vocab_l_raw))
    logger.info("Collected %i terminal types", len(self.vocab_k))
    logger.info("Collected %i non-terminal types", len(self.vocab_i))
    logger.info("Created %i production tables", len(self.vocab_r))
    logger.info("Collected %i total production entries", self.total_r_entries)
def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000,
                common_terms=frozenset(), doc2vec=False):
    """Collect unigram/bigram counts from the `sentences` iterable.

    Parameters
    ----------
    sentences : iterable of list of str
        The `sentences` iterable can be simply a list, but for larger corpora,
        consider a generator that streams the sentences directly from disk/network,
        See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
        or :class:`~gensim.models.word2vec.LineSentence` for such examples.
    max_vocab_size : int
        Maximum number of distinct entries kept in the counts. Whenever the
        counts grow beyond it after a sentence, `utils.prune_vocab` is called.
    delimiter : str, optional
        Glue character used to join collocation tokens, should be a byte string (e.g. b'_').
    progress_per : int
        Write logs every `progress_per` sentence.
    common_terms : set of str, optional
        List of "stop words" that won't affect frequency count of expressions containing them.
        Allow to detect expressions like "bank_of_america" or "eye_of_the_beholder".
    doc2vec : bool, optional
        When true, each item of `sentences` is replaced by its `.words`
        attribute before counting.

    Return
    ------
    (int, dict of (str, int), int)
        `min_reduce` (starts at 1, incremented after every pruning pass),
        counters for each word/bi-gram and total number of words.

    Example
    ----------
    >>> from gensim.test.utils import datapath
    >>> from gensim.models.word2vec import Text8Corpus
    >>> from gensim.models.phrases import Phrases
    >>>
    >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
    >>> pruned_words, counters, total_words = Phrases.learn_vocab(sentences, 100)
    >>> (pruned_words, total_words)
    (1, 29)
    >>> counters['computer']
    2
    >>> counters['response_time']
    1

    """
    last_sentence_no = -1
    word_total = 0
    logger.info("collecting all words and their counts")
    counts = defaultdict(int)
    prune_threshold = 1
    for last_sentence_no, sentence in enumerate(sentences):
        if last_sentence_no % progress_per == 0:
            logger.info(
                "PROGRESS: at sentence #%i, processed %i words and %i word types",
                last_sentence_no, word_total, len(counts),
            )
        if doc2vec:
            # the item wraps its tokens in a `.words` attribute
            sentence = sentence.words
        encoded = [utils.any2utf8(w) for w in sentence]
        anchor = None  # most recent uncommon token, if any
        bridge = []  # common terms seen since `anchor`
        for token in encoded:
            word_total += 1
            if token in common_terms:
                # common terms only matter once a phrase has been opened
                if anchor is not None:
                    bridge.append(token)
                continue
            counts[token] += 1
            if anchor is not None:
                counts[delimiter.join(it.chain([anchor], bridge, [token]))] += 1
            anchor = token
            bridge = []

        if len(counts) > max_vocab_size:
            utils.prune_vocab(counts, prune_threshold)
            prune_threshold += 1

    logger.info(
        "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
        len(counts), word_total, last_sentence_no + 1)
    return prune_threshold, counts, word_total
def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000,
                common_terms=frozenset()):
    """Collect unigram/bigram counts from the `sentences` iterable.

    Parameters
    ----------
    sentences : iterable of list of str
        The `sentences` iterable can be simply a list, but for larger corpora,
        consider a generator that streams the sentences directly from disk/network,
        See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
        or :class:`~gensim.models.word2vec.LineSentence` for such examples.
    max_vocab_size : int
        Maximum number of distinct entries kept in the counts. Whenever the
        counts grow beyond it after a sentence, `utils.prune_vocab` is called.
    delimiter : str, optional
        Glue character used to join collocation tokens, should be a byte string (e.g. b'_').
    progress_per : int
        Write logs every `progress_per` sentence.
    common_terms : set of str, optional
        List of "stop words" that won't affect frequency count of expressions containing them.
        Allow to detect expressions like "bank_of_america" or "eye_of_the_beholder".

    Return
    ------
    (int, dict of (str, int), int)
        `min_reduce` (starts at 1, incremented after every pruning pass),
        counters for each word/bi-gram and total number of words.

    Example
    ----------
    .. sourcecode:: pycon

        >>> from gensim.test.utils import datapath
        >>> from gensim.models.word2vec import Text8Corpus
        >>> from gensim.models.phrases import Phrases
        >>>
        >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
        >>> pruned_words, counters, total_words = Phrases.learn_vocab(sentences, 100)
        >>> (pruned_words, total_words)
        (1, 29)
        >>> counters['computer']
        2
        >>> counters['response_time']
        1

    """
    seen_sentence_no = -1
    n_tokens = 0
    logger.info("collecting all words and their counts")
    tally = defaultdict(int)
    threshold = 1
    for seen_sentence_no, sentence in enumerate(sentences):
        if seen_sentence_no % progress_per == 0:
            logger.info(
                "PROGRESS: at sentence #%i, processed %i words and %i word types",
                seen_sentence_no, n_tokens, len(tally),
            )
        encoded = [utils.any2utf8(w) for w in sentence]
        phrase_start = None  # most recent uncommon token, if any
        pending = []  # common terms seen since `phrase_start`
        for token in encoded:
            n_tokens += 1
            if token in common_terms:
                # common terms only matter once a phrase has been opened
                if phrase_start is not None:
                    pending.append(token)
                continue
            tally[token] += 1
            if phrase_start is not None:
                pieces = it.chain([phrase_start], pending, [token])
                tally[delimiter.join(pieces)] += 1
            phrase_start, pending = token, []

        if len(tally) > max_vocab_size:
            utils.prune_vocab(tally, threshold)
            threshold += 1

    logger.info(
        "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
        len(tally), n_tokens, seen_sentence_no + 1
    )
    return threshold, tally, n_tokens
def scan_vocab(self, train_pairs, progress_per=100):
    """Scan `train_pairs` once and build the raw vocabularies and index tables.

    Parameters
    ----------
    train_pairs : iterable of (utter, myast)
        `utter` must provide `split()`; `myast` must provide
        `randomized_partial_trees(...)`, whose items unpack into
        `(terminals, _, parent, children)`.
        NOTE(review): presumably `utter` is a natural-language string and
        `myast` an AST wrapper -- confirm against the caller.
    progress_per : int, optional
        A progress line is logged every `progress_per` pairs.

    Attributes set on `self`: `corpus_count`, `raw_vocab_l`, `index2word_i`,
    `vocab_i`, `index2word_k`, `vocab_k`, `vocab_r`, `index2word_r`,
    `r_idx_offset` and `total_r_entries`.

    """
    logger.info("collecting all words and their counts")
    train_pair_no = -1
    total_ptrees = 0
    min_reduce = 1
    vocab_l_raw = defaultdict(int)  # token -> count, from utter.split()
    vocab_i = set()  # distinct simplified parents
    vocab_k = set()  # distinct simplified terminals
    vocab_r = defaultdict(lambda: defaultdict(int))  # parent -> children -> count
    for train_pair_no, (utter, myast) in enumerate(train_pairs):
        if train_pair_no % progress_per == 0:
            logger.info(
                "PROGRESS: at train_pair #%i. Processed %i partial trees.",
                train_pair_no, total_ptrees)
        l_tokens = utter.split()
        for w in l_tokens:
            vocab_l_raw[w] += 1
        if self.max_vocab_size and len(vocab_l_raw) > self.max_vocab_size:
            # Pruning only applies to NL queries
            gsutils.prune_vocab(vocab_l_raw, min_reduce)
            min_reduce += 1
        for ptree in myast.randomized_partial_trees(
                self.random, self.memsize_k, self.memsize_i):
            terminals, _, parent, children = ptree
            # Replace every node by its getSimple() form before it is used as a key.
            terminals = map(lambda x: x.getSimple(), terminals)
            parent = parent.getSimple()
            if isinstance(children, (list, tuple)):
                # a sequence of children becomes a tuple so it can be a dict key
                children = tuple(map(lambda x: x.getSimple(), children))
            else:
                children = children.getSimple()
            vocab_k |= set(terminals)
            vocab_i.add(parent)
            vocab_r[parent][children] += 1
            total_ptrees += 1
    self.corpus_count = train_pair_no + 1  # this is needed somewhere else
    self.raw_vocab_l = vocab_l_raw
    self.index2word_i = sorted(vocab_i)
    self.index2word_i.insert(0, None)  # 0th position is for the padding token
    self.vocab_i = dict((y, x) for (x, y) in enumerate(self.index2word_i))
    self.index2word_k = sorted(vocab_k)
    self.index2word_k.insert(0, None)  # 0th position is for the padding token
    self.vocab_k = dict((y, x) for (x, y) in enumerate(self.index2word_k))
    self.vocab_r = {}
    self.index2word_r = {}
    self.r_idx_offset = {}
    self.total_r_entries = 0
    for parent in sorted(vocab_r.keys()):
        # Offset of this parent's table = number of entries in all tables before it.
        self.r_idx_offset[parent] = self.total_r_entries
        self.vocab_r[parent] = {}
        self.index2word_r[parent] = []
        # vocab_r[parent] may contain a mixture of SimpleAstNodes and tuples. To
        # sort them properly, we need to sort them separately.
        single_childrens = sorted(x for x in vocab_r[parent].keys() if isinstance(x, SimpleAstNode))
        tuple_childrens = sorted(x for x in vocab_r[parent].keys() if isinstance(x, tuple))
        combined_sorted_children = single_childrens + tuple_childrens
        # Fails if some key is neither a SimpleAstNode nor a tuple.
        assert len(combined_sorted_children) == len(vocab_r[parent])
        for children in combined_sorted_children:
            # index = position of this entry inside the parent's own table
            self.vocab_r[parent][children] = Vocab(
                count=vocab_r[parent][children],
                index=len(self.index2word_r[parent]))
            self.index2word_r[parent].append(children)
        self.total_r_entries += len(self.index2word_r[parent])
    """ After this point, vocab_i and vocab_k are complete. The only things left are: - scale and finalize vocab_r (because of min_count, downsampling, sorting, etc.). - build a cum_table for negative sampling of vocab_r. """
    logger.info("Collected %i NL word types", len(vocab_l_raw))
    logger.info("Collected %i terminal types", len(self.vocab_k))
    logger.info("Collected %i non-terminal types", len(self.vocab_i))
    logger.info("Created %i production tables", len(self.vocab_r))
    logger.info("Collected %i total production entries", self.total_r_entries)