Example no. 1
    def add_vocab(self, sentences):
        """
        Merge the collected counts `vocab` into this phrase detector.

        """
        # uses a separate vocab to collect the token counts from `sentences`.
        # this consumes more RAM than merging new sentences into `self.vocab`
        # directly, but gives the new sentences a fighting chance to collect
        # sufficient counts, before being pruned out by the (large) accumulated
        # counts collected in previous learn_vocab runs.
        min_reduce, vocab, total_words = self.learn_vocab(
            sentences, self.max_vocab_size, self.delimiter, self.progress_per, self.common_terms)

        self.corpus_word_count += total_words
        if len(self.vocab) > 0:
            logger.info("merging %i counts into %s", len(vocab), self)
            self.min_reduce = max(self.min_reduce, min_reduce)
            for word, count in iteritems(vocab):
                self.vocab[word] += count
            if len(self.vocab) > self.max_vocab_size:
                utils.prune_vocab(self.vocab, self.min_reduce)
                self.min_reduce += 1
            logger.info("merged %s", self)
        else:
            # in common case, avoid doubling gigantic dict
            logger.info("using %i counts as vocab in %s", len(vocab), self)
            self.vocab = vocab
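A minimal usage sketch for the incremental API above, assuming an older gensim release (3.x) whose `Phrases` still exposes `add_vocab` with this signature; the toy sentences below are made up for illustration.

from gensim.models.phrases import Phrases

first_batch = [
    ["new", "york", "is", "big"],
    ["new", "york", "has", "a", "mayor"],
]
second_batch = [
    ["machine", "learning", "in", "new", "york"],
]

phrases = Phrases(first_batch, min_count=1, threshold=1)  # initial vocab built in the constructor
phrases.add_vocab(second_batch)                           # merge counts from the new sentences
print(len(phrases.vocab))                                 # vocabulary now covers both batches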
Example no. 2
    def add_vocab(self, sentences):
        """
        Merge the collected counts `vocab` into this phrase detector.

        """
        # uses a separate vocab to collect the token counts from `sentences`.
        # this consumes more RAM than merging new sentences into `self.vocab`
        # directly, but gives the new sentences a fighting chance to collect
        # sufficient counts, before being pruned out by the (large) accumulated
        # counts collected in previous learn_vocab runs.
        min_reduce, vocab = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per)

        if len(self.vocab) > 0:
            logger.info("merging %i counts into %s", len(vocab), self)
            self.min_reduce = max(self.min_reduce, min_reduce)
            for word, count in iteritems(vocab):
                self.vocab[word] += count
            if len(self.vocab) > self.max_vocab_size:
                utils.prune_vocab(self.vocab, self.min_reduce)
                self.min_reduce += 1
            logger.info("merged %s", self)
        else:
            # in common case, avoid doubling gigantic dict
            logger.info("using %i counts as vocab in %s", len(vocab), self)
            self.vocab = vocab
Example no. 3
    def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
        """Collect unigram/bigram counts from the `sentences` iterable."""
        sentence_no = -1
        total_words = 0
        logger.info("collecting all words and their counts")
        vocab = defaultdict(int)
        min_reduce = 1
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % progress_per == 0:
                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                            (sentence_no, total_words, len(vocab)))
            sentence = [utils.any2utf8(w) for w in sentence]
            for bigram in zip(sentence, sentence[1:]):
                vocab[bigram[0]] += 1
                vocab[delimiter.join(bigram)] += 1
                total_words += 1

            if sentence:  # add last word skipped by previous loop
                word = sentence[-1]
                vocab[word] += 1

            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
                    (len(vocab), total_words, sentence_no + 1))
        return min_reduce, vocab
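The pairwise counting above can be illustrated on a toy sentence; the helper below is a self-contained sketch that mirrors the inner loop, not the gensim code itself.

from collections import defaultdict

def count_unigrams_and_bigrams(sentence, delimiter=b"_"):
    # Count every token plus every adjacent pair joined by the delimiter,
    # mirroring the inner loop of learn_vocab above.
    vocab = defaultdict(int)
    tokens = [w.encode("utf8") for w in sentence]
    for first, second in zip(tokens, tokens[1:]):
        vocab[first] += 1
        vocab[delimiter.join((first, second))] += 1
    if tokens:  # the last token is skipped by the pairwise loop
        vocab[tokens[-1]] += 1
    return dict(vocab)

print(count_unigrams_and_bigrams(["new", "york", "city"]))
# {b'new': 1, b'new_york': 1, b'york': 1, b'york_city': 1, b'city': 1}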
Example no. 4
    def scan_vocab(self, documents, progress_per=10000, trim_rule=None):
        logger.info("collecting all words and their counts")
        document_no = -1
        total_words = 0
        min_reduce = 1
        interval_start = default_timer() - 0.00001  # guard against next sample being identical
        interval_count = 0
        vocab = defaultdict(int)
        for document_no, document in enumerate(documents):
            if document_no % progress_per == 0:
                interval_rate = (total_words - interval_count) / (default_timer() - interval_start)
                logger.info("PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags",
                            document_no, total_words, interval_rate, len(vocab), len(self.docvecs))
                interval_start = default_timer()
                interval_count = total_words
            document_length = len(document.words)

            for tag in document.tags:
                self.docvecs.note_doctag(tag, document_no, document_length)

            for word in document.words:
                vocab[word] += 1
            total_words += len(document.words)

            if self.max_vocab_size and len(vocab) > self.max_vocab_size:
                utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
                min_reduce += 1

        logger.info("collected %i word types and %i unique tags from a corpus of %i examples and %i words",
                    len(vocab), len(self.docvecs), document_no + 1, total_words)
        self.corpus_count = document_no + 1
        self.raw_vocab = vocab
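This scan is normally triggered indirectly through `build_vocab`; a minimal usage sketch, assuming a recent gensim release, with made-up toy documents:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [
    TaggedDocument(words=["new", "york", "is", "big"], tags=["doc0"]),
    TaggedDocument(words=["machine", "learning", "in", "new", "york"], tags=["doc1"]),
]

model = Doc2Vec(vector_size=10, min_count=1)
model.build_vocab(documents)   # runs the vocabulary scan over words and tags
print(model.corpus_count)      # 2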
Example no. 5
    def learn_vocab(sentences, max_vocab_size, delimiter=b'_'):
        """Collect unigram/bigram counts from the `sentences` iterable."""
        sentence_no = -1
        total_words = 0
        logger.info("collecting all words and their counts")
        vocab = defaultdict(int)
        min_reduce = 1
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % 10000 == 0:
                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                            (sentence_no, total_words, len(vocab)))
            sentence = [utils.any2utf8(w) for w in sentence]
            for bigram in zip(sentence, sentence[1:]):
                vocab[bigram[0]] += 1
                vocab[delimiter.join(bigram)] += 1
                total_words += 1

            if sentence:  # add last word skipped by previous loop
                word = sentence[-1]
                vocab[word] += 1

            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
                    (len(vocab), total_words, sentence_no + 1))
        return min_reduce, vocab
Example no. 6
    def _learn_vocab(sentences, max_vocab_size, delimiter, connector_words, progress_per):
        """Collect unigram and bigram counts from the `sentences` iterable."""
        sentence_no, total_words, min_reduce = -1, 0, 1
        vocab = defaultdict(int)
        logger.info("collecting all words and their counts")
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % progress_per == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words and %i word types",
                    sentence_no, total_words, len(vocab),
                )
            start_token, in_between = None, []
            for word in sentence:
                if word not in connector_words:
                    vocab[word] += 1
                    if start_token is not None:
                        phrase_tokens = itertools.chain([start_token], in_between, [word])
                        vocab[delimiter.join(phrase_tokens)] += 1
                    start_token, in_between = word, []  # treat word as both end of a phrase AND beginning of another
                elif start_token is not None:
                    in_between.append(word)
                total_words += 1

            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info(
            "collected %i token types (unigram + bigrams) from a corpus of %i words and %i sentences",
            len(vocab), total_words, sentence_no + 1,
        )
        return min_reduce, vocab, total_words
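The connector-word handling above lets a phrase span stopwords such as "of"; the function below is a self-contained sketch of that mechanism, not the gensim implementation itself.

import itertools
from collections import defaultdict

def count_with_connectors(sentence, connector_words, delimiter="_"):
    # Count unigrams for non-connector tokens, plus phrases whose endpoints are
    # non-connector tokens and whose middle may be a run of connector words,
    # mirroring the loop in _learn_vocab above.
    vocab = defaultdict(int)
    start_token, in_between = None, []
    for word in sentence:
        if word not in connector_words:
            vocab[word] += 1
            if start_token is not None:
                phrase = delimiter.join(itertools.chain([start_token], in_between, [word]))
                vocab[phrase] += 1
            start_token, in_between = word, []
        elif start_token is not None:
            in_between.append(word)
    return dict(vocab)

print(count_with_connectors(["bank", "of", "america"], connector_words={"of", "the"}))
# {'bank': 1, 'america': 1, 'bank_of_america': 1}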
Example no. 7
    def add_vocab(self, sentences):
        """Update model parameters with new `sentences`.

        Parameters
        ----------
        sentences : iterable of list of str
            Text corpus to update this model's parameters from.

        Example
        -------
        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>> from gensim.models.word2vec import Text8Corpus
            >>> from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
            >>>
            >>> # Train a phrase detector from a text corpus.
            >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
            >>> phrases = Phrases(sentences, connector_words=ENGLISH_CONNECTOR_WORDS)  # train model
            >>> assert len(phrases.vocab) == 37
            >>>
            >>> more_sentences = [
            ...     [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'],
            ...     [u'machine', u'learning', u'can', u'be', u'new', u'york', u'sometimes'],
            ... ]
            >>>
            >>> phrases.add_vocab(more_sentences)  # add new sentences to model
            >>> assert len(phrases.vocab) == 60

        """
        # Uses a separate vocab to collect the token counts from `sentences`.
        # This consumes more RAM than merging new sentences into `self.vocab`
        # directly, but gives the new sentences a fighting chance to collect
        # sufficient counts, before being pruned out by the (large) accumulated
        # counts collected in previous learn_vocab runs.
        min_reduce, vocab, total_words = self._learn_vocab(
            sentences,
            max_vocab_size=self.max_vocab_size,
            delimiter=self.delimiter,
            progress_per=self.progress_per,
            connector_words=self.connector_words,
        )

        self.corpus_word_count += total_words
        if self.vocab:
            logger.info("merging %i counts into %s", len(vocab), self)
            self.min_reduce = max(self.min_reduce, min_reduce)
            for word, count in vocab.items():
                self.vocab[word] = self.vocab.get(word, 0) + count
            if len(self.vocab) > self.max_vocab_size:
                utils.prune_vocab(self.vocab, self.min_reduce)
                self.min_reduce += 1
        else:
            # Optimization for a common case: the current vocab is empty, so apply
            # the new vocab directly, no need to double it in memory.
            self.vocab = vocab
        logger.info("merged %s", self)
Example no. 8
    def scan_vocab(self,
                   documents,
                   progress_per=10000,
                   trim_rule=None,
                   update=False):
        logger.info("collecting all words and their counts")
        document_no = -1
        total_words = 0
        min_reduce = 1
        interval_start = default_timer() - 0.00001  # guard against next sample being identical
        interval_count = 0
        checked_string_types = 0
        vocab = defaultdict(int)
        for document_no, document in enumerate(documents):
            if not checked_string_types:
                if isinstance(document.words, string_types):
                    logger.warning(
                        "Each 'words' should be a list of words (usually unicode strings). "
                        "First 'words' here is instead plain %s.",
                        type(document.words))
                checked_string_types += 1
            if document_no % progress_per == 0:
                interval_rate = (total_words - interval_count) / (
                    default_timer() - interval_start)
                logger.info(
                    "PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags, %i labels",
                    document_no, total_words, interval_rate, len(vocab),
                    len(self.docvecs), len(self.labelvecs))
                interval_start = default_timer()
                interval_count = total_words
            document_length = len(document.words)

            for tag in document.tags:
                self.docvecs.note_doctag(tag, document_no, document_length)
            for label in document.labels:
                self.labelvecs.note_doctag(label, document_no, document_length)

            for word in document.words:
                vocab[word] += 1
            total_words += len(document.words)

            if self.max_vocab_size and len(vocab) > self.max_vocab_size:
                utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
                min_reduce += 1

        logger.info(
            "collected %i word types and %i unique tags and %i unique labels from a corpus of %i examples and %i words",
            len(vocab), len(self.docvecs), len(self.labelvecs),
            document_no + 1, total_words)
        self.corpus_count = document_no + 1
        self.raw_vocab = vocab
Example no. 9
    def add_vocab(self, sentences):
        """Update model with new `sentences`.

        Parameters
        ----------
        sentences : iterable of list of str
            Text corpus.

        Example
        -------
        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>> from gensim.models.word2vec import Text8Corpus
            >>> from gensim.models.phrases import Phrases
            >>> # Create corpus and use it for phrase detector
            >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
            >>> phrases = Phrases(sentences)  # train model
            >>> assert len(phrases.vocab) == 37
            >>>
            >>> more_sentences = [
            ...     [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'],
            ...     [u'machine', u'learning', u'can', u'be', u'new', u'york', u'sometimes']
            ... ]
            >>>
            >>> phrases.add_vocab(more_sentences)  # add new sentences to model
            >>> assert len(phrases.vocab) == 60

        """
        # uses a separate vocab to collect the token counts from `sentences`.
        # this consumes more RAM than merging new sentences into `self.vocab`
        # directly, but gives the new sentences a fighting chance to collect
        # sufficient counts, before being pruned out by the (large) accumulated
        # counts collected in previous learn_vocab runs.
        min_reduce, vocab, total_words = self.learn_vocab(
            sentences, self.max_vocab_size, self.delimiter, self.progress_per,
            self.common_terms)

        self.corpus_word_count += total_words
        if len(self.vocab) > 0:
            logger.info("merging %i counts into %s", len(vocab), self)
            self.min_reduce = max(self.min_reduce, min_reduce)
            for word, count in iteritems(vocab):
                self.vocab[word] += count
            if len(self.vocab) > self.max_vocab_size:
                utils.prune_vocab(self.vocab, self.min_reduce)
                self.min_reduce += 1
            logger.info("merged %s", self)
        else:
            # in common case, avoid doubling gigantic dict
            logger.info("using %i counts as vocab in %s", len(vocab), self)
            self.vocab = vocab
Example no. 10
    def add_vocab(self, sentences):
        """Update model with new `sentences`.

        Parameters
        ----------
        sentences : iterable of list of str
            Text corpus.

        Example
        -------
        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>> from gensim.models.word2vec import Text8Corpus
            >>> from gensim.models.phrases import Phrases
            >>> # Create corpus and use it for phrase detector
            >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
            >>> phrases = Phrases(sentences)  # train model
            >>> assert len(phrases.vocab) == 37
            >>>
            >>> more_sentences = [
            ...     [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'],
            ...     [u'machine', u'learning', u'can', u'be', u'new', u'york', u'sometimes']
            ... ]
            >>>
            >>> phrases.add_vocab(more_sentences)  # add new sentences to model
            >>> assert len(phrases.vocab) == 60

        """
        # uses a separate vocab to collect the token counts from `sentences`.
        # this consumes more RAM than merging new sentences into `self.vocab`
        # directly, but gives the new sentences a fighting chance to collect
        # sufficient counts, before being pruned out by the (large) accumulated
        # counts collected in previous learn_vocab runs.
        min_reduce, vocab, total_words = self.learn_vocab(
            sentences, self.max_vocab_size, self.delimiter, self.progress_per, self.common_terms)

        self.corpus_word_count += total_words
        if len(self.vocab) > 0:
            logger.info("merging %i counts into %s", len(vocab), self)
            self.min_reduce = max(self.min_reduce, min_reduce)
            for word, count in iteritems(vocab):
                self.vocab[word] += count
            if len(self.vocab) > self.max_vocab_size:
                utils.prune_vocab(self.vocab, self.min_reduce)
                self.min_reduce += 1
            logger.info("merged %s", self)
        else:
            # in common case, avoid doubling gigantic dict
            logger.info("using %i counts as vocab in %s", len(vocab), self)
            self.vocab = vocab
Example no. 11
    def scan_vocab(self, sentences, progress_per=10000):
        """Do an initial scan of all words appearing in sentences."""
        logger.info("collecting all words and their counts")
        sentence_no = -1
        total_words = 0
        min_reduce = 1
        vocab = defaultdict(int)
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % progress_per == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
                    sentence_no,
                    sum(itervalues(vocab)) + total_words,
                    len(vocab),
                )
            # TODO: this is a change so that every word has an associated topic
            for word, topic in sentence:  # CHANGE
                vocab[word] += 1

            if self.max_vocab_size and len(vocab) > self.max_vocab_size:
                total_words += utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        total_words += sum(itervalues(vocab))
        logger.info(
            "collected %i word types from a corpus of %i raw words and %i sentences",
            len(vocab),
            total_words,
            sentence_no + 1,
        )
        self.corpus_count = sentence_no + 1
        self.raw_vocab = vocab
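Given the modified loop above, each sentence is expected to be a list of (word, topic) pairs rather than plain tokens; a hedged sketch of such input, with topic ids invented purely for illustration:

from collections import defaultdict

sentences = [
    [("new", 3), ("york", 3), ("city", 7)],
    [("machine", 1), ("learning", 1), ("new", 3), ("york", 3)],
]

vocab = defaultdict(int)
for sentence in sentences:
    for word, topic in sentence:  # the topic id rides along with each token
        vocab[word] += 1          # only the word part is counted
print(dict(vocab))  # {'new': 2, 'york': 2, 'city': 1, 'machine': 1, 'learning': 1}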
Example no. 12
    def scan_vocab(self, sentences, progress_per=10000):
        """Do an initial scan of all words appearing in sentences."""
        logger.info("collecting all words and their counts")
        sentence_no = -1
        total_words = 0
        min_reduce = 1
        vocab = defaultdict(int)
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % progress_per == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
                    sentence_no,
                    sum(itervalues(vocab)) + total_words, len(vocab))
            for word in sentence:
                vocab[word] += 1

            if self.max_vocab_size and len(vocab) > self.max_vocab_size:
                total_words += utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        total_words += sum(itervalues(vocab))
        logger.info(
            "collected %i word types from a corpus of %i raw words and %i sentences",
            len(vocab), total_words, sentence_no + 1)
        self.corpus_count = sentence_no + 1
        self.raw_vocab = vocab
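The `min_reduce` escalation above prunes ever more aggressively as the vocabulary outgrows `max_vocab_size`; the sketch below uses a simplified stand-in for `utils.prune_vocab` to show the effect (not the gensim helper itself).

from collections import defaultdict

def prune_vocab(vocab, min_reduce):
    # Drop every entry whose count is below min_reduce and return how many
    # raw word occurrences were discarded (gensim's helper behaves similarly).
    dropped = 0
    for word in list(vocab):
        if vocab[word] < min_reduce:
            dropped += vocab[word]
            del vocab[word]
    return dropped

vocab = defaultdict(int, {"the": 5, "york": 2, "rare": 1})
max_vocab_size, min_reduce, total_dropped = 2, 1, 0
while len(vocab) > max_vocab_size:
    total_dropped += prune_vocab(vocab, min_reduce)
    min_reduce += 1  # each pruning pass raises the bar for survival
print(dict(vocab), total_dropped)  # {'the': 5, 'york': 2} 1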
Example no. 13
    def scan_vocab(self, documents, docvecs, progress_per=10000, trim_rule=None):
        logger.info("collecting all words and their counts")
        document_no = -1
        total_words = 0
        min_reduce = 1
        interval_start = default_timer() - 0.00001  # guard against next sample being identical
        interval_count = 0
        checked_string_types = 0
        vocab = defaultdict(int)
        for document_no, document in enumerate(documents):
            if not checked_string_types:
                if isinstance(document.words, string_types):
                    logger.warning(
                        "Each 'words' should be a list of words (usually unicode strings). "
                        "First 'words' here is instead plain %s.",
                        type(document.words)
                    )
                checked_string_types += 1
            if document_no % progress_per == 0:
                interval_rate = (total_words - interval_count) / (default_timer() - interval_start)
                logger.info(
                    "PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags",
                    document_no, total_words, interval_rate, len(vocab), docvecs.count
                )
                interval_start = default_timer()
                interval_count = total_words
            document_length = len(document.words)

            for tag in document.tags:
                self.note_doctag(tag, document_no, document_length, docvecs)

            for word in document.words:
                vocab[word] += 1
            total_words += len(document.words)

            if self.max_vocab_size and len(vocab) > self.max_vocab_size:
                utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
                min_reduce += 1

        logger.info(
            "collected %i word types and %i unique tags from a corpus of %i examples and %i words",
            len(vocab), docvecs.count, document_no + 1, total_words
        )
        corpus_count = document_no + 1
        self.raw_vocab = vocab
        return total_words, corpus_count
Example no. 14
    def learn_vocab(sentences,
                    max_vocab_size,
                    delimiter=b'_',
                    progress_per=10000,
                    common_terms=frozenset()):
        """Collect unigram/bigram counts from the `sentences` iterable."""
        sentence_no = -1
        total_words = 0
        logger.info("collecting all words and their counts")
        vocab = defaultdict(int)
        min_reduce = 1
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % progress_per == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words and %i word types",
                    sentence_no,
                    total_words,
                    len(vocab),
                )
            s = [utils.any2utf8(w) for w in sentence]
            last_uncommon = None
            in_between = []
            for word in s:
                if word not in common_terms:
                    vocab[word] += 1
                    if last_uncommon is not None:
                        components = it.chain([last_uncommon], in_between,
                                              [word])
                        vocab[delimiter.join(components)] += 1
                    last_uncommon = word
                    in_between = []
                elif last_uncommon is not None:
                    in_between.append(word)
                total_words += 1

            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info(
            "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
            len(vocab), total_words, sentence_no + 1)
        return min_reduce, vocab, total_words
Example no. 15
    def add_vocab(self, sentences):
        """
        Merge the collected counts `vocab` into this phrase detector.

        """
        # uses a separate vocab to collect the token counts from `sentences`.
        # this consumes more RAM than merging new sentences into `self.vocab`
        # directly, but gives the new sentences a fighting chance to collect
        # sufficient counts, before being pruned out by the (large) accumulated
        # counts collected in previous learn_vocab runs.
        min_reduce, vocab = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter)

        logger.info("merging %i counts into %s", len(vocab), self)
        self.min_reduce = max(self.min_reduce, min_reduce)
        for word, count in iteritems(vocab):
            self.vocab[word] += count
        if len(self.vocab) > self.max_vocab_size:
            utils.prune_vocab(self.vocab, self.min_reduce)
            self.min_reduce += 1

        logger.info("merged %s", self)
Example no. 16
    def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000,
                    common_terms=frozenset()):
        """Collect unigram/bigram counts from the `sentences` iterable."""
        sentence_no = -1
        total_words = 0
        logger.info("collecting all words and their counts")
        vocab = defaultdict(int)
        min_reduce = 1
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % progress_per == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words and %i word types",
                    sentence_no, total_words, len(vocab),
                )
            s = [utils.any2utf8(w) for w in sentence]
            last_uncommon = None
            in_between = []
            for word in s:
                if word not in common_terms:
                    vocab[word] += 1
                    if last_uncommon is not None:
                        components = it.chain([last_uncommon], in_between, [word])
                        vocab[delimiter.join(components)] += 1
                    last_uncommon = word
                    in_between = []
                elif last_uncommon is not None:
                    in_between.append(word)
                total_words += 1

            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info(
            "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
            len(vocab), total_words, sentence_no + 1
        )
        return min_reduce, vocab, total_words
Example no. 17
  def scan_vocab(self, train_pairs, progress_per=100):
    """Do an initial scan of all words appearing in sentences."""
    logger.info("collecting all words and their counts")
    train_pair_no = -1

    total_ptrees = 0
    min_reduce = 1

    vocab_l_raw = defaultdict(int)
    vocab_i = set()
    vocab_k = set()
    vocab_r = defaultdict(lambda: defaultdict(int))

    for train_pair_no, (utter, myast) in enumerate(train_pairs):
      if train_pair_no % progress_per == 0:
        logger.info("PROGRESS: at train_pair #%i. Processed %i partial trees.",
                    train_pair_no, total_ptrees)

      l_tokens = utter.split()
      for w in l_tokens:
        vocab_l_raw[w] += 1
      if self.max_vocab_size and len(vocab_l_raw) > self.max_vocab_size:
        # Pruning only applies to NL queries
        gsutils.prune_vocab(vocab_l_raw, min_reduce)
        min_reduce += 1

      for ptree in myast.randomized_partial_trees(
            self.random, self.memsize_k, self.memsize_i):
        terminals, _, parent, children = ptree
        terminals = map(lambda x: x.getSimple(), terminals)
        parent = parent.getSimple()
        if isinstance(children, (list, tuple)):
          children = tuple(map(lambda x: x.getSimple(), children))
        else:
          children = children.getSimple()

        vocab_k |= set(terminals)
        vocab_i.add(parent)
        vocab_r[parent][children] += 1

        total_ptrees += 1

    self.corpus_count = train_pair_no + 1  # this is needed somewhere else

    self.raw_vocab_l = vocab_l_raw

    self.index2word_i = sorted(vocab_i)
    self.index2word_i.insert(0, None)  # 0th position is for the padding token
    self.vocab_i = dict((y,x) for (x,y) in enumerate(self.index2word_i))

    self.index2word_k = sorted(vocab_k)
    self.index2word_k.insert(0, None)  # 0th position is for the padding token
    self.vocab_k = dict((y,x) for (x,y) in enumerate(self.index2word_k))

    self.vocab_r = {}
    self.index2word_r = {}
    self.r_idx_offset = {}
    self.total_r_entries = 0
    for parent in sorted(vocab_r.keys()):
      self.r_idx_offset[parent] = self.total_r_entries
      self.vocab_r[parent] = {}
      self.index2word_r[parent] = []

      # vocab_r[parent] may contain a mixture of SimpleAstNodes and tuples. To
      # sort them properly, we need to sort them separately.
      single_childrens = sorted(x for x in vocab_r[parent].keys()
                                if isinstance(x, SimpleAstNode))
      tuple_childrens = sorted(x for x in vocab_r[parent].keys()
                               if isinstance(x, tuple))
      combined_sorted_children = single_childrens + tuple_childrens
      assert len(combined_sorted_children) == len(vocab_r[parent])
      for children in combined_sorted_children:
        self.vocab_r[parent][children] = Vocab(
          count=vocab_r[parent][children],
          index=len(self.index2word_r[parent]))
        self.index2word_r[parent].append(children)
      self.total_r_entries += len(self.index2word_r[parent])

    """
    After this point, vocab_i and vocab_k are complete.

    The only things left are:

    - scale and finalize vocab_r (because of min_count, downsampling, sorting,
      etc.).
    - build a cum_table for negative sampling of vocab_r.
    """

    logger.info("Collected %i NL word types", len(vocab_l_raw))
    logger.info("Collected %i terminal types", len(self.vocab_k))
    logger.info("Collected %i non-terminal types", len(self.vocab_i))
    logger.info("Created %i production tables", len(self.vocab_r))
    logger.info("Collected %i total production entries", self.total_r_entries)
Example no. 18
    def learn_vocab(sentences,
                    max_vocab_size,
                    delimiter=b'_',
                    progress_per=10000,
                    common_terms=frozenset(),
                    doc2vec=False):
        """Collect unigram/bigram counts from the `sentences` iterable.
        Parameters
        ----------
        sentences : iterable of list of str
            The `sentences` iterable can be simply a list, but for larger corpora, consider a generator that streams
            the sentences directly from disk/network, See :class:`~gensim.models.word2vec.BrownCorpus`,
            :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence`
            for such examples.
        max_vocab_size : int
            Maximum size (number of tokens) of the vocabulary. Used to control pruning of less common words,
            to keep memory under control. The default of 40M needs about 3.6GB of RAM. Increase/decrease
            `max_vocab_size` depending on how much available memory you have.
        delimiter : str, optional
            Glue character used to join collocation tokens, should be a byte string (e.g. b'_').
        progress_per : int
            Log progress once every `progress_per` sentences.
        common_terms : set of str, optional
            Set of "stop words" that won't affect the frequency count of expressions containing them.
            Allows detection of expressions like "bank_of_america" or "eye_of_the_beholder".
        doc2vec : bool, optional
            If True, each element of `sentences` is expected to expose a `words` attribute
            (a document object), which is used as the token list.

        Returns
        -------
        (int, dict of (str, int), int)
            Number of pruned words, counters for each word/bi-gram and total number of words.

        Example
        -------

        >>> from gensim.test.utils import datapath
        >>> from gensim.models.word2vec import Text8Corpus
        >>> from gensim.models.phrases import Phrases
        >>>
        >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
        >>> pruned_words, counters, total_words = Phrases.learn_vocab(sentences, 100)
        >>> (pruned_words, total_words)
        (1, 29)
        >>> counters['computer']
        2
        >>> counters['response_time']
        1
        """
        sentence_no = -1
        total_words = 0
        logger.info("collecting all words and their counts")
        vocab = defaultdict(int)
        min_reduce = 1
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % progress_per == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words and %i word types",
                    sentence_no,
                    total_words,
                    len(vocab),
                )

            if doc2vec:
                sentence = sentence.words

            s = [utils.any2utf8(w) for w in sentence]
            last_uncommon = None
            in_between = []
            for word in s:
                if word not in common_terms:
                    vocab[word] += 1
                    if last_uncommon is not None:
                        components = it.chain([last_uncommon], in_between,
                                              [word])
                        vocab[delimiter.join(components)] += 1
                    last_uncommon = word
                    in_between = []
                elif last_uncommon is not None:
                    in_between.append(word)
                total_words += 1

            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info(
            "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
            len(vocab), total_words, sentence_no + 1)
        return min_reduce, vocab, total_words
Example no. 19
    def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000,
                    common_terms=frozenset()):
        """Collect unigram/bigram counts from the `sentences` iterable.

        Parameters
        ----------
        sentences : iterable of list of str
            The `sentences` iterable can be simply a list, but for larger corpora, consider a generator that streams
            the sentences directly from disk/network, See :class:`~gensim.models.word2vec.BrownCorpus`,
            :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence`
            for such examples.
        max_vocab_size : int
            Maximum size (number of tokens) of the vocabulary. Used to control pruning of less common words,
            to keep memory under control. The default of 40M needs about 3.6GB of RAM. Increase/decrease
            `max_vocab_size` depending on how much available memory you have.
        delimiter : str, optional
            Glue character used to join collocation tokens, should be a byte string (e.g. b'_').
        progress_per : int
            Log progress once every `progress_per` sentences.
        common_terms : set of str, optional
            Set of "stop words" that won't affect the frequency count of expressions containing them.
            Allows detection of expressions like "bank_of_america" or "eye_of_the_beholder".

        Returns
        -------
        (int, dict of (str, int), int)
            Number of pruned words, counters for each word/bi-gram and total number of words.

        Example
        -------
        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>> from gensim.models.word2vec import Text8Corpus
            >>> from gensim.models.phrases import Phrases
            >>>
            >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
            >>> pruned_words, counters, total_words = Phrases.learn_vocab(sentences, 100)
            >>> (pruned_words, total_words)
            (1, 29)
            >>> counters['computer']
            2
            >>> counters['response_time']
            1

        """
        sentence_no = -1
        total_words = 0
        logger.info("collecting all words and their counts")
        vocab = defaultdict(int)
        min_reduce = 1
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % progress_per == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words and %i word types",
                    sentence_no, total_words, len(vocab),
                )
            s = [utils.any2utf8(w) for w in sentence]
            last_uncommon = None
            in_between = []
            for word in s:
                if word not in common_terms:
                    vocab[word] += 1
                    if last_uncommon is not None:
                        components = it.chain([last_uncommon], in_between, [word])
                        vocab[delimiter.join(components)] += 1
                    last_uncommon = word
                    in_between = []
                elif last_uncommon is not None:
                    in_between.append(word)
                total_words += 1

            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info(
            "collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
            len(vocab), total_words, sentence_no + 1
        )
        return min_reduce, vocab, total_words
Example no. 20
    def scan_vocab(self, train_pairs, progress_per=100):
        """Do an initial scan of all words appearing in sentences."""
        logger.info("collecting all words and their counts")
        train_pair_no = -1

        total_ptrees = 0
        min_reduce = 1

        vocab_l_raw = defaultdict(int)
        vocab_i = set()
        vocab_k = set()
        vocab_r = defaultdict(lambda: defaultdict(int))

        for train_pair_no, (utter, myast) in enumerate(train_pairs):
            if train_pair_no % progress_per == 0:
                logger.info(
                    "PROGRESS: at train_pair #%i. Processed %i partial trees.",
                    train_pair_no, total_ptrees)

            l_tokens = utter.split()
            for w in l_tokens:
                vocab_l_raw[w] += 1
            if self.max_vocab_size and len(vocab_l_raw) > self.max_vocab_size:
                # Pruning only applies to NL queries
                gsutils.prune_vocab(vocab_l_raw, min_reduce)
                min_reduce += 1

            for ptree in myast.randomized_partial_trees(
                    self.random, self.memsize_k, self.memsize_i):
                terminals, _, parent, children = ptree
                terminals = map(lambda x: x.getSimple(), terminals)
                parent = parent.getSimple()
                if isinstance(children, (list, tuple)):
                    children = tuple(map(lambda x: x.getSimple(), children))
                else:
                    children = children.getSimple()

                vocab_k |= set(terminals)
                vocab_i.add(parent)
                vocab_r[parent][children] += 1

                total_ptrees += 1

        self.corpus_count = train_pair_no + 1  # this is needed somewhere else

        self.raw_vocab_l = vocab_l_raw

        self.index2word_i = sorted(vocab_i)
        self.index2word_i.insert(0, None)  # 0th position is for the padding token
        self.vocab_i = dict((y, x) for (x, y) in enumerate(self.index2word_i))

        self.index2word_k = sorted(vocab_k)
        self.index2word_k.insert(0, None)  # 0th position is for the padding token
        self.vocab_k = dict((y, x) for (x, y) in enumerate(self.index2word_k))

        self.vocab_r = {}
        self.index2word_r = {}
        self.r_idx_offset = {}
        self.total_r_entries = 0
        for parent in sorted(vocab_r.keys()):
            self.r_idx_offset[parent] = self.total_r_entries
            self.vocab_r[parent] = {}
            self.index2word_r[parent] = []

            # vocab_r[parent] may contain a mixture of SimpleAstNodes and tuples. To
            # sort them properly, we need to sort them separately.
            single_childrens = sorted(x for x in vocab_r[parent].keys()
                                      if isinstance(x, SimpleAstNode))
            tuple_childrens = sorted(x for x in vocab_r[parent].keys()
                                     if isinstance(x, tuple))
            combined_sorted_children = single_childrens + tuple_childrens
            assert len(combined_sorted_children) == len(vocab_r[parent])
            for children in combined_sorted_children:
                self.vocab_r[parent][children] = Vocab(
                    count=vocab_r[parent][children],
                    index=len(self.index2word_r[parent]))
                self.index2word_r[parent].append(children)
            self.total_r_entries += len(self.index2word_r[parent])
        """
    After this point, vocab_i and vocab_k are complete.

    The only things left are:

    - scale and finalize vocab_r (because of min_count, downsampling, sorting,
      etc.).
    - build a cum_table for negative sampling of vocab_r.
    """

        logger.info("Collected %i NL word types", len(vocab_l_raw))
        logger.info("Collected %i terminal types", len(self.vocab_k))
        logger.info("Collected %i non-terminal types", len(self.vocab_i))
        logger.info("Created %i production tables", len(self.vocab_r))
        logger.info("Collected %i total production entries",
                    self.total_r_entries)