Esempio n. 1
0
    def build_vocab(self, corpus):
        """
        Build vocabulary from a sequence of sentences or from a frequency dictionary, if one was provided.

        When `self.vocabulary_counts` is set it is iterated as a word -> count
        mapping and `corpus` is not used; otherwise the work is delegated to
        the parent class' `build_vocab(corpus)` and nothing else is done here.

        On the frequency-map path `self.vocab` and `self.index2word` are
        rebuilt from scratch, keeping only words whose count is at least
        `self.min_count`, and the training structures are then prepared.
        """
        # Identity test: `!= None` would dispatch to the object's own
        # comparison operators instead of simply checking for "not provided".
        if self.vocabulary_counts is None:
            logger.debug("default vocabulary building")
            super(Skipgram, self).build_vocab(corpus)
            return

        logger.debug("building vocabulary from provided frequency map")
        vocab = self.vocabulary_counts

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []

        # `items()` exists on both Python 2 and 3 dicts (`iteritems()` is
        # Python 2 only).
        for word, count in vocab.items():
            if count < self.min_count:
                # too rare: skip before allocating a Vocab entry
                continue
            v = Vocab()
            v.count = count
            # indices are handed out in insertion order: 0, 1, 2, ...
            v.index = len(self.vocab)
            self.index2word.append(word)
            self.vocab[word] = v

        # pass the values as logger arguments so formatting only happens
        # when debug logging is enabled
        logger.debug("total %i word types after removing those with count<%s",
                     len(self.vocab), self.min_count)

        if self.hs:
            # add info about each word's Huffman encoding
            self.create_binary_tree()
        if self.negative:
            # build the table for drawing random words (for negative sampling)
            self.make_table()
        # precalculate downsampling thresholds
        self.precalc_sampling()
        self.reset_weights()
Esempio n. 2
0
    def build_vocab(self, corpus):
        """
        Build vocabulary from a sequence of sentences or from a frequency dictionary, if one was provided.

        When `self.vocabulary_counts` is set it is iterated as a word -> count
        mapping and `corpus` is not used; otherwise the work is delegated to
        the parent class' `build_vocab(corpus)` and nothing else is done here.

        On the frequency-map path `self.vocab` and `self.index2word` are
        rebuilt from scratch, keeping only words whose count is at least
        `self.min_count`. The number of entries in the map is stored as
        `self.corpus_count`, the map itself as `self.raw_vocab`, and then
        `scale_vocab()` and `finalize_vocab()` are called.
        """
        # Identity test: `!= None` would dispatch to the object's own
        # comparison operators instead of simply checking for "not provided".
        if self.vocabulary_counts is None:
            # single-argument call form prints the same text on Python 2 and 3
            print("default vocabulary building")
            super(Skipgram, self).build_vocab(corpus)
            return

        print("building vocabulary from provided frequency map")
        vocab = self.vocabulary_counts

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []

        # `items()` exists on both Python 2 and 3 dicts (`iteritems()` is
        # Python 2 only).
        for word, count in vocab.items():
            if count < self.min_count:
                # too rare: skip before allocating a Vocab entry
                continue
            v = Vocab()
            v.count = count
            # indices are handed out in insertion order: 0, 1, 2, ...
            v.index = len(self.vocab)
            self.index2word.append(word)
            self.vocab[word] = v

        # note: this is the number of distinct words in the supplied map
        self.corpus_count = len(vocab)
        self.raw_vocab = vocab

        # pass the values as logger arguments so formatting only happens
        # when debug logging is enabled
        logger.debug("total %i word types after removing those with count<%s",
                     len(self.vocab), self.min_count)

        self.scale_vocab()
        self.finalize_vocab()
Esempio n. 3
0
    def build_vocab(self, corpus):
        """
        Build vocabulary from a sequence of sentences or from a frequency dictionary, if one was provided.

        If `self.vocabulary_counts` is None, the parent class'
        `build_vocab(corpus)` does all the work. Otherwise `corpus` is not
        used and `self.vocabulary_counts` is read as a word -> count mapping:

        * `self.vocab` / `self.index2word` are reset and refilled with the
          words whose count is at least `self.min_count`;
        * `self.corpus_count` is set to the number of entries in the mapping
          and `self.raw_vocab` to the mapping itself;
        * `scale_vocab()` and `finalize_vocab()` are called.
        """
        counts = self.vocabulary_counts
        # `is None` rather than `!= None`: an identity check cannot be
        # affected by comparison operators defined on the mapping.
        if counts is None:
            # print(...) with one string argument behaves identically on
            # Python 2 and Python 3.
            print("default vocabulary building")
            super(Skipgram, self).build_vocab(corpus)
            return

        print("building vocabulary from provided frequency map")

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []

        # `.items()` is available on Python 2 and 3; `.iteritems()` is not.
        for word, count in counts.items():
            if count >= self.min_count:
                entry = Vocab()
                entry.count = count
                # next free index == number of words kept so far
                entry.index = len(self.vocab)
                self.index2word.append(word)
                self.vocab[word] = entry

        # counts distinct words in the supplied mapping
        self.corpus_count = len(counts)
        self.raw_vocab = counts

        # lazy logger arguments: formatted only if debug logging is enabled
        logger.debug("total %i word types after removing those with count<%s",
                     len(self.vocab), self.min_count)

        self.scale_vocab()
        self.finalize_vocab()
Esempio n. 4
0
    def build_vocab(self, corpus):
        """
        Build vocabulary from a sequence of sentences or from a frequency dictionary, if one was provided.

        If `self.vocabulary_counts` is None, the parent class'
        `build_vocab(corpus)` does all the work. Otherwise `corpus` is not
        used and `self.vocabulary_counts` is read as a word -> count mapping:

        * `self.vocab` / `self.index2word` are reset and refilled with the
          words whose count is at least `self.min_count`;
        * `create_binary_tree()` is called when `self.hs` is truthy and
          `make_table()` when `self.negative` is truthy;
        * `precalc_sampling()` and `reset_weights()` are always called.
        """
        counts = self.vocabulary_counts
        # `is None` rather than `!= None`: an identity check cannot be
        # affected by comparison operators defined on the mapping.
        if counts is None:
            logger.debug("default vocabulary building")
            super(Skipgram, self).build_vocab(corpus)
            return

        logger.debug("building vocabulary from provided frequency map")

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []

        # `.items()` is available on Python 2 and 3; `.iteritems()` is not.
        for word, count in counts.items():
            if count >= self.min_count:
                entry = Vocab()
                entry.count = count
                # next free index == number of words kept so far
                entry.index = len(self.vocab)
                self.index2word.append(word)
                self.vocab[word] = entry

        # lazy logger arguments: formatted only if debug logging is enabled
        logger.debug("total %i word types after removing those with count<%s",
                     len(self.vocab), self.min_count)

        if self.hs:
            # add info about each word's Huffman encoding
            self.create_binary_tree()
        if self.negative:
            # build the table for drawing random words (for negative sampling)
            self.make_table()
        # precalculate downsampling thresholds
        self.precalc_sampling()
        self.reset_weights()