Example 1
    def build_vocab(self, corpus):
        """
        Build vocabulary from a sequence of sentences or from a frequency dictionary, if one was provided.
        """
        if self.vocabulary_counts is not None:
            logger.debug("building vocabulary from provided frequency map")
            vocab = self.vocabulary_counts
        else:
            logger.debug("default vocabulary building")
            super(Skipgram, self).build_vocab(corpus)
            return

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []

        for word, count in vocab.iteritems():
            v = Vocab()
            v.count = count
            if v.count >= self.min_count:
                v.index = len(self.vocab)
                self.index2word.append(word)
                self.vocab[word] = v

        logger.debug("total %i word types after removing those with count<%s" %
                     (len(self.vocab), self.min_count))

        if self.hs:
            # add info about each word's Huffman encoding
            self.create_binary_tree()
        if self.negative:
            # build the table for drawing random words (for negative sampling)
            self.make_table()
        # precalculate downsampling thresholds
        self.precalc_sampling()
        self.reset_weights()
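
This variant skips the usual corpus scan whenever a frequency map was supplied up front. A minimal usage sketch, assuming the deepwalk-style Skipgram subclass shown above, which forwards its remaining keyword arguments to gensim's Word2Vec; the walks and counts below are made up:

    from collections import Counter

    # hypothetical corpus: each "sentence" is a random walk over graph nodes
    walks = [["a", "b", "c"], ["b", "c", "d"], ["a", "c", "d"]]

    # count tokens once, e.g. while the walks are being generated
    counts = Counter(token for walk in walks for token in walk)

    # passing vocabulary_counts makes build_vocab() use the map directly
    # instead of scanning the corpus a second time
    model = Skipgram(sentences=walks, vocabulary_counts=dict(counts),
                     size=64, window=5, min_count=1)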
Example 2
    def build_vocab(self, corpus):
        """
        Build vocabulary from a sequence of sentences or from a frequency dictionary, if one was provided.
        """
        if self.vocabulary_counts is not None:
            print "building vocabulary from provided frequency map"
            vocab = self.vocabulary_counts
        else:
            print "default vocabulary building"
            super(Skipgram, self).build_vocab(corpus)
            return

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []

        for word, count in vocab.iteritems():
            v = Vocab()
            v.count = count
            if v.count >= self.min_count:
                v.index = len(self.vocab)
                self.index2word.append(word)
                self.vocab[word] = v

        self.corpus_count = len(vocab)
        self.raw_vocab = vocab

        logger.debug("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count))

        self.scale_vocab()
        self.finalize_vocab()
Example 3
    def build_vocab(self, corpus):
        """
        Build vocabulary from a sequence of sentences or from a frequency dictionary, if one was provided.
        """
        if self.vocabulary_counts is not None:
            print "building vocabulary from provided frequency map"
            vocab = self.vocabulary_counts
        else:
            print "default vocabulary building"
            super(Skipgram, self).build_vocab(corpus)
            return

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []

        for word, count in vocab.iteritems():
            v = Vocab()
            v.count = count
            if v.count >= self.min_count:
                v.index = len(self.vocab)
                self.index2word.append(word)
                self.vocab[word] = v

        self.corpus_count = len(vocab)
        self.raw_vocab = vocab

        logger.debug("total %i word types after removing those with count<%s" %
                     (len(self.vocab), self.min_count))

        self.scale_vocab()
        self.finalize_vocab()
Example 4
    def build_vocab(self, corpus):
        """
        Build vocabulary from a sequence of sentences or from a frequency dictionary, if one was provided.
        """
        if self.vocabulary_counts is not None:
            logger.debug("building vocabulary from provided frequency map")
            vocab = self.vocabulary_counts
        else:
            logger.debug("default vocabulary building")
            super(Skipgram, self).build_vocab(corpus)
            return

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []

        for word, count in vocab.iteritems():
            v = Vocab()
            v.count = count
            if v.count >= self.min_count:
                v.index = len(self.vocab)
                self.index2word.append(word)
                self.vocab[word] = v

        logger.debug("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count))

        if self.hs:
            # add info about each word's Huffman encoding
            self.create_binary_tree()
        if self.negative:
            # build the table for drawing random words (for negative sampling)
            self.make_table()
        # precalculate downsampling thresholds
        self.precalc_sampling()
        self.reset_weights()
Example 5
def add_new_labels(sentences, model):
    """
    Add new labels (for new docs) to the doc2vec model's `self.vocab`.

    from: <https://gist.github.com/zseder/4201551d7f8608f0b82b>
    """
    sentence_no = -1
    total_words = 0
    vocab = model.vocab
    #model_sentence_n = len([l for l in vocab if l.startswith("SENT")])
    model_sentence_n = max(int(l.split('_')[-1]) for l in vocab if l.startswith("SENT"))
    n_sentences = 0
    for sentence_no, sentence in enumerate(sentences):
        sentence_length = len(sentence.words)
        for label in sentence.labels:
            label_e = label.split("_")
            label_n = int(label_e[1]) + model_sentence_n
            label = "{0}_{1}".format(label_e[0], label_n)
            total_words += 1
            if label in vocab:
                vocab[label].count += sentence_length
            else:
                vocab[label] = Vocab(count=sentence_length)
                vocab[label].index = len(model.vocab) - 1
                vocab[label].code = [0]
                vocab[label].sample_probability = 1.
                model.index2word.append(label)
                n_sentences += 1
    return n_sentences
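
The helper assumes labels of the form SENT_<n>, which is what the sentence.labels access implies about the old gensim LabeledSentence API. A hedged usage sketch; the documents are placeholders, and model is an already-trained Doc2Vec whose vocabulary contains at least one SENT_<n> label (otherwise the max() above raises ValueError):

    from gensim.models.doc2vec import LabeledSentence

    new_docs = [
        LabeledSentence(words=["new", "document", "text"], labels=["SENT_0"]),
        LabeledSentence(words=["another", "unseen", "doc"], labels=["SENT_1"]),
    ]

    # relabels SENT_0, SENT_1, ... past the model's current highest index
    n_added = add_new_labels(new_docs, model)
    print(n_added)  # number of fresh labels appended to model.vocab

    # note: the model's weight matrices are not resized here; rows for the
    # new labels must be added separately before (re)training, as in the gist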
Example 6
    def build_vocab(self, sentences):

        logger.info("collecting all words and their counts")
        vocab = self._vocab_from_new(sentences)

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []

        for meta_word in [
                self.label0_as_vocab, self.label1_as_vocab,
                self.unknown_as_vocab
        ]:
            v = Vocab(count=1)
            v.index = len(self.vocab)
            v.sample_probability = 1.0
            self.index2word.append(meta_word)
            self.vocab[meta_word] = v

        # remove words with count < min_count (gensim's default min_count is 5;
        # Seger changes it to 1, so in practice no words are removed), then
        # build the self.vocab word->Vocab dict and assign each word a unique index
        for subgram, v in iteritems(vocab):
            if v.count >= self.min_count:
                v.sample_probability = 1.0
                v.index = len(self.vocab)
                self.index2word.append(subgram)
                self.vocab[subgram] = v
        logger.info("total %i word types after removing those with count<%s" %
                    (len(self.vocab), self.min_count))
        logger.info('reset weights')

        if self.hybrid_pred:
            # v is word
            # get single character word frequency
            freq_list = [
                self.vocab[v].count for v in self.vocab if len(v) == 1
            ]
            freq_list.sort(reverse=True)
            self.hybrid_threshold = freq_list[len(freq_list) // 25]
            print '>frequency threshold for hybrid prediction is:', self.hybrid_threshold

        self.reset_weights()
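
The hybrid_pred branch sets the cutoff at index len(freq_list) // 25 of the descending sort, i.e. roughly the 96th percentile of single-character word counts. A standalone sketch of the same computation with made-up counts:

    # counts of single-character words, sorted most frequent first
    freq_list = sorted([900, 40, 7, 300, 85, 12, 150, 60, 3, 25,
                        500, 33, 9, 70, 110, 18, 45, 5, 210, 90,
                        14, 6, 130, 55, 22], reverse=True)

    # one 25th of the way into the list: only the top ~4% of word types
    # sit above this count
    threshold = freq_list[len(freq_list) // 25]
    print(threshold)  # -> 500 for these 25 counts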
Example 7
def create_corpus_from_matlab(word_embedding, index2word):
    model = Word2VecExtended()
    # use the MATLAB-exported matrix directly as the embedding weights
    model.syn0 = word_embedding.astype(theano.config.floatX).copy()
    model.index2word = index2word
    # reserve slot 0 for the unknown-word token
    model.index2word[0] = UnknownWord
    vocab = {}

    # rebuild the word -> Vocab mapping; true counts are unknown, so use 1
    for word in model.index2word:
        v = Vocab(count=1)
        v.index = len(vocab)
        vocab[word] = v

    model.vocab = vocab
    model.UnknownWordIndex = model.vocab[UnknownWord].index
    return model
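
A plausible call site for the converter, assuming the embedding matrix and the vocabulary were exported from MATLAB; the file name, variable keys, and cell-array unpacking below are hypothetical and depend on how the .mat file was written:

    import scipy.io

    mat = scipy.io.loadmat("embeddings.mat")  # hypothetical file
    word_embedding = mat["syn0"]              # (vocab_size, dim) float matrix
    # MATLAB cell arrays of strings load as nested object arrays
    index2word = [str(cell[0]) for cell in mat["words"].ravel()]

    model = create_corpus_from_matlab(word_embedding, index2word)
    print(model.UnknownWordIndex)  # 0, given UnknownWord occupies slot 0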
Example 8
    def build_vocab(self, sentences):

        logger.info("collecting all words and their counts")
        vocab = self._vocab_from_new(sentences)

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []

        for meta_word in [self.label0_as_vocab, self.label1_as_vocab, self.unknown_as_vocab]:
            v = Vocab(count=1)
            v.index = len(self.vocab)
            v.sample_probability = 1.0
            self.index2word.append(meta_word)
            self.vocab[meta_word] = v

        # remove words with count < min_count (gensim's default min_count is 5;
        # Seger changes it to 1, so in practice no words are removed), then
        # build the self.vocab word->Vocab dict and assign each word a unique index
        for subgram, v in iteritems(vocab):
            if v.count >= self.min_count:
                v.sample_probability = 1.0
                v.index = len(self.vocab)
                self.index2word.append(subgram)
                self.vocab[subgram] = v
        logger.info("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count))
        logger.info('reset weights')

        if self.hybrid_pred:
            # v is word
            # get single character word frequency
            freq_list = [self.vocab[v].count for v in self.vocab if len(v) == 1]
            freq_list.sort(reverse=True)
            self.hybrid_threshold = freq_list[len(freq_list) // 25]
            print '>frequency threshold for hybrid prediction is:', self.hybrid_threshold

        self.reset_weights()
Example 9
    def add_word_to_vocab(self, word, count=1):
        v = Vocab(count=count)
        v.index = len(self.vocab)
        self.vocab[word] = v
        self.index2word.append(word)
        return v
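
The helper keeps vocab and index2word in sync by construction: the new Vocab takes the next free index before being stored. A minimal check, assuming model exposes the same vocab/index2word attributes as the examples above:

    v = model.add_word_to_vocab("zeitgeist", count=3)
    assert model.index2word[v.index] == "zeitgeist"
    assert model.vocab["zeitgeist"].count == 3

    # note: embedding matrices such as syn0 are not resized here; callers
    # must extend them separately before the new word can be trained or looked up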