Esempio n. 1
0
def add_new_labels(sentences, model):
    """
    Add new labels (for new docs) to the doc2vec model's `model.vocab`.

    Every label in `sentences` is expected to look like ``<prefix>_<n>``.
    It is renumbered to ``<prefix>_<n + offset>``, where ``offset`` is the
    largest numeric suffix among the existing vocab keys that start with
    "SENT" (0 when the model has no such key yet). If the renumbered label
    already exists in the vocab its count is increased by the sentence
    length; otherwise a new `Vocab` entry is created and the label is
    appended to `model.index2word`.

    from: <https://gist.github.com/zseder/4201551d7f8608f0b82b>

    :param sentences: iterable of objects exposing `words` and `labels`
    :param model: object exposing `vocab` (mapping) and `index2word` (list)
    :return: number of labels newly inserted into the vocab
    """
    vocab = model.vocab

    # Materialise the suffixes first so an empty model falls back to 0
    # instead of raising ValueError from max() on an empty sequence.
    existing_ids = [
        int(key.split('_')[-1]) for key in vocab if key.startswith("SENT")
    ]
    model_sentence_n = max(existing_ids) if existing_ids else 0

    # NOTE(review): the offset is the highest existing suffix, so an incoming
    # label numbered 0 maps onto that already-present key and only bumps its
    # count -- confirm that incoming labels are numbered from 1.
    n_sentences = 0
    for sentence in sentences:
        sentence_length = len(sentence.words)
        for label in sentence.labels:
            label_e = label.split("_")
            shifted = "{0}_{1}".format(
                label_e[0], int(label_e[1]) + model_sentence_n)
            if shifted in vocab:
                vocab[shifted].count += sentence_length
                continue

            entry = Vocab(count=sentence_length)
            vocab[shifted] = entry
            # The entry was just inserted, so its slot is the last one.
            entry.index = len(vocab) - 1
            entry.code = [0]
            entry.sample_probability = 1.
            model.index2word.append(shifted)
            n_sentences += 1
    return n_sentences
Esempio n. 2
0
    def build_vocab(self, sentences):
        """
        Build `self.vocab` and `self.index2word` from `sentences`.

        Raw counts come from `self._vocab_from_new(sentences)`. The three meta
        entries (`label0_as_vocab`, `label1_as_vocab`, `unknown_as_vocab`) are
        registered first, followed by every collected entry whose count is at
        least `self.min_count`. When `self.hybrid_pred` is truthy,
        `self.hybrid_threshold` is derived from the counts of the
        single-character vocab keys. Finishes by calling
        `self.reset_weights()`.

        :param sentences: corpus passed unchanged to `self._vocab_from_new`
        """
        logger.info("collecting all words and their counts")
        vocab = self._vocab_from_new(sentences)

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []

        # Meta entries are inserted before any corpus entry, each with a
        # count of 1 and a sample probability of 1.0.
        for meta_word in (
                self.label0_as_vocab, self.label1_as_vocab,
                self.unknown_as_vocab
        ):
            v = Vocab(count=1)
            v.index = len(self.vocab)
            v.sample_probability = 1.0
            self.index2word.append(meta_word)
            self.vocab[meta_word] = v

        # Keep only entries whose count reaches min_count. Per the original
        # note, gensim defaults min_count to 5 and this project lowers it to
        # 1, so in practice no word is dropped.
        # Build the word -> Vocab dict and give each kept word a unique index.
        for subgram, v in iteritems(vocab):
            if v.count >= self.min_count:
                v.sample_probability = 1.0
                v.index = len(self.vocab)
                self.index2word.append(subgram)
                self.vocab[subgram] = v
        logger.info("total %i word types after removing those with count<%s",
                    len(self.vocab), self.min_count)
        logger.info('reset weights')

        if self.hybrid_pred:
            # Counts of the single-character vocab keys, highest first.
            freq_list = sorted(
                (self.vocab[word].count
                 for word in self.vocab if len(word) == 1),
                reverse=True)
            # Floor division keeps the index an int under Python 3 and
            # `from __future__ import division`.
            # NOTE(review): raises IndexError when no single-character key
            # exists -- confirm that cannot happen for real corpora.
            self.hybrid_threshold = freq_list[len(freq_list) // 25]
            # Single-argument parenthesised form prints the same text on
            # Python 2 (statement) and Python 3 (function).
            print('>frequencey threshold for hybrid prediction is: %s'
                  % (self.hybrid_threshold,))

        self.reset_weights()
Esempio n. 3
0
    def build_vocab(self, sentences):
        """
        Build `self.vocab` and `self.index2word` from `sentences`.

        Raw counts come from `self._vocab_from_new(sentences)`. The three meta
        entries (`label0_as_vocab`, `label1_as_vocab`, `unknown_as_vocab`) are
        registered first, followed by every collected entry whose count is at
        least `self.min_count`. When `self.hybrid_pred` is truthy,
        `self.hybrid_threshold` is derived from the counts of the
        single-character vocab keys. Finishes by calling
        `self.reset_weights()`.

        :param sentences: corpus passed unchanged to `self._vocab_from_new`
        """
        logger.info("collecting all words and their counts")
        vocab = self._vocab_from_new(sentences)

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []

        # Meta entries are inserted before any corpus entry, each with a
        # count of 1 and a sample probability of 1.0.
        for meta_word in (self.label0_as_vocab, self.label1_as_vocab, self.unknown_as_vocab):
            v = Vocab(count=1)
            v.index = len(self.vocab)
            v.sample_probability = 1.0
            self.index2word.append(meta_word)
            self.vocab[meta_word] = v

        # Keep only entries whose count reaches min_count. Per the original
        # note, gensim defaults min_count to 5 and this project lowers it to
        # 1, so in practice no word is dropped.
        # Build the word -> Vocab dict and give each kept word a unique index.
        for subgram, v in iteritems(vocab):
            if v.count >= self.min_count:
                v.sample_probability = 1.0
                v.index = len(self.vocab)
                self.index2word.append(subgram)
                self.vocab[subgram] = v
        logger.info("total %i word types after removing those with count<%s", len(self.vocab), self.min_count)
        logger.info('reset weights')

        if self.hybrid_pred:
            # Counts of the single-character vocab keys, highest first.
            freq_list = sorted(
                (self.vocab[word].count for word in self.vocab if len(word) == 1),
                reverse=True)
            # Floor division keeps the index an int under Python 3 and
            # `from __future__ import division`.
            # NOTE(review): raises IndexError when no single-character key
            # exists -- confirm that cannot happen for real corpora.
            self.hybrid_threshold = freq_list[len(freq_list) // 25]
            # Single-argument parenthesised form prints the same text on
            # Python 2 (statement) and Python 3 (function).
            print('>frequencey threshold for hybrid prediction is: %s' % (self.hybrid_threshold,))

        self.reset_weights()