Esempio n. 1
0
def add_new_labels(sentences, model):
    """
    Add new labels (for new docs) to the doc2vec model's `model.vocab`.

    Each label on each sentence is expected to look like ``<prefix>_<n>``.
    Its number is shifted by the highest number already carried by a
    ``SENT``-prefixed vocabulary key (0 when there is none). The shifted
    label is then either created in the vocabulary and appended to
    `model.index2word`, or, when it already exists, its count is increased
    by the sentence's word count.

    :param sentences: iterable of objects exposing `.words` (sized) and
        `.labels` (iterable of ``<prefix>_<n>`` strings).
    :param model: object exposing `.vocab` (mapping of label -> entry with a
        `count` attribute) and `.index2word` (list). Both are mutated in
        place.
    :return: the number of labels newly added to the vocabulary.
    :raises IndexError: if a sentence label contains no underscore.
    :raises ValueError: if the text after a label's last underscore is not
        an integer.

    NOTE(review): the offset is the *maximum* existing number, so if the
    incoming labels are numbered from 0 the first one is mapped onto the
    highest existing label instead of a fresh one -- confirm that incoming
    labels are 1-based.

    from: <https://gist.github.com/zseder/4201551d7f8608f0b82b>
    """
    vocab = model.vocab

    # Collect the numeric suffixes of existing sentence labels. Keys that
    # merely start with "SENT" (e.g. an ordinary word such as "SENTENCE")
    # are skipped instead of aborting the whole update.
    existing_numbers = []
    for key in vocab:
        if not key.startswith("SENT"):
            continue
        try:
            existing_numbers.append(int(key.split("_")[-1]))
        except ValueError:
            continue
    # A vocabulary without sentence labels means there is nothing to shift by.
    offset = max(existing_numbers) if existing_numbers else 0

    n_sentences = 0
    for sentence in sentences:
        sentence_length = len(sentence.words)
        for label in sentence.labels:
            # Split on the last underscore only, matching how the existing
            # labels are parsed above, so prefixes may contain underscores.
            pieces = label.rsplit("_", 1)
            label = "{0}_{1}".format(pieces[0], int(pieces[1]) + offset)
            if label in vocab:
                vocab[label].count += sentence_length
                continue
            entry = Vocab(count=sentence_length)
            vocab[label] = entry
            # The entry has just been inserted, so it is the last one.
            entry.index = len(vocab) - 1
            entry.code = [0]
            entry.sample_probability = 1.
            model.index2word.append(label)
            n_sentences += 1
    return n_sentences