Example #1
0
def train_doc2vec(cfg):
    """Train a DBOW Doc2Vec model on the configured report corpus.

    Builds the training corpus from the dataset named in the config,
    builds the vocabulary, trains the model, and saves a checkpoint
    under ``cfg.outdir``.

    Args:
        cfg: configuration object exposing ``model.vector_size``,
            ``model.epochs``, ``dataset.name``, ``report.report_policy``
            and ``outdir``.

    Returns:
        str: path of the saved ``.doc2vec`` checkpoint file.
    """
    doc2vec = gensim.models.doc2vec.Doc2Vec(
        dm=0,           # dm=0 selects the DBOW training algorithm
        dbow_words=0,   # do not jointly train word vectors
        vector_size=cfg.model.vector_size,
        window=8,
        min_count=15,   # discard words with fewer than 15 occurrences
        epochs=cfg.model.epochs,
        workers=multiprocessing.cpu_count(),
        callbacks=[EpochLogger()])

    # Build corpus.
    # SECURITY NOTE(review): eval() on a config-supplied string executes
    # arbitrary code. If the config can come from outside, replace this
    # with an explicit name -> dataset-class mapping (dict lookup).
    train_corpus = []
    dataset = eval(cfg.dataset.name)('train',
                                     return_report=True,
                                     return_label=False,
                                     return_image=False)
    print("Building corpus")
    for i, sample in enumerate(tqdm(dataset)):
        report = get_report(sample['report'], policy=cfg.report.report_policy)
        report = gensim.utils.simple_preprocess(report)
        # Tag each document with its corpus index so vectors can be
        # looked up by position later.
        train_corpus.append(gensim.models.doc2vec.TaggedDocument(report, [i]))

    # Build vocab.
    # NOTE(review): wv.vocab / docvecs / vocabulary.min_count are the
    # gensim 3.x API; gensim >= 4 renamed them (wv.key_to_index, dv,
    # model.min_count) — confirm the pinned gensim version before porting.
    doc2vec.build_vocab(train_corpus)
    print(f"Corpus contains {len(train_corpus)} reports \n"
          f"Vocabulary count : {len(doc2vec.wv.vocab)} words \n"
          f"Corpus total words : {doc2vec.corpus_total_words} words \n"
          f"Corpus count : {doc2vec.corpus_count}")
    print(len(doc2vec.docvecs))

    # Train the model
    print("Training model")
    doc2vec.train(train_corpus,
                  total_examples=doc2vec.corpus_count,
                  epochs=doc2vec.epochs)

    # Save the model; the filename encodes the main hyperparameters.
    checkpoint = os.path.join(
        cfg.outdir,
        f'DBOW_vector{doc2vec.vector_size}_window{doc2vec.window}'
        f'_count{doc2vec.vocabulary.min_count}_epoch{doc2vec.epochs}'
        '_mimic.doc2vec')

    doc2vec.save(checkpoint)
    print("Model saved")
    return checkpoint
Example #2
0
 def forward(self, sample):
     """Embed one report with the transformer encoder.

     Extracts the report text from *sample*, tokenizes it, and returns
     the encoder's pooled output as a 1-D numpy array.
     """
     report_text = get_report(sample['report'],
                              policy=self.cfg.report.report_policy)
     tokens = self.tokenizer(report_text, return_tensors="pt")
     pooled = self.model(**tokens).pooler_output
     return pooled.cpu().data.numpy().squeeze(0)
Example #3
0
 def forward(self, sample):
     """Embed one report with the trained Doc2Vec model.

     Extracts and preprocesses the report text from *sample*, then
     infers its document vector.
     """
     text = get_report(sample['report'],
                       policy=self.cfg.report.report_policy)
     tokens = gensim.utils.simple_preprocess(text)
     return self.model.infer_vector(tokens)
 def forward(self, sample):
     """Embed one report with the sentence-embedding model.

     Extracts the report text from *sample*, re-joins its word tokens
     with single spaces, and returns the squeezed sentence embedding.
     """
     text = get_report(sample['report'],
                       policy=self.cfg.report.report_policy)
     normalized = ' '.join(word_tokenize(text))
     embedding = self.model.embed_sentence(normalized)
     return embedding.squeeze(0)