Example #1
import os
from collections import Counter

import numpy as np

from torch2vec.data import DataPreparation
from torch2vec.torch2vec import DM


def _get_bigrams(corpus, min_count):
    """Return the bigrams occurring more than `min_count` times, as
    space-joined strings (e.g. 'new york')."""
    text = np.copy(corpus)
    # Flatten the tokenized sentences into one word stream and pair each word
    # with its successor (pairs can also span sentence boundaries).
    vocab = [word for sen in text for word in sen]
    ngram = [(i, j) for i, j in zip(vocab[:-1], vocab[1:])]
    freq = Counter(ngram)
    filterbi = [
        bigram for bigram in freq.most_common() if bigram[1] > min_count
    ]
    bigrams = [" ".join(bigram[0]) for bigram in filterbi]
    return bigrams
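
# A minimal sanity check of _get_bigrams (not part of the original example;
# the toy corpus and min_count=0 below are illustrative only).
_toy = np.array([["new", "york", "is", "big"], ["new", "york", "city"]], dtype=object)
print(_get_bigrams(_toy, min_count=0))  # 'new york' is listed first (it occurs twice)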


# `corpus` (a pandas DataFrame/Series of raw documents) and `phraser` (which
# presumably merges frequent word pairs into single tokens) are defined earlier
# in the original script and are not shown in this snippet.
data = DataPreparation(corpus.reset_index(), f_size=3)
data.tokenize()

# Two phrasing passes: detect frequent bigrams and merge them, then repeat with
# a lower threshold so longer phrases can form from already-merged tokens.
bigrams = _get_bigrams(data.corpus.values, min_count=700)
data.corpus = phraser(data.corpus.values)
bigrams = _get_bigrams(data.corpus.values, min_count=500)
data.corpus = phraser(data.corpus.values)

data.vocab_builder()
# Training samples: document ids, context-word ids, and target-plus-noise word
# ids (for negative sampling).
doc, context, target_noise_ids = data.get_data(window_size=5,
                                               num_noise_words=10)

# Distributed-memory doc2vec model: 100-dimensional vectors, one per document,
# trained on the GPU.
model = DM(vec_dim=100, num_docs=len(data), num_words=data.vocab_size).cuda()

num_workers = os.cpu_count()
# The listing truncates this call; the remaining arguments below mirror
# Example #2 and are assumed rather than original.
model.fit(doc_ids=doc,
          context=context,
          target_noise_ids=target_noise_ids,
          epochs=5,
          batch_size=3000,
          num_workers=num_workers)
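
# Not part of the original snippet: persisting the document vectors the same
# way Example #2 does (the file name 'weights_bigram' is illustrative).
model.save_model(ids=data.document_ids, file_name='weights_bigram')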
Example #2
#!/usr/bin/env python
# coding: utf-8


import os
from torch2vec.torch2vec import DM
from torch2vec.data import DataPreparation
# pd.read_csv('../input/')

data = DataPreparation(
    '../input/recsysluc/semantic_dump.txt',
    vocab_size=int(2e5))  # vocab_size restricts the vocabulary size

data.vocab_builder()

# Training samples: document ids, context-word ids, and target-plus-noise word
# ids (for negative sampling).
doc, context, target_noise_ids = data.get_data(window_size=3,
                                               num_noise_words=6)

model = DM(vec_dim=100, num_docs=len(data), num_words=data.vocab_size).cuda()

num_workers = os.cpu_count()
model.fit(doc_ids=doc,
          context=context,
          target_noise_ids=target_noise_ids,
          epochs=5,
          batch_size=3000,
          num_workers=num_workers)

# Save the learned document vectors together with their document ids.
model.save_model(ids=data.document_ids, file_name='weights')
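
Both examples follow the same torch2vec pipeline: prepare and tokenize the corpus, build the vocabulary, draw (document, context, target-plus-noise) training samples, fit the distributed-memory DM model with noise-contrastive training on the GPU, and save the per-document vectors keyed by data.document_ids. Example #1 additionally merges frequent bigrams into single tokens before training, while Example #2 trains directly on a plain-text dump.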