Example #1
from gensim.models import FastText
from gensim.test.utils import datapath


def train_gensim_fasttex_embedding(corpus_relative_path,
                                   emb_nm,
                                   minn=3,
                                   maxn=6,
                                   dim=100,
                                   epoch=5,
                                   lr=0.05,
                                   thread=4,
                                   max_vocab_size=200000):
    corpus_absolute_path = datapath(corpus_relative_path)
    # unsupervised training with custom parameters; the fastText-style
    # argument names map onto gensim's keyword arguments
    model = FastText(size=dim, window=3, min_count=1,
                     min_n=minn, max_n=maxn, alpha=lr,
                     workers=thread, max_vocab_size=max_vocab_size)
    model.build_vocab(corpus_file=corpus_absolute_path)
    # build_vocab() only scans the corpus; train() does the actual learning
    model.train(corpus_file=corpus_absolute_path,
                total_examples=model.corpus_count,
                total_words=model.corpus_total_words,
                epochs=epoch)

    # we only keep the max_vocab_size most frequent terms
    # TODO this should probably be del emb.words[max_vocab_size:] instead;
    # use Gensim to change the format and reduce the size
    # TODO ref: https://medium.com/@vasnetsov93/shrinking-fasttext-embeddings-so-that-it-fits-google-colab-cd59ab75959e

    # saving trained model (gensim models use save(), not fastText's save_model())
    model.save(emb_nm)
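
A minimal usage sketch, assuming gensim 3.x (where FastText still takes the
size/iter-style keywords; gensim 4+ renames them to vector_size and epochs).
The corpus below is the lee_background.cor file bundled with gensim's test
data, so datapath() can resolve it; the output filename is hypothetical.

from gensim.models import FastText

# train and save an embedding using the function above
train_gensim_fasttex_embedding('lee_background.cor', 'fasttext_100d.model')

# reload the trained model and look up vectors; FastText can also build
# vectors for words unseen during training from their character n-grams
model = FastText.load('fasttext_100d.model')
vec = model.wv['government']
oov_vec = model.wv['governmental']  # out-of-vocabulary lookup via n-grams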