Example #1
0
def main():
    """Train a skip-gram FastText model on the skill lists and save it.

    Reads the ``"clean_skills"`` entry of the module-level ``data`` object,
    trains the model, prints a summary of it and stores it in the file
    returned by ``get_tmpfile("fasttext.model")``.
    """
    # Skip entries equal to [''] (a list holding a single empty string).
    corpus = [skills for skills in data["clean_skills"] if skills != ['']]

    # Training hyperparameters, gathered in one place.
    hyperparams = dict(
        size=60,        # embedding dimensionality
        window=40,      # context window
        min_count=5,    # minimum word frequency
        sample=1e-2,    # down-sampling threshold
        sg=1,
        iter=100,
    )

    model = FastText(corpus, **hyperparams)
    model.init_sims(replace=True)
    print(model)

    model.save(get_tmpfile("fasttext.model"))
Example #2
0
    def test_norm_vectors_not_saved(self):
        """Norm attributes must be None after a save/load round trip.

        Checked twice: once for the full model and once for its keyed
        vectors (``model.wv``) saved on their own.
        """
        model = FT_gensim(sentences, min_count=1)
        model.init_sims()
        model.save(testfile())
        loaded_model = FT_gensim.load(testfile())
        # assertIsNone shows the unexpected value on failure, whereas
        # assertTrue(x is None) only reports "False is not true".
        self.assertIsNone(loaded_model.wv.syn0norm)
        self.assertIsNone(loaded_model.wv.syn0_ngrams_norm)

        # Same check for the keyed vectors saved independently of the model.
        wv = model.wv
        wv.save(testfile())
        loaded_kv = FastTextKeyedVectors.load(testfile())
        self.assertIsNone(loaded_kv.syn0norm)
        self.assertIsNone(loaded_kv.syn0_ngrams_norm)
    def test_norm_vectors_not_saved(self):
        """Verify that a saved-and-reloaded model carries no norm vectors.

        The assertion is made both on the reloaded model's ``wv`` and on
        keyed vectors that were saved and reloaded separately.
        """
        model = FT_gensim(sentences, min_count=1)
        model.init_sims()
        model.save(testfile())
        loaded_model = FT_gensim.load(testfile())
        # Prefer assertIsNone: its failure message includes the actual
        # value instead of a bare "False is not true".
        self.assertIsNone(loaded_model.wv.syn0norm)
        self.assertIsNone(loaded_model.wv.syn0_ngrams_norm)

        # Repeat with the keyed vectors persisted by themselves.
        wv = model.wv
        wv.save(testfile())
        loaded_kv = FastTextKeyedVectors.load(testfile())
        self.assertIsNone(loaded_kv.syn0norm)
        self.assertIsNone(loaded_kv.syn0_ngrams_norm)
Example #4
0
    def test_norm_vectors_not_saved(self):
        """Norm attributes must be None after saving and loading.

        Uses one temporary file for both round trips: first the full
        model, then its keyed vectors (``model.wv``) alone.
        """
        tmpf = get_tmpfile('gensim_fasttext.tst')
        model = FT_gensim(sentences, min_count=1)
        model.init_sims()
        model.save(tmpf)
        loaded_model = FT_gensim.load(tmpf)
        # assertIsNone reports the unexpected value when it fails, unlike
        # assertTrue(x is None).
        self.assertIsNone(loaded_model.wv.vectors_norm)
        self.assertIsNone(loaded_model.wv.vectors_ngrams_norm)

        # The keyed vectors overwrite the same temp file and are reloaded.
        wv = model.wv
        wv.save(tmpf)
        loaded_kv = FastTextKeyedVectors.load(tmpf)
        self.assertIsNone(loaded_kv.vectors_norm)
        self.assertIsNone(loaded_kv.vectors_ngrams_norm)
Example #5
0
def train_fasttext_model(output_model_path,
                         iter_docs,
                         size=300,
                         window=8,
                         min_count=5,
                         sg=1,
                         epoch=5):
    """
    Train a FastText model on a corpus and save it to disk.

    Parameters
    ----------
    output_model_path : string
        path the fastText model is saved to; missing parent directories
        are created
    iter_docs : callable
        zero-argument callable returning an iterable of documents. It is
        called twice (vocabulary building, then training), so every call
        must yield the full corpus again. The original documentation
        describes the documents as raw texts.
        NOTE(review): confirm whether documents must already be tokenized.
    size : int
        size of word vector
    window : int
        window size of word2vec
    min_count : int
        minimum word count
    sg : int
        word2vec training algorithm (1: skip-gram other:CBOW)
    epoch : int
        number of epochs
    """
    logging.info("build vocabularies")

    model = FastText(size=size,
                     window=window,
                     min_count=min_count,
                     sg=sg,
                     workers=multiprocessing.cpu_count())
    model.build_vocab(iter_docs())

    logging.info("train fasttext")

    model.train(iter_docs(), total_examples=model.corpus_count, epochs=epoch)
    # NOTE(review): replace=True presumably discards the un-normalized
    # vectors, which would rule out further training - confirm this is
    # intended before the model is saved.
    model.init_sims(replace=True)

    logging.info("save model")

    # exist_ok=True avoids the check-then-create race of testing
    # exists() first: another process may create the directory in between.
    p = Path(output_model_path)
    p.parent.mkdir(parents=True, exist_ok=True)
    model.save(output_model_path)

    logging.info("done.")