def main():
    """Train a FastText model on the cleaned skill lists and save it.

    Reads ``data["clean_skills"]``, drops every entry equal to ``['']``,
    trains the model on what is left, prints the model and saves it to the
    path returned by ``get_tmpfile("fasttext.model")``.
    """
    # Entries equal to [''] are placeholders for "no skills" and are skipped.
    skills_train = [skills for skills in data["clean_skills"] if skills != ['']]

    # Training hyperparameters, gathered in one place.
    hyperparams = {
        "size": 60,         # embedding dimensionality
        "window": 40,       # context window
        "min_count": 5,     # minimum word frequency
        "sample": 1e-2,     # down-sampling threshold
        "sg": 1,
        "iter": 100,
    }
    model = FastText(skills_train, **hyperparams)
    model.init_sims(replace=True)
    print(model)

    model.save(get_tmpfile("fasttext.model"))
def test_norm_vectors_not_saved(self):
    """Normalized vectors must be absent after a save/load round trip.

    Checked twice: once for the whole model and once for its keyed vectors
    (``model.wv``) saved on their own.
    """
    # Saving and loading must hit the same file, so resolve the path once.
    path = testfile()

    model = FT_gensim(sentences, min_count=1)
    model.init_sims()

    # Round trip of the full model.
    model.save(path)
    loaded_model = FT_gensim.load(path)
    # assertIsNone reports the unexpected value on failure.
    self.assertIsNone(loaded_model.wv.syn0norm)
    self.assertIsNone(loaded_model.wv.syn0_ngrams_norm)

    # Round trip of the keyed vectors alone (overwrites the same file).
    wv = model.wv
    wv.save(path)
    loaded_kv = FastTextKeyedVectors.load(path)
    self.assertIsNone(loaded_kv.syn0norm)
    self.assertIsNone(loaded_kv.syn0_ngrams_norm)
def test_norm_vectors_not_saved(self):
    """Normalized vectors must be absent after a save/load round trip.

    Checked twice: once for the whole model and once for its keyed vectors
    (``model.wv``) saved on their own.
    """
    tmpf = get_tmpfile('gensim_fasttext.tst')

    model = FT_gensim(sentences, min_count=1)
    model.init_sims()

    # Round trip of the full model.
    model.save(tmpf)
    loaded_model = FT_gensim.load(tmpf)
    # assertIsNone reports the unexpected value on failure.
    self.assertIsNone(loaded_model.wv.vectors_norm)
    self.assertIsNone(loaded_model.wv.vectors_ngrams_norm)

    # Round trip of the keyed vectors alone (overwrites the same file).
    wv = model.wv
    wv.save(tmpf)
    loaded_kv = FastTextKeyedVectors.load(tmpf)
    self.assertIsNone(loaded_kv.vectors_norm)
    self.assertIsNone(loaded_kv.vectors_ngrams_norm)
def train_fasttext_model(output_model_path, iter_docs, size=300, window=8, min_count=5, sg=1, epoch=5):
    """
    Train a fastText model on a corpus and save it to disk.

    Parameters
    ----------
    output_model_path : string
        path of fastText model; missing parent directories are created
    iter_docs : callable
        zero-argument callable returning an iterable of documents; it is
        called twice, once to build the vocabulary and once to train
        NOTE(review): documents were described as raw texts, but gensim
        normally expects each document as a list of tokens - confirm
    size : int
        size of word vector
    window : int
        window size of word2vec
    min_count : int
        minimum word count
    sg : int
        word2vec training algorithm (1: skip-gram other:CBOW)
    epoch : int
        number of epochs
    """
    logging.info("build vocabularies")

    model = FastText(size=size,
                     window=window,
                     min_count=min_count,
                     sg=sg,
                     workers=multiprocessing.cpu_count())
    model.build_vocab(iter_docs())

    logging.info("train fasttext")

    # A second, fresh pass over the corpus; corpus_count was set by build_vocab.
    model.train(iter_docs(), total_examples=model.corpus_count, epochs=epoch)
    # NOTE(review): per gensim docs replace=True keeps only the normalized
    # vectors, so the saved model presumably cannot be trained further - confirm.
    model.init_sims(replace=True)

    logging.info("save model")

    # exist_ok=True avoids the check-then-create race: no FileExistsError if
    # the directory appears between an existence check and mkdir.
    Path(output_model_path).parent.mkdir(parents=True, exist_ok=True)
    model.save(output_model_path)

    logging.info("done.")