def fasttext_proc(params):
    line_sentence = LineSentence(config.words_file)
    model = fasttext.FastText(line_sentence,
                              size=params['vector_dim'],
                              window=params['window_size'],
                              min_count=params['min_frequency'],
                              workers=params['workers'],
                              sg=params['use_skip_gram'],
                              hs=params['use_hierarchical_softmax'],
                              negative=params['negative_size'],
                              iter=params['pre_proc_epochs'])
    return model
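# A minimal usage sketch for fasttext_proc above. The dict keys mirror what the
# function reads; the values here are illustrative defaults only, not ones from
# the original project, and config.words_file must already point at a
# whitespace-tokenised corpus file.
params = {
    'vector_dim': 100,             # size of the trained vectors
    'window_size': 5,              # context window
    'min_frequency': 5,            # drop words rarer than this
    'workers': 4,                  # training threads
    'use_skip_gram': 1,            # 1 = skip-gram, 0 = CBOW
    'use_hierarchical_softmax': 0, # 0 = negative sampling
    'negative_size': 5,            # negative samples per word
    'pre_proc_epochs': 5,          # training epochs
}
model = fasttext_proc(params)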
def train_word2vec(x):
    # Train the word-embedding model (fastText variant of word2vec).
    # model = word2vec.Word2Vec(x, size=500, window=5, min_count=5, workers=12, iter=10, sg=1)
    model = fasttext.FastText(min_count=5, size=500)
    model.build_vocab(x)
    # corpus_count is set by build_vocab, so it always matches the input size.
    model.train(x, total_examples=model.corpus_count, epochs=10)
    return model
def train_w2v(config):
    try:
        print("word2vec train start")
        update_flag = False
        model = fasttext.FastText(size=300, window=5, min_count=1, workers=4)
        with open(config.pos_path) as f:
            for line in f.readlines():
                # The vocabulary must be built once with update=False, then
                # extended with update=True on every subsequent call.
                model.build_vocab([line.split(' ')], update=update_flag)
                update_flag = True
        with open(config.pos_path) as f:
            for line in f.readlines():
                for _ in range(100):
                    # train() expects a list of tokenised sentences, not a
                    # bare token list.
                    model.train([line.split(' ')],
                                total_examples=model.corpus_count,
                                epochs=model.epochs)
        os.makedirs(config.embedding_model_path, exist_ok=True)
        model.save(''.join([config.embedding_model_path, '/', 'model']))
        return model
    except Exception as e:
        print("error on train w2v : {0}".format(e))
    finally:
        print("word2vec train done")
def trainModel_fasttext(train_sen, model_output):
    # sentences = GetSentences(file_input)  # yield sentences
    # sentences = list(sentences)
    word2vec_model = fasttext.FastText(train_sen,
                                       sg=SG,
                                       min_count=MIN_COUNT,
                                       workers=CPU_NUM,
                                       size=VEC_SIZE,
                                       window=CONTEXT_WINDOW)
    word2vec_model.save(model_output)
def create_new_model(cls, corpus_path, pmodel_name, epochs=5, pmin_count=10,
                     psize=150, installdir=''):
    """
    Creates and trains (and optionally saves) a model using gensim's
    implementation of the fastText algorithm, then loads the KeyedVectors
    associated with that model. For CREATION/first-time training only.
    To continue training an already existing model, use update_model().

    Parameters
    ----------
    corpus_path (str) - path to the corpus you wish to train the model with
    pmodel_name (str) - the name assigned to the model when saved; must be
        unique, or an error is raised to avoid overwriting an existing model
    epochs (int, optional) - number of passes over the training corpus
    pmin_count (int, optional) - minimum frequency for a word to be used in training
    psize (int, optional) - size of vectors for training
    installdir (str, optional) - installation root prepended to the model path

    Returns
    -------
    True if model created/trained, False if it could not be created

    Throws
    ------
    FileNotFoundError - if corpus_path is not found
    RuntimeError - if training an already existing model that makes it past
        the first if statement; build_vocab raises RuntimeError when building
        an existing vocab without update=True (see update_model)
    """
    # model_path must be defined even when installdir is empty, otherwise the
    # os.path.join below fails with a NameError.
    model_path = installdir + IKFastTextModeling.__PATH_PREFIX__
    if pmodel_name[-4:] != '.bin':
        pmodel_name = pmodel_name + '.bin'
    if os.path.exists(os.path.join(model_path, pmodel_name)):
        raise FileExistsError(
            "Model named {} already exists, model could not be created".format(
                pmodel_name[:-4]))
    model = ft.FastText(vector_size=psize, sg=1, min_count=pmin_count)
    super().create_new_model(corpus_path, model, epochs)
    ft.save_facebook_model(model, path=os.path.join(model_path, pmodel_name))
    return True
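# Hypothetical call of create_new_model above; IKFastTextModeling is the class
# the snippet itself references, while the corpus path and installdir are
# placeholders. FileExistsError is raised if the model name is already taken.
IKFastTextModeling.create_new_model('corpora/training_corpus.txt',
                                    'my_model',
                                    epochs=10, pmin_count=5, psize=150,
                                    installdir='/opt/install')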
def train_fasttext(self, data):
    self.logger.info('train fasttext....')
    self.logger.info(f'word vector size is: {self.size}')
    self.model = fasttext.FastText(data,
                                   sg=self.sg,
                                   iter=self.iter,
                                   seed=self.seed,
                                   size=self.size,
                                   window=self.window,
                                   workers=self.workers,
                                   min_count=self.min_count,
                                   word_ngrams=self.word_ngrams)
def getFastTextModel(train='', load='', modelname='', min_word=200):
    if train != '':
        # train model
        print(train[:10])
        model = fasttext.FastText(sentences=train, min_count=min_word)
        # pickle the entire model to load and resume training later
        model.save('word_embeddings/fasttext/models/' + modelname + '.model.bin')
        return model
    elif load != '':
        model = fasttext.FastText.load('word_embeddings/fasttext/models/' + load)
        return model
def word2vec(self):
    sentences = word2vec.LineSentence(self.ast_file)
    model = fasttext.FastText(sentences, size=para.chunk_len - 1, window=3,
                              min_count=1, iter=10, min_n=3, max_n=6,
                              word_ngrams=0, max_vocab_size=932)
    # model = word2vec.Word2Vec(sentences, size=self.astdim)
    model.save(u"ast.model")
    return model
def fasttext_model_gensim(sentences):
    # Build a fastText model with gensim
    model = fasttext_gensim.FastText(sentences, size=200, window=6, min_count=1,
                                     iter=10, min_n=3, max_n=6)
    print(model.wv["体育"])           # one way to get a word vector
    print(model["体育"])              # another way (deprecated item access)
    print(model.wv.word_vec("体育"))  # a third way
    model.save("./fastText1.kpt")
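# Round-trip sketch for the model saved above: FastText.load restores the full
# model, including the subword n-gram weights, so vectors can still be composed
# for out-of-vocabulary words after reloading.
model = fasttext_gensim.FastText.load("./fastText1.kpt")
print(model.wv.most_similar("体育", topn=3))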
def model_fasttext(text, params):
    """
    generate a fasttext model from a text (list of sentences)
    :param text: text, as a list of sentences (strings)
    :param params: dictionary of parameter space for word2vec
    :return: trained encoder model for fasttext
    """
    train_text = [clean_text(s).split() for s in text]
    model = fasttext.FastText(**params)
    model.build_vocab(train_text)
    model.train(train_text, total_examples=model.corpus_count, epochs=model.iter)
    return model
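# Illustrative params for model_fasttext above, using the pre-4.0 gensim keyword
# names (size/iter) that the function relies on (it reads model.iter after
# construction); clean_text is an external helper from the original project.
params = {'size': 100, 'window': 5, 'min_count': 2, 'sg': 1, 'iter': 10}
model = model_fasttext(["The quick brown fox.",
                        "A second example sentence."], params)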
def fit(self, documents=None):
    self.documents = documents
    _resumes_words_list = self.__get_sentences_tokens(self.documents)
    self.model_tfidf = TfidfVectorizer()
    self.model_tfidf.fit(self.documents)
    self.model_word2vec = fasttext.FastText(_resumes_words_list, negative=5,
                                            workers=4, iter=self.iter,
                                            min_count=self.min_count)
    self.word_vectors = self.model_word2vec.wv.syn0
    self.model_cluster = GaussianMixture(n_components=self.n_components)
    self.model_cluster.fit(self.word_vectors)
def train_wv(self, merge_seg_data_fpath):
    '''
    Train word vectors.
    :param merge_seg_data_fpath: str, path to the segmented training data
    :return:
    '''
    # train word vectors
    if 'word2vec' == self.wv_type:
        self.wv_model = word2vec.Word2Vec(
            LineSentence(merge_seg_data_fpath),
            min_count=self.wv_config['min_count'],
            size=self.wv_config['size'])
    elif 'fasttext' == self.wv_type:
        self.wv_model = fasttext.FastText(
            LineSentence(merge_seg_data_fpath),
            min_count=self.wv_config['min_count'],
            size=self.wv_config['size'])
def train(self):
    self.check_no_data()
    pre_trained_model = fasttext.FastText(seed=self.seed, sg=self.sg,
                                          alpha=self.alpha, size=self.size,
                                          window=self.window,
                                          min_count=self.min_count,
                                          min_n=self.min_n, max_n=self.max_n,
                                          iter=self.iter)
    pre_trained_model.build_vocab(sentences=self.sentences)
    pre_trained_model.train(sentences=self.sentences,
                            total_examples=pre_trained_model.corpus_count,
                            epochs=5)
    self.model = pre_trained_model
def fasttext_embeddings(Y, notes_file, embedding_size, min_count, n_iter):
    modelname = "processed_%s.fasttext" % (Y)
    sentences = ProcessedIter(Y, notes_file)
    model = fasttext.FastText(size=embedding_size, min_count=min_count,
                              iter=n_iter)
    print("building fasttext vocab on %s..." % (notes_file))
    model.build_vocab(sentences)
    print("training...")
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    out_file = '/'.join(notes_file.split('/')[:-1] + [modelname])
    print("writing embeddings to %s" % (out_file))
    model.save(out_file)
    return out_file
def train_fast_text(data_paths="", data=None, model_paths="",
                    model_save_path="", epochs=1, option="create"):
    model = None
    start = time.time()
    if option == "load":
        print('Loading FastText model...')
        model = fasttext.FastText.load(model_paths)
    else:
        print("Paths read", len(data_paths))
        if option == "create":
            print('Creating FastText model...')
            model = fasttext.FastText(size=300, window=10, sg=1, sample=1e-5,
                                      workers=multiprocessing.cpu_count(),
                                      callbacks=[BatchLogger(model_save_path)])
            model.build_vocab(NextSentMem(data))
            print("Vocabulary is built!", time.time() - start,
                  len(model.wv.vocab))
            model.train(NextSentMem(data), epochs=epochs,
                        total_examples=model.corpus_count, compute_loss=True,
                        report_delay=1.0,
                        callbacks=[BatchLogger(model_save_path)])
            model.save(model_save_path.format("Base"))
        elif option == "retrain":
            print('Retraining FastText model...')
            model = fasttext.FastText.load(model_paths)
            model.train(NextSentMem(data), epochs=epochs,
                        total_examples=model.corpus_count, compute_loss=True,
                        report_delay=1.0,
                        callbacks=[BatchLogger(model_save_path)])
            model.save(model_save_path.format("Ret"))
    print("Process ended!", time.time() - start)
    return model
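# Sketch of two of the modes accepted by train_fast_text above. NextSentMem and
# BatchLogger are helpers from the original project; my_sentences and the
# save-path template (note the {} placeholder it formats) are illustrative.
model = train_fast_text(data=my_sentences,
                        model_save_path="models/ft_{}.model",
                        epochs=3, option="create")
model = train_fast_text(model_paths="models/ft_Base.model", option="load")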
def train_unsup():
    print("start train gensim fasttext unsup model")
    model = fasttext.FastText(size=256, window=5, min_count=1, word_ngrams=0,
                              workers=8)
    # scan over the corpus to build the vocabulary
    model.build_vocab(corpus_file=fileConfig.dir_fasttext +
                      fileConfig.file_fasttext_unsup_train_data)
    total_words = model.corpus_total_words  # number of words in the corpus
    print('train...')
    model.train(corpus_file=fileConfig.dir_fasttext +
                fileConfig.file_fasttext_unsup_train_data,
                total_words=total_words, epochs=3)
    model.save(fileConfig.dir_fasttext +
               fileConfig.file_fasttext_gensim_unsup_model)
    print("success train gensim fasttext unsup model")
def get_model(self, hs=1, negative=5, cbow_mean=0, iter=10, size=100,
              min_count=5, max_vocab_size=1000000, workers=3,
              articles_to_learn=1000, randomTrain=False):
    dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    if self.dev_mode:
        # Gets all files from the folder at this location.
        sentences1 = MySentences(dir_path + '/DataSet')
    else:
        print("Training model, be aware this is on a real training set, "
              "so it might take a while")
        # Build training data from a large sample, using articles_to_learn articles.
        sentences1 = ZippedSentences(dir_path + '/RealDataSet/wiki_flat.zip',
                                     articles_to_learn, randomTrain)
    Fast_Text_model = fasttext.FastText(
        sentences=sentences1,           # sentences to train from
        sg=1,                           # 0 for CBOW, 1 for skip-gram
        hs=hs,                          # 1 for hierarchical softmax; if 0 and negative is non-zero, negative sampling is used
        negative=negative,              # 0 disables negative sampling; otherwise how many noise words to draw (usually 5-20)
        cbow_mean=cbow_mean,            # 0 for sum of context vectors, 1 for mean; only used with CBOW
        iter=iter,                      # number of epochs
        size=size,                      # feature vector dimensionality
        min_count=min_count,            # minimum frequency of words required
        max_vocab_size=max_vocab_size,  # RAM cap; 10 million words needs approx. 1GB. None = unlimited
        workers=workers,                # how many threads are started for training
    )
    self.model = Fast_Text_model
    return Fast_Text_model
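# Hypothetical usage of get_model above; the enclosing class is not shown in
# this snippet, so `trainer` stands in for any instance that defines dev_mode.
# With hs=0 and negative > 0, negative sampling replaces hierarchical softmax.
model = trainer.get_model(hs=0, negative=10, iter=5, size=200, workers=4)
print(model.wv.most_similar('example', topn=5))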
def main():
    corpus_file = "data.txt"
    iter_count = 1
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    f = open("data/%s" % corpus_file, "r")
    text = f.read()
    sentences = [s.split(" ") for s in text.split("\n")]
    for i in range(1, 2):
        model = fasttext.FastText(min_count=1, seed=1, workers=1,
                                  iter=iter_count)
        model.build_vocab(sentences)
        model.train(sentences, total_examples=model.corpus_count,
                    epochs=model.iter)
        model.save("model/fasttext_gensim_iter=100_%s.model" % i)
def get_model(self):
    # Extract articles_to_learn articles from the training set.
    zips = ZippedSentences('wiki_flat.zip', self.articles_to_learn)
    Fast_Text_model = fasttext.FastText(
        sentences=zips,       # sentences to train from
        sg=1,                 # 0 for CBOW, 1 for skip-gram
        hs=1,                 # 1 for hierarchical softmax; if 0 and negative is non-zero, negative sampling is used
        negative=1,           # 0 disables negative sampling; otherwise how many noise words to draw (usually 5-20)
        iter=10,              # number of epochs
        size=100,             # feature vector dimensionality
        min_count=5,          # minimum frequency of words required
        max_vocab_size=None,  # RAM cap; 10 million words needs approx. 1GB. None = unlimited
        workers=3,            # how many threads are started for training
        min_n=3,              # minimum char n-gram length (a word of length 5 splits into parts, with extra beginning and end parts added)
        max_n=6,              # maximum char n-gram length
        word_ngrams=1,        # 1 means using char n-grams, 0 is equivalent to word2vec
    )
    return Fast_Text_model
def fastText_train(train_file, save_model_name):
    '''
    Train a word-vector model (fastText version).
    INPUT -> training corpus path, name for the saved model
    '''
    corpus_path = FILE_DIR + '/' + train_file
    model_path = FILE_DIR + '/' + save_model_name + '.bin'
    corpus_file = datapath(corpus_path)
    model = fasttext.FastText(
        corpus_file=corpus_file,  # training corpus; a path must be passed as corpus_file, not as sentences
        sg=1,          # 1 for skip-gram (sensitive to rare words), 0 for CBOW
        size=150,      # output vector dimensionality, usually 100-200 (too small causes mapping collisions, too large wastes memory)
        window=5,      # maximum distance between the current and target word (n words before and after)
        min_count=1,   # words occurring fewer than n times are ignored, default 5
        alpha=0.025,   # learning rate
        workers=4,     # number of training threads; only effective when Cython is installed
        iter=5,        # training epochs, default 5
    )
    # model.save(FILE_DIR + '/' + save_model_name)
    # save the model in binary format for reuse
    model.wv.save_word2vec_format(model_path, binary=True)
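# The function above saves in word2vec binary format, which keeps only the
# whole-word vectors (the subword n-grams are discarded), so the result is
# reloaded as KeyedVectors rather than as a FastText model. The model name
# below is illustrative.
from gensim.models import KeyedVectors
wv = KeyedVectors.load_word2vec_format(FILE_DIR + '/my_model.bin', binary=True)
print(wv.most_similar(wv.index2word[0], topn=5))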
def ft(model_name, iter_count):
    """ fasttext """
    print("prepare data.")
    os.chdir("data")
    set_data(mode="word")
    corpus_file = "tmp.txt"
    f = open("%s" % corpus_file, "r", encoding="utf-8")
    text = f.read()
    sentences = [s.split(" ") for s in text.split("\n")]
    print("train model.")
    # Fixing the seed is meaningless unless workers=1 (per the gensim docs).
    model = fasttext.FastText(min_count=1, seed=1, workers=1, iter=iter_count)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    print("save model.")
    os.chdir("..")
    model.save("model/%s" % model_name)
def result(self):
    return fasttext.FastText(sentences=self._sentence_gen).wv
for line in f_5:
    line = line.rstrip()
    sentences.append(tokenizer.tokenize(line))
print("Finished File 5")
for line in f_6:
    line = line.rstrip()
    sentences.append(tokenizer.tokenize(line))
print("Finished File 6")

# count tokens, each sentence being a list of tokens
token_count = sum([len(sentence) for sentence in sentences])
print("The Sinhala corpus contains {0:,} tokens".format(token_count))

# define fasttext model; use the sg argument (0 or 1) to choose between CBOW and skip-gram
model = fasttext.FastText(size=300, window=10, min_count=1, workers=8, sg=1)
model.build_vocab(sentences=sentences)

# train fasttext model
model.train(sentences=sentences, total_examples=len(sentences), epochs=50)

# save model
if not os.path.exists("trained_fasttext_300_nsw"):
    os.makedirs("trained_fasttext_300_nsw")
model.save(os.path.join("trained_fasttext_300_nsw", "fasttext_100_nsw.w2v"))
def train_fasttext_model(infile_name, outfile_name=None, dim=100, ws=4,
                         min_count=3, n_jobs=1, minn=1, maxn=2,
                         method='cbow', epoch=30):
    """
    Train a fasttext (Parallel2vec) model on a corpus file extracted from molecules.

    - parameters in FastText: https://fasttext.cc/docs/en/options.html
    - parameters in gensim:
      https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/FastText_Tutorial.ipynb
    - parameters of fasttext in gensim vs original FastText: sg=0 means 'cbow',
      size means dim, window means ws, iter means epoch,
      min_count means minCount, min_n means minn, max_n means maxn

    :param infile_name: path to a file on disk containing sentences (one line =
        one sentence); words must already be preprocessed and whitespace-separated
    :param outfile_name: optional path to save the trained model
    :param dim: size of word vectors [100]
    :param ws: size of the context window [4]
    :param min_count: minimal number of word occurrences [3]
    :param n_jobs: number of worker threads
    :param minn: min length of char ngram [1]
    :param maxn: max length of char ngram [2]
    :param method: skip-gram / cbow [cbow]
    :param epoch: number of epochs [30]
    :return: fasttext model
    """
    if method.lower() == 'skip-gram':
        sg = 1
    elif method.lower() == 'cbow':
        sg = 0
    else:
        raise ValueError('skip-gram and cbow are the only valid options')
    start = timeit.default_timer()
    model = fasttext.FastText(sg=sg, size=dim, window=ws, min_count=min_count,
                              min_n=minn, max_n=maxn, workers=n_jobs)
    # model = word2vec.Word2Vec(corpus, size=vector_size, window=window,
    #                           min_count=min_count, workers=n_jobs, sg=sg, **kwargs)
    # corpus = word2vec.LineSentence(infile_name)
    print('>>> Start to read molecular sentences...')
    model.build_vocab(corpus_file=infile_name)
    print('Count of molecular sentences: {}, count of unique fragments: {}'.format(
        model.corpus_count, len(model.wv.vocab)))
    print('>>> Start to train model...')
    abc = model.train(corpus_file=infile_name,
                      total_examples=model.corpus_count,
                      epochs=epoch, total_words=len(model.wv.vocab))
    try:
        print('return values of model training: {}'.format(abc))
    except Exception:
        pass
    if outfile_name:
        # fname = get_tmpfile("fasttext.model")
        model.save(outfile_name)
    stop = timeit.default_timer()
    print('Runtime: ', round((stop - start) / 60, 2), ' minutes')
    return model
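# Example call of train_fasttext_model above; corpus.txt stands in for a
# whitespace-tokenised, one-sentence-per-line file of molecular sentences, and
# the output name is a placeholder.
model = train_fasttext_model('corpus.txt', outfile_name='mol_ft.model',
                             dim=100, ws=4, method='skip-gram', epoch=30,
                             n_jobs=4)
print(model.wv.most_similar(model.wv.index2word[0], topn=5))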
from gensim.models import word2vec
from gensim.models import fasttext
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

sentences = word2vec.Text8Corpus('text8/text8')
model = fasttext.FastText(sentences)
model.save('FT8/fasttext.model')