def train_fasttext(conf, seqs, out_name, n_gram):
    """Train a gensim FastText model on ``seqs`` and save it to disk.

    Hyper-parameters are read from the JSON file at ``conf``, which must
    provide the keys ``epoch``, ``window``, ``size``, ``min_count``,
    ``workers``, ``sg``, ``hs``, ``min_n`` and ``max_n``.

    The trained model is saved as
    ``<out_name>_<n_gram>n_<epoch>e_fasttext_model``.

    Args:
        conf: Path to the JSON configuration file (read as UTF-8).
        seqs: Training sentences. Must support ``len()`` and is passed to
            both ``build_vocab`` and ``train``, so it has to be iterable
            more than once.
        out_name: Prefix of the saved model's file name.
        n_gram: Value embedded in the saved file name; it is not passed to
            the model.

    Raises:
        KeyError: If a required key is missing from the configuration.
    """
    # Context manager guarantees the file is closed even if parsing fails.
    with open(conf, 'r', encoding='utf-8') as f:
        conf_dict = json.load(f)

    n_epoch = conf_dict['epoch']
    # Every remaining config key maps one-to-one onto a FastText keyword.
    model_params = {
        key: conf_dict[key]
        for key in ('size', 'window', 'min_count', 'sg', 'hs',
                    'workers', 'min_n', 'max_n')
    }

    print("创建模型...")
    model = FastText(**model_params)
    model.build_vocab(sentences=seqs)
    print("词汇表共包含_" + str(len(model.wv.vocab)) + "_个单词")

    print("开始训练词向量")
    # Logging is configured only here, so INFO output covers the training
    # phase but not the vocabulary build above.
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO,
    )
    model.train(sentences=seqs, total_examples=len(seqs), epochs=n_epoch)
    model.save('{}_{}n_{}e_fasttext_model'.format(out_name, n_gram, n_epoch))
    print("训练结束!!!!")

    # Quick sanity check on a fixed probe token. Query through ``model.wv``:
    # the model-level ``similar_by_word`` forwarder is deprecated.
    print(model.wv.similar_by_word('AACCCC'))
def _load_review_tokens(csv_path, expected_rows):
    """Return ``review_to_words(row[3])`` for every data row of a CSV file.

    The first row is treated as a header and skipped. ``expected_rows`` is
    only used as the ``total`` of the tqdm progress bar.
    """
    # ``newline=''`` lets the csv module handle line endings itself, as its
    # documentation recommends; ``with`` closes the file on any error.
    with open(csv_path, encoding='utf-8', newline='') as fp:
        reader = csv.reader(fp)
        # Skip the header; the default keeps an empty file from raising.
        next(reader, None)
        return [
            review_to_words(row[3])
            for row in tqdm(reader, total=expected_rows)
        ]


def train_word_vectors(train_path, test_path, output_path):
    """Train FastText word vectors on the train and test reviews.

    The fourth column (index 3) of each CSV row is passed through
    ``review_to_words``; the results from both files form one corpus.
    A skip-gram FastText model (size 50, window 4, min_count 0, 4 workers)
    is trained on it for 30 epochs, and only its keyed vectors
    (``model.wv``) are saved to ``output_path``. The ten nearest
    neighbours of a few probe words are printed afterwards.

    Args:
        train_path: Path to the training CSV file (UTF-8, with header row).
        test_path: Path to the test CSV file (UTF-8, with header row).
        output_path: Destination path for the saved word vectors.
    """
    # The row counts are hard-coded and only size the progress bars.
    corpus = _load_review_tokens(train_path, expected_rows=161297)
    corpus.extend(_load_review_tokens(test_path, expected_rows=54766))
    print(len(corpus))

    # Build, train and save the FastText model.
    model = FastText(size=50, window=4, min_count=0, workers=4, sg=1)
    model.build_vocab(sentences=corpus)
    model.train(sentences=corpus, total_examples=len(corpus), epochs=30)
    model.wv.save(output_path)

    # Print the nearest neighbours of a few probe words. Query through
    # ``model.wv``: the model-level forwarder is deprecated.
    for word in ("oral", "drug", "sex", "surgery", "and"):
        results = model.wv.similar_by_word(word)
        print(results[:10])
        print("\n\n")