Ejemplo n.º 1
0
def train_fasttext(conf, seqs, out_name, n_gram):
    """Train a FastText model on ``seqs`` and save it to disk.

    Hyper-parameters are read from a JSON file that must contain the keys
    ``epoch``, ``window``, ``size``, ``min_count``, ``workers``, ``sg``,
    ``hs``, ``min_n`` and ``max_n``.

    Args:
        conf: Path to the JSON configuration file.
        seqs: Training corpus. It is passed to both ``build_vocab`` and
            ``train`` and must support ``len()``.
        out_name: Prefix of the file the trained model is saved to.
        n_gram: Value embedded in the output file name (``<n_gram>n``).

    Returns:
        None. The model is written to
        ``<out_name>_<n_gram>n_<epoch>e_fasttext_model``.

    Raises:
        KeyError: If a required key is missing from the configuration.
    """
    # A context manager closes the file even if JSON parsing fails.
    with open(conf, 'r') as f:
        conf_dict = json.load(f)

    n_epoch = conf_dict['epoch']

    print("创建模型...")
    model = FastText(
        size=conf_dict['size'],
        window=conf_dict['window'],
        min_count=conf_dict['min_count'],
        sg=conf_dict['sg'],
        hs=conf_dict['hs'],
        workers=conf_dict['workers'],
        min_n=conf_dict['min_n'],
        max_n=conf_dict['max_n'],
    )
    model.build_vocab(sentences=seqs)
    print(f"词汇表共包含_{len(model.wv.vocab)}_个单词")

    print("开始训练词向量")
    # Enable INFO logging only now, right before the training call.
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO,
    )
    model.train(sentences=seqs, total_examples=len(seqs), epochs=n_epoch)
    model.save(f"{out_name}_{n_gram}n_{n_epoch}e_fasttext_model")
    print("训练结束!!!!")

    # Query through the keyed vectors: the model-level similar_by_word is a
    # deprecated shim slated for removal in gensim 4.0.
    print(model.wv.similar_by_word('AACCCC'))
Ejemplo n.º 2
0
def _load_review_tokens(path, total):
    """Return the tokenised review (column 3) of every data row in a CSV file.

    The first row is treated as a header and skipped. ``total`` is only used
    as the expected row count for the progress bar.
    """
    # newline='' lets the csv module handle line breaks inside quoted fields;
    # the context manager closes the file even if a row fails to process.
    with open(path, encoding='utf-8', newline='') as fp:
        reader = csv.reader(fp)
        next(reader)  # skip over header
        return [review_to_words(row[3]) for row in tqdm(reader, total=total)]


def train_word_vectors(train_path, test_path, output_path):
    """Train FastText word vectors on the train and test reviews and save them.

    Reviews are read from column 3 of both CSV files, converted with
    ``review_to_words`` and concatenated (train first) into one corpus. Only
    the word vectors (``model.wv``) are saved. After training, the nearest
    neighbours of a few probe words are printed as a sanity check.

    Args:
        train_path: Path to the training CSV file (with a header row).
        test_path: Path to the test CSV file (with a header row).
        output_path: Destination file for the trained word vectors.

    Returns:
        None.
    """
    # Final corpus: training reviews followed by test reviews. The totals are
    # the expected row counts, used only by the progress bars.
    X = _load_review_tokens(train_path, total=161297)
    X.extend(_load_review_tokens(test_path, total=54766))

    print(len(X))

    # Build and save fasttext model
    model = FastText(size=50, window=4, min_count=0, workers=4, sg=1)
    model.build_vocab(sentences=X)
    model.train(sentences=X, total_examples=len(X), epochs=30)
    model.wv.save(output_path)

    # Let's check out some results. Query the keyed vectors directly: the
    # model-level similar_by_word is deprecated in gensim 3.x.
    for word in ("oral", "drug", "sex", "surgery", "and"):
        results = model.wv.similar_by_word(word)
        print(results[:10])
        print("\n\n")