def _generate_word_embeddings(self, algo=EmbeddingsAlgorithm.WORD2VEC, use_morphs=False, min_count=2, dim=100):
     """Generates the word embeddings for the current language
     
     :param use_morphs: If true, will use the morphed corpus to generate embeddings. If false, will use the raw 
     corpus
     :param min_count: The minimum number of times a word must occur in order for it to be processed
     :param dim: The number of dimensions of the output vectors
     :return: The embeddings for the current languagego
     """
    _log.info('Learning word vectors...')
    if algo == EmbeddingsAlgorithm.WORD2VEC:
        if use_morphs:
            # Assumed intent: split the corpus into morphs before training, as in the fastText branch
            self._split_corpus_into_morphs()
        return Word2Vec(sentences=self._language_data, size=dim, min_count=min_count)
    elif algo == EmbeddingsAlgorithm.FASTTEXT:
        if use_morphs:
            self._split_corpus_into_morphs()
        self._save_language_data('fasttext_input.txt')
        return FastText.train('fastText/fasttext', self._language_dir + 'fasttext_input.txt',
                              output_file=self._language_dir + 'ft_model', size=dim, min_count=min_count)
    else:
        _log.error('Unknown algorithm %s' % algo)
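These examples all use gensim's wrapper around the external fastText binary (gensim.models.wrappers.FastText), whose train classmethod shells out to the compiled executable. A minimal standalone sketch of that call, with placeholder paths:

from gensim.models.wrappers import FastText

# Sketch only: 'fastText/fasttext' is the compiled binary, 'corpus.txt' is a
# plain-text corpus with one sentence per line, and output_file controls where
# the .bin/.vec model files are written.
model = FastText.train('fastText/fasttext', corpus_file='corpus.txt',
                       output_file='ft_model', size=100, min_count=2)
print(model.wv.most_similar('example'))  # query like any gensim word-vector model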
Example #2
import logging
import os

from gensim.models.wrappers import FastText as gensimFastText

# const is a project-local settings module providing the directory constants below
def fasttext_model_from_file2(file_path):
    save_file_name = os.path.join(const.GENERATED_DATA_DIR, const.FASTTEXT_PREFIX + file_path.split('/')[-1])
    try:
        # Reuse a previously trained model if its .bin file is already on disk
        model = gensimFastText.load_fasttext_format(save_file_name + '.bin', encoding='utf-8')
        logging.info('model loaded:' + save_file_name)
    except FileNotFoundError:
        fasttext_bin_path = os.path.join(const.ROOT_DIR, 'fasttext/fastText')
        # Pass output_file so the .bin is written and the cache branch above works next time
        model = gensimFastText.train(fasttext_bin_path, file_path, output_file=save_file_name, min_count=1)
    return model.wv
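A hypothetical call (the corpus path is a placeholder); the first run trains and caches the .bin, later runs load it:

wv = fasttext_model_from_file2('data/tags_corpus.txt')
print(wv.most_similar('word'))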
Example #3
import re

import numpy as np
from gensim.models.wrappers import FastText

def printvec(train_path, vec_path):
    # 1. Create a preprocessed file (lowercasing etc.)
    print('\nPreprocessing training data...')
    tmp_path = train_path[:-4] + '_cleaned.txt'
    with open(train_path) as f_in:
        with open(tmp_path, 'w') as f_out:
            for line in f_in:
                text = line.lower()
                text = re.sub(r"[^a-z ]", "", text)
                text = re.sub(r"[ ]+", " ", text)
                f_out.write(text + '\n')  # re-append the newline stripped by the regex above

    train_path = tmp_path

    # 2. Build the vocabulary
    print('\nMake dic...')
    s = set()
    with open(train_path) as f:
        for line in f:
            text = line.lower()
            text = text.replace("\n", " ").replace('\r', '')
            text = re.sub(r"[ ]+", " ", text)
            text_list = text.split(" ")
            tmp_set = set(text_list)
            s.update(tmp_set)

    words = sorted(list(s))
    len_words = len(words)
    word_indices = dict((c, i + 1) for i, c in enumerate(words))
    indices_word = dict((i + 1, c) for i, c in enumerate(words))
    # Note that index 0 is reserved for padding and is never used

    # 3. Train fastText (vec_size and today_str are module-level globals)
    myft_path = '/home/tamaki/M1/Keras/mine2017_8to11/fastText/fasttext'
    ft_model = FastText.train(ft_path=myft_path, corpus_file=train_path, size=vec_size, window=5, min_count=0)
    ft_model.save(today_str + 'ft.model')

    # 4. Write the vectors to a file
    with open(vec_path, 'w') as file:
        for i in range(1, len_words + 1):  # word indices start at 1; 0 is the padding slot
            word = indices_word[i]
            if word in ft_model.wv.vocab:
                vec = ft_model[word]
            else:
                vec = np.zeros((vec_size), dtype=np.float32)
            output = word + ' > ' + str(vec) + '\n'
            file.write(output)

    # 5. Reset the model
    ft_model.reset_weights()
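A hypothetical invocation; vec_size and today_str are the module-level globals the function relies on (compare Example #4 below), and both paths are placeholders:

vec_size = 100
today_str = '20171101_'  # placeholder prefix for the saved model name
printvec('corpus.txt', 'vectors.txt')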
Example #4
import numpy as np
from gensim.models.wrappers import FastText

# Return the vocabulary ID for a word (word_indices is built elsewhere in this script)
def search_word_indices(word):
    if word in word_indices:
        return word_indices[word]
    else:
        return word_indices["#OTHER"]



# Train fastText
vec_size=100

print('Learning fasttext...')

myft_path='/home/tamaki/M1/Keras/mine2017_8to11/fastText/fasttext'
ft_model = FastText.train(ft_path=myft_path, corpus_file=train_path, size=vec_size, window=5, min_count=0)
ft_model.save(today_str+'ft.model')
# fastText supports two training modes, cbow and skipgram; the default is cbow

print_time('FastText end')


# Get the fastText vector for a word
# Unknown words currently fall back to a zero vector [0, 0, ..., 0]
# Unknown words are also collected in a set and written to a file later
# Needs improvement
KeyError_set=set()
def get_ft_vec(word):
    if word in ft_model.wv.vocab:
        return ft_model[word]
    else:
        # Assumed completion, per the comments above: record the OOV word and
        # fall back to a zero vector
        KeyError_set.add(word)
        return np.zeros((vec_size), dtype=np.float32)
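The comments above say the words gathered in KeyError_set are written out later; a minimal sketch of that step, with a placeholder output path:

# Hypothetical follow-up: dump the out-of-vocabulary words gathered during lookups
with open('unknown_words.txt', 'w') as f_unk:
    for w in sorted(KeyError_set):
        f_unk.write(w + '\n')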
Example #6
from gensim.models import Word2Vec
from gensim.models.wrappers import FastText

# Remove duplicate tags per song
clean_dataset = clean_dataset.apply(
    lambda x: ".".join(set(x.split("."))))

temp = [d.split(".") for d in clean_dataset]
sentences = [item.split(" ") for sublist in temp for item in sublist]
docs = [d.replace(".", " ") for d in clean_dataset]
fasttext = " ".join(clean_dataset).replace(".", " ")

with open('datasets/fasttext', 'w') as file:
    file.write(fasttext)

wv_model = Word2Vec(sentences, window=5, min_count=1, workers=4, batch_words=200, sg=1)
ft_model = FastText.train(
    "../fastText/fasttext",
    corpus_file="datasets/fasttext", model="skipgram", min_count=1)

'''
FAST TEXT
'''

for name, model in {"wv": wv_model, "ft": ft_model}.items():
    print(name)
    threshold = 0.90
    counts = {}
    n_counts = {}
    n_neighs = {}
    neighbours = {}

    # Build up neighbours lists
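    # The example is truncated here. A hypothetical continuation under the stated
    # intent (thresholded neighbour lists), using gensim's most_similar; the topn
    # value and the exact use of the bookkeeping dicts are guesses:
    for word in model.wv.vocab:
        neighs = [w for w, sim in model.wv.most_similar(word, topn=10) if sim >= threshold]
        neighbours[word] = neighs
        n_neighs[word] = len(neighs)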
Example #7
import gensim
import os
import logging
import itertools

from gensim.models.word2vec import Text8Corpus
from gensim.models.wrappers import FastText

MODEL_FILE = './phonmodels/model4'
TEXT8_FILE = './fil9_phon'
QUIZ_FILE = './questions-words-phon.txt'

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

if os.path.isfile(MODEL_FILE):
    model = FastText.load(MODEL_FILE)

else:
    corpus = Text8Corpus(TEXT8_FILE)
    # TODO: increase size and window *separately*
    model = FastText.train('./fasttext', corpus_file=TEXT8_FILE, size=300, window=10)
    model.save(MODEL_FILE)

model.accuracy(QUIZ_FILE)
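model.accuracy returns one dict per quiz section with 'correct' and 'incorrect' lists; a short sketch for summarizing them (this summary is an addition, not part of the original script):

# Hypothetical summary of the analogy-quiz results
sections = model.accuracy(QUIZ_FILE)
for section in sections:
    n_ok, n_bad = len(section['correct']), len(section['incorrect'])
    if n_ok + n_bad:
        print('%s: %d/%d correct' % (section['section'], n_ok, n_ok + n_bad))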