def _generate_word_embeddings(self, algo=EmbeddingsAlgorithm.WORD2VEC, use_morphs=False, min_count=2, dim=100):
     """Generates the word embeddings for the current language
     
     :param use_morphs: If true, will use the morphed corpus to generate embeddings. If false, will use the raw 
     corpus
     :param min_count: The minimum number of times a word must occur in order for it to be processed
     :param dim: The number of dimensions of the output vectors
     :return: The embeddings for the current languagego
     """
    _log.info('Learning word vectors...')
    if algo == EmbeddingsAlgorithm.WORD2VEC:
        if use_morphs:
            # Assumed intent: split the corpus into morphs before training, as in the fastText branch
            self._split_corpus_into_morphs()
        return Word2Vec(sentences=self._language_data, size=dim, min_count=min_count)
    elif algo == EmbeddingsAlgorithm.FASTTEXT:
        if use_morphs:
            self._split_corpus_into_morphs()
        self._save_language_data('fasttext_input.txt')
        return FastText.train('fastText/fasttext', self._language_dir + 'fasttext_input.txt',
                              output_file=self._language_dir + 'ft_model', size=dim, min_count=min_count)
    else:
        _log.error('Unknown algorithm %s' % algo)
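These examples all use gensim's wrapper around the external fastText binary (gensim.models.wrappers.FastText), whose train classmethod shells out to the compiled executable. A minimal standalone sketch of that call, with placeholder paths:

from gensim.models.wrappers import FastText

# Sketch only: 'fastText/fasttext' is the compiled binary, 'corpus.txt' is a
# plain-text corpus with one sentence per line, and output_file controls where
# the .bin/.vec model files are written.
model = FastText.train('fastText/fasttext', corpus_file='corpus.txt',
                       output_file='ft_model', size=100, min_count=2)
print(model.wv.most_similar('example'))  # query like any gensim word-vector model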
Example #2
import logging
import os

from gensim.models.wrappers import FastText as gensimFastText

# const is a project-local settings module providing the directory constants below
def fasttext_model_from_file2(file_path):
    save_file_name = os.path.join(const.GENERATED_DATA_DIR, const.FASTTEXT_PREFIX + file_path.split('/')[-1])
    try:
        # Reuse a previously trained model if its .bin file is already on disk
        model = gensimFastText.load_fasttext_format(save_file_name + '.bin', encoding='utf-8')
        logging.info('model loaded:' + save_file_name)
    except FileNotFoundError:
        fasttext_bin_path = os.path.join(const.ROOT_DIR, 'fasttext/fastText')
        # Pass output_file so the .bin is written and the cache branch above works next time
        model = gensimFastText.train(fasttext_bin_path, file_path, output_file=save_file_name, min_count=1)
    return model.wv
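A hypothetical call (the corpus path is a placeholder); the first run trains and caches the .bin, later runs load it:

wv = fasttext_model_from_file2('data/tags_corpus.txt')
print(wv.most_similar('word'))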
Example #3
import re

import numpy as np
from gensim.models.wrappers import FastText

def printvec(train_path, vec_path):
    # 1. Create a preprocessed file (lowercasing etc.)
    print('\nPreprocessing training data...')
    tmp_path = train_path[:-4] + '_cleaned.txt'
    with open(train_path) as f_in:
        with open(tmp_path, 'w') as f_out:
            for line in f_in:
                text = line.lower()
                text = re.sub(r"[^a-z ]", "", text)
                text = re.sub(r"[ ]+", " ", text)
                f_out.write(text + '\n')  # re-append the newline stripped by the regex above

    train_path = tmp_path

    # 2. Build the vocabulary
    print('\nMake dic...')
    s = set()
    with open(train_path) as f:
        for line in f:
            text = line.lower()
            text = text.replace("\n", " ").replace('\r', '')
            text = re.sub(r"[ ]+", " ", text)
            text_list = text.split(" ")
            tmp_set = set(text_list)
            s.update(tmp_set)

    words = sorted(list(s))
    len_words = len(words)
    word_indices = dict((c, i + 1) for i, c in enumerate(words))
    indices_word = dict((i + 1, c) for i, c in enumerate(words))
    # Note that index 0 is reserved for padding and is never used

    # 3. Train fastText (vec_size and today_str are module-level globals)
    myft_path = '/home/tamaki/M1/Keras/mine2017_8to11/fastText/fasttext'
    ft_model = FastText.train(ft_path=myft_path, corpus_file=train_path, size=vec_size, window=5, min_count=0)
    ft_model.save(today_str + 'ft.model')

    # 4. Write the vectors to a file
    with open(vec_path, 'w') as file:
        for i in range(1, len_words + 1):  # word indices start at 1; 0 is the padding slot
            word = indices_word[i]
            if word in ft_model.wv.vocab:
                vec = ft_model[word]
            else:
                vec = np.zeros((vec_size), dtype=np.float32)
            output = word + ' > ' + str(vec) + '\n'
            file.write(output)

    # 5. Reset the model
    ft_model.reset_weights()
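A hypothetical invocation; vec_size and today_str are the module-level globals the function relies on (compare Example #4 below), and both paths are placeholders:

vec_size = 100
today_str = '20171101_'  # placeholder prefix for the saved model name
printvec('corpus.txt', 'vectors.txt')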
Example #4
import numpy as np
from gensim.models.wrappers import FastText

# Return the vocabulary ID for a word (word_indices is built elsewhere in this script)
def search_word_indices(word):
    if word in word_indices:
        return word_indices[word]
    else:
        return word_indices["#OTHER"]



# Train fastText
vec_size=100

print('Learning fasttext...')

myft_path='/home/tamaki/M1/Keras/mine2017_8to11/fastText/fasttext'
ft_model = FastText.train(ft_path=myft_path, corpus_file=train_path, size=vec_size, window=5, min_count=0)
ft_model.save(today_str+'ft.model')
# fastText supports two training modes, cbow and skipgram; the default is cbow

print_time('FastText end')


# Get the fastText vector for a word
# Unknown words currently fall back to a zero vector [0, 0, ..., 0]
# Unknown words are also collected in a set and written to a file later
# Needs improvement
KeyError_set=set()
def get_ft_vec(word):
    if word in ft_model.wv.vocab:
        return ft_model[word]
    else:
        # Assumed completion, per the comments above: record the OOV word and
        # fall back to a zero vector
        KeyError_set.add(word)
        return np.zeros((vec_size), dtype=np.float32)
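The comments above say the words gathered in KeyError_set are written out later; a minimal sketch of that step, with a placeholder output path:

# Hypothetical follow-up: dump the out-of-vocabulary words gathered during lookups
with open('unknown_words.txt', 'w') as f_unk:
    for w in sorted(KeyError_set):
        f_unk.write(w + '\n')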
Example #6
from gensim.models import Word2Vec
from gensim.models.wrappers import FastText

# Remove duplicate tags per song
clean_dataset = clean_dataset.apply(
    lambda x: ".".join(set(x.split("."))))

temp = [d.split(".") for d in clean_dataset]
sentences = [item.split(" ") for sublist in temp for item in sublist]
docs = [d.replace(".", " ") for d in clean_dataset]
fasttext = " ".join(clean_dataset).replace(".", " ")

with open('datasets/fasttext', 'w') as file:
    file.write(fasttext)

wv_model = Word2Vec(sentences, window=5, min_count=1, workers=4, batch_words=200, sg=1)
ft_model = FastText.train(
    "../fastText/fasttext",
    corpus_file="datasets/fasttext", model="skipgram", min_count=1)

'''
FAST TEXT
'''

for name, model in {"wv": wv_model, "ft": ft_model}.items():
    print(name)
    threshold = 0.90
    counts = {}
    n_counts = {}
    n_neighs = {}
    neighbours = {}

    # Build up neighbours lists
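    # The example is truncated here. A hypothetical continuation under the stated
    # intent (thresholded neighbour lists), using gensim's most_similar; the topn
    # value and the exact use of the bookkeeping dicts are guesses:
    for word in model.wv.vocab:
        neighs = [w for w, sim in model.wv.most_similar(word, topn=10) if sim >= threshold]
        neighbours[word] = neighs
        n_neighs[word] = len(neighs)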
Example #7
import gensim
import os
import logging
import itertools

from gensim.models.word2vec import Text8Corpus
from gensim.models.wrappers import FastText

MODEL_FILE = './phonmodels/model4'
TEXT8_FILE = './fil9_phon'
QUIZ_FILE = './questions-words-phon.txt'

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

if os.path.isfile(MODEL_FILE):
    model = FastText.load(MODEL_FILE)

else:
    corpus = Text8Corpus(TEXT8_FILE)
    # TODO: increase size and window *separately*
    model = FastText.train('./fasttext', corpus_file=TEXT8_FILE, size=300, window=10)
    model.save(MODEL_FILE)

model.accuracy(QUIZ_FILE)
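model.accuracy returns one dict per quiz section with 'correct' and 'incorrect' lists; a short sketch for summarizing them (this summary is an addition, not part of the original script):

# Hypothetical summary of the analogy-quiz results
sections = model.accuracy(QUIZ_FILE)
for section in sections:
    n_ok, n_bad = len(section['correct']), len(section['incorrect'])
    if n_ok + n_bad:
        print('%s: %d/%d correct' % (section['section'], n_ok, n_ok + n_bad))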