def _generate_word_embeddings(self, algo=EmbeddingsAlgorithm.WORD2VEC, use_morphs=False, min_count=2, dim=100):
    """Generates the word embeddings for the current language

    :param algo: The EmbeddingsAlgorithm member selecting the trainer
        (WORD2VEC or FASTTEXT)
    :param use_morphs: If true, the corpus is split into morphs before it is
        written out for fastText. Word2Vec currently trains on
        ``self._language_data`` regardless of this flag.
    :param min_count: The minimum number of times a word must occur in order
        for it to be processed
    :param dim: The number of dimensions of the output vectors
    :return: The embeddings for the current language, or None (after logging
        an error) when ``algo`` is not a known algorithm
    """
    _log.info('Learning word vectors...')

    if algo == EmbeddingsAlgorithm.WORD2VEC:
        # NOTE(review): both use_morphs branches were identical here, so the
        # flag has no effect for Word2Vec -- confirm whether the corpus should
        # be morph-split first, as it is for fastText.
        return Word2Vec(sentences=self._language_data, size=dim, min_count=min_count)

    if algo == EmbeddingsAlgorithm.FASTTEXT:
        # One name for both the save and the training input: the morph branch
        # used to save to 'fastTest_input.txt' while training read
        # 'fasttext_input.txt', so the morph-split corpus was never used.
        # Presumably _save_language_data writes under self._language_dir --
        # verify against its definition.
        input_name = 'fasttext_input.txt'
        if use_morphs:
            self._split_corpus_into_morphs()
        self._save_language_data(input_name)
        return FastText.train(
            'fastText/fasttext',
            self._language_dir + input_name,
            output_file=self._language_dir + 'ft_model',
            size=dim,
            min_count=min_count,
        )

    _log.error('Unknown algorithm %s', algo)
def fasttext_model_from_file2(file_path):
    """Return the word vectors of a fastText model for the corpus at *file_path*.

    A pre-trained binary named ``<FASTTEXT_PREFIX><corpus file name>.bin`` is
    looked up in ``const.GENERATED_DATA_DIR``. If that file does not exist, a
    new model is trained on *file_path* with the fastText executable under
    ``const.ROOT_DIR`` (``min_count=1``).

    :param file_path: '/'-separated path to the training corpus
    :return: the ``wv`` attribute of the loaded or freshly trained model
    """
    corpus_name = file_path.split('/')[-1]
    save_file_name = os.path.join(const.GENERATED_DATA_DIR, const.FASTTEXT_PREFIX + corpus_name)

    try:
        model = gensimFastText.load_fasttext_format(save_file_name + '.bin', encoding='utf-8')
    except FileNotFoundError:
        # No cached binary: train from the raw corpus instead.
        binary_path = os.path.join(const.ROOT_DIR, 'fasttext/fastText')
        model = gensimFastText.train(binary_path, file_path, min_count=1)
    else:
        logging.info('model loaded:' + save_file_name)

    return model.wv
def printvec(train_path, vec_path):
    """Train fastText on a cleaned copy of a corpus and dump one vector per word.

    Steps:
      1. Write a lower-cased copy of ``train_path`` that contains only the
         characters a-z and single spaces (``<train_path minus its last four
         characters>_cleaned.txt``).
      2. Collect the sorted vocabulary of that cleaned file.
      3. Train a fastText model on the cleaned file and save it as
         ``today_str + 'ft.model'``.
      4. Write ``<word> > <vector>`` lines to ``vec_path`` for every
         vocabulary word; words missing from the model get a zero vector.
      5. Reset the model weights.

    Relies on the module-level names ``vec_size`` and ``today_str``.

    :param train_path: path of the raw training text; the last four
        characters are dropped to build the cleaned file name (assumes an
        extension such as ".txt")
    :param vec_path: path of the vector file to write
    """
    # 1. Create a pre-processed (lower-cased, letters and spaces only) file.
    print('\nPreprpcessing training data...')
    tmp_path = train_path[:-4] + '_cleaned.txt'
    with open(train_path) as f_in, open(tmp_path, 'w') as f_out:
        for line in f_in:
            text = line.lower()
            text = re.sub(r"[^a-z ]", "", text)
            text = re.sub(r"[ ]+", " ", text)
            # The first regex also strips the line terminator; write it back
            # so the last word of one line is not fused with the first word
            # of the next line.
            f_out.write(text + '\n')
    train_path = tmp_path

    # 2. Build the vocabulary.
    print('\nMake dic...')
    vocab = set()
    with open(train_path) as f:
        for line in f:
            # The cleaned file holds only a-z, spaces and newlines, so a plain
            # whitespace split yields the words and drops empty tokens.
            vocab.update(line.split())
    words = sorted(vocab)

    # 3. Train fastText.
    myft_path = '/home/tamaki/M1/Keras/mine2017_8to11/fastText/fasttext'
    ft_model = FastText.train(ft_path=myft_path, corpus_file=train_path, size=vec_size, window=5, min_count=0)
    ft_model.save(today_str + 'ft.model')

    # 4. Write the vectors to a file. Every vocabulary word is written; the
    # previous index loop stopped one short and never emitted the last word.
    with open(vec_path, 'w') as file:
        for word in words:
            if word in ft_model.wv.vocab:
                vec = ft_model[word]
            else:
                vec = np.zeros((vec_size), dtype=np.float32)
            file.write(word + ' > ' + str(vec) + '\n')

    # 5. Reset the model.
    ft_model.reset_weights()
#単語から辞書IDを返す def search_word_indices(word): if word in word_indices: return word_indices[word] else: return word_indices["#OTHER"] #fasttextの学習 vec_size=100 print('Learning fasttext...') myft_path='/home/tamaki/M1/Keras/mine2017_8to11/fastText/fasttext' ft_model = FastText.train(ft_path=myft_path, corpus_file=train_path, size=vec_size, window=5, min_count=0) ft_model.save(today_str+'ft.model') # FastTextはcbowとskipgramの二つの学習方法があるがデフォルトではcbow print_time('FastText end') #word2vecのベクトルを得る #未知語の場合には[0,0,0, ... ,0]みたいなやつにとりあえずしてる #未知語は集合に格納し,あとでファイル出力 #要改良 KeyError_set=set() def get_ft_vec(word): if word in ft_model.wv.vocab: return ft_model[word] else:
import argparse
# Removing duplicate tags per song clean_dataset = clean_dataset.apply( lambda x: ".".join(set(x.split(".")))) temp = [d.split(".") for d in clean_dataset] sentences = [item.split(" ") for sublist in temp for item in sublist] docs = [d.replace(".", " ") for d in clean_dataset] fasttext = " ".join(clean_dataset).replace(".", " ") with open('datasets/fasttext', 'w') as file: file.write(fasttext) wv_model = Word2Vec(sentences, window=5, min_count=1, workers=4, batch_words=200, sg=1) ft_model = FastText.train( "../fastText/fasttext", corpus_file="datasets/fasttext", model="skipgram", min_count=1) ''' FAST TEXT ''' for name, model in {"wv": wv_model, "ft": ft_model}.items(): print(name) threshold = 0.90 counts = {} n_counts = {} n_neighs = {} neighbours = {} # Build up neighbours lists
# Load a cached fastText model if one exists, otherwise train and cache one,
# then run the analogy-quiz accuracy evaluation on it.
import itertools
import logging
import os

import gensim
from gensim.models.word2vec import Text8Corpus
from gensim.models.wrappers import FastText

MODEL_FILE = './phonmodels/model4'
TEXT8_FILE = './fil9_phon'
QUIZ_FILE = './questions-words-phon.txt'

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

if not os.path.isfile(MODEL_FILE):
    # No cached model yet: train one with the local fastText binary and
    # store it for later runs.
    corpus = Text8Corpus(TEXT8_FILE)
    # TODO: increase size and window *separately*
    model = FastText.train(
        './fasttext',
        corpus_file=TEXT8_FILE,
        size=300,
        window=10,
    )
    model.save(MODEL_FILE)
else:
    model = FastText.load(MODEL_FILE)

# Evaluate whichever model was obtained above.
model.accuracy(QUIZ_FILE)