def classType_fasttext_train(self, classType):
    train_sentences = []
    for word in self.train:
        sentence = []
        mappings = self.word_mapping[word]
        for mapping in mappings:
            if mapping == classType:
                sentence.append(word)
        if len(sentence) > 0:
            train_sentences.append(sentence)
    feature_encoder = FastText(size=50, window=2, min_count=1, min_n=2, max_n=6)
    feature_encoder.build_vocab(sentences=train_sentences)
    feature_encoder.train(sentences=train_sentences,
                          total_examples=feature_encoder.corpus_count,
                          epochs=1000)
    feature_encoder.save('./models/' + classType + '_fasttext.model')
    if classType == 'company':
        self.company_feature_encoder = feature_encoder
    elif classType == 'location':
        self.location_feature_encoder = feature_encoder
    elif classType == 'goods':
        self.goods_feature_encoder = feature_encoder
    else:
        raise Exception('Allowed arguments are company, location and goods')
def fasttext_model(sentences, size=100, min_count=5, negative=5, window=5,
                   cbow=True, iterations=5, seed=0, workers=1):
    """
    :param sentences: list of tokenized sentences to train on
    :param size: dimensionality of the embedding vectors
    :param min_count: minimum word frequency for inclusion in the vocabulary
    :param window: maximum distance between the current and predicted word within a sentence
    :param negative: size of negative sampling
    :param cbow: boolean to determine the training type; True is for CBOW; False is for Skip-gram
    :param iterations: number of training epochs
    :param seed: seed for the random number generator
    :param workers: number of worker threads used to train the model
    :return: the trained FastText model
    """
    if cbow is True:
        skip = 0
    else:
        skip = 1
    model = FastText(size=size, window=window, min_count=min_count, workers=workers,
                     sg=skip, negative=negative, seed=seed)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=iterations)
    return model
def main():
    start = time.time()

    # CLI arguments
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--corpus_file", type=str, required=True,
                        help="Path to the corpus .txt file.")
    parser.add_argument("--window", type=int, default=5, help="Window size. default=5")
    parser.add_argument("--vector_size", type=int, default=200,
                        help="Vector dimensions. default=200")
    parser.add_argument("--min_count", type=int, default=1,
                        help="The model ignores all words with total frequency lower than this. default=1")
    parser.add_argument("--method", type=str, required=True, help="[sg|cbow]")
    parser.add_argument("--output_file", type=str,
                        default=r'../Models/FastText_model/FastText_combined_corp_model.wordvectors',
                        help="Path to the output .wordvectors file.")
    args = parser.parse_args()

    # set method
    method = 1
    if args.method == "sg":
        method = 1
    elif args.method == "cbow":
        method = 0
    else:
        print("method not supported!")
        return

    # set up model
    model = FastText(vector_size=args.vector_size, window=args.window,
                     min_count=args.min_count, sg=method)
    model.build_vocab(corpus_file=args.corpus_file)
    print("Built vocab. Starting train...")

    # train
    model.train(corpus_file=args.corpus_file,
                total_words=model.corpus_total_words, epochs=5)
    model.wv.save(args.output_file)
    print("Done training fasttext in %.2f min" % ((time.time() - start) / 60))
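# Hedged usage sketch (assumption, not from the original source): the script name
# below is hypothetical; --corpus_file and --method are the only required arguments.
#   python train_fasttext_cli.py --corpus_file corpus.txt --method sg --vector_size 200 --window 5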
def train_fasttext(sentences, embedding_size=100, window=5, sg=1, hs=0, min_ct=2,
                   min_n=1, max_n=4, ns_exponent=0.75, negative=15, epoch=50,
                   sample_t=1e-5):
    start_time = time.time()
    ftmodel = FastText(size=embedding_size, window=window, sg=sg, hs=hs,
                       negative=negative, sample=sample_t, ns_exponent=ns_exponent,
                       min_n=min_n, max_n=max_n, min_count=min_ct, workers=12, seed=7)
    ftmodel.build_vocab(sentences=sentences)
    ftmodel.train(sentences=sentences, epochs=epoch,
                  total_examples=ftmodel.corpus_count)
    training_time = time.time() - start_time
    print("%d seconds used to train this model" % training_time)
    return ftmodel
def fasttext_model(sentences, size=100, min_count=5, negative=5, window=5,
                   cbow=True, iterations=5, seed=0, workers=1):
    """
    Returns: the trained model
    """
    if cbow is True:
        sg = 0
    else:
        sg = 1
    model = FastText(size=size, window=window, min_count=min_count, workers=workers,
                     sg=sg, negative=negative, seed=seed)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=iterations)
    return model
def train_vector_model(train_data_list, train):
    if train:
        str_buf = train_data_list['encode']
        joinString = ' '.join(str_buf)
        pos1 = kiwi_f.k_pos(joinString)
        pos2 = ' '.join(
            list(map(lambda x: '\n' if x[1] in ['Punctuation'] else x[0], pos1))).split('\n')
        morphs = list(map(lambda x: kiwi_f.k_morphs(x), pos2))
        print("BUILD MODEL")
        model = FastText(
            size=300,
            window=3,
            workers=8,
            min_count=1,
            sg=1,  # the skip-gram model is known to perform better here
            iter=1000)
        model.build_vocab(morphs)
        print("BUILD COMPLETE")
        print("TRAIN START")
        model.train(morphs, total_examples=model.corpus_count,
                    epochs=model.epochs, compute_loss=True)
        if not os.path.exists(path.FASTTEXT_DIR):
            os.makedirs(path.FASTTEXT_DIR)
        model.save(path.model_path + 'model_test')
        print("TRAIN COMPLETE")
        return model
    else:
        return FastText.load(path.model_path + 'model_test')
def train_vector_model(train_data_list, mode):
    if mode == 'train':
        mecab = Okt()
        str_buf = train_data_list['encode']
        joinString = ' '.join(str_buf)
        pos1 = mecab.pos(joinString)
        pos2 = ' '.join(
            list(map(lambda x: '\n' if x[1] in ['Punctuation'] else x[0], pos1))).split('\n')
        morphs = list(map(lambda x: mecab.morphs(x), pos2))
        print("BUILD MODEL")
        model = FastText(size=vector_size, window=3, workers=8, min_count=1, sg=1, iter=1000)
        model.build_vocab(morphs)
        print("BUILD COMPLETE")
        print("TRAIN START")
        model.train(morphs, total_examples=model.corpus_count,
                    epochs=model.epochs, compute_loss=True)
        if not os.path.exists('./fasttext'):
            os.makedirs('./fasttext')
        model.save('./fasttext/model')
        print("TRAIN COMPLETE")
        return model
    else:
        return FastText.load('./fasttext/model')
def make_fasttext(target_dataset):
    corpus_path = os.path.join(CONFIG.DATASET_PATH, target_dataset, "corpus.txt")
    sentences = word2vec.LineSentence(corpus_path)
    dimension_size = 300
    print("embedding started")
    embedding_model = FastText(size=dimension_size, window=6, min_count=5,
                               workers=4, sg=1)  # skip-gram
    embedding_model.build_vocab(sentences=sentences)
    embedding_model.train(sentences=sentences,
                          total_examples=embedding_model.corpus_count, epochs=10)
    model_name = "FASTTEXT_" + target_dataset + ".model"
    # pad_value = np.finfo(np.float32).eps
    pad_value = 1.
    embedding_model.wv.add("<PAD>", np.full(embedding_model.vector_size, pad_value),
                           replace=True)
    embedding_model.wv.init_sims(replace=True)
    embedding_model.wv.save(os.path.join(CONFIG.EMBEDDING_PATH, model_name))
    print("embedding completed")
def train_vector_model(datas, train):
    path = configs.fasttext_path
    if train:
        mecab = Okt()
        str_buf = datas['encode']
        joinString = ' '.join(str_buf)
        pos1 = mecab.pos(joinString)
        pos2 = ' '.join(
            list(map(lambda x: '\n' if x[1] in ['Punctuation'] else x[0], pos1))).split('\n')
        morphs = list(map(lambda x: mecab.morphs(x), pos2))
        print("BUILD MODEL")
        model = FastText(size=vector_size, window=3, workers=8, min_count=2, sg=1, iter=1500)
        model.build_vocab(morphs)
        print("BUILD COMPLETE")
        print("TRAIN START")
        model.train(morphs, total_examples=model.corpus_count,
                    epochs=model.epochs, compute_loss=True)
        if not os.path.exists(path):
            os.makedirs(path)
        model.save(path + 'model_v2')
        print("TRAIN COMPLETE")
        return model
    else:
        print("LOAD SAVED MODEL")
        return FastText.load(path + 'model_v2')
def run(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform a dataframe with missing values into a DF of aggregated embeddings.
    The final embedding for each sample is the mean of the embedding vectors of all
    its non-missing values.
    :param df: DF with missing data
    :return: embeddings DF
    """
    # encode features to strings
    df_intrv = utilities.intervals(df, inplace=False)
    # concatenate values into a sequence
    seq_df = utilities.concat_columns(df_intrv)
    # min_n=3, max_n=2 (min_n > max_n) disables character n-grams
    size = df.shape[1]
    emb_size = 50
    window = 30
    if size < 60:
        emb_size = int(size - (1 + ((size // 10) ** 1.5 + (size // 10))))
        window = int(size - ((size // 10) ** 2 + (size // 10)))
    model = FastText(size=emb_size, window=window, min_count=1, workers=-1,
                     min_n=3, max_n=2)  # instantiate
    model.build_vocab(sentences=seq_df)
    model.train(sentences=seq_df, total_examples=len(seq_df), epochs=50)
    # at high missingness levels some rows contain no values;
    # impute the most frequent value in that case
    idxmax = df_intrv.apply(lambda col: col.value_counts()[0], axis=0).idxmax()
    idxmax = df_intrv[idxmax].value_counts().idxmax()
    seq_df = pd.Series([x if len(x) > 0 else [idxmax] for x in seq_df])
    # get embedding sequences
    emb = seq_df.apply(lambda row: model.wv[row].mean(axis=0).tolist())
    # create a DF with embedding dimensions as columns
    emb_df = pd.DataFrame(np.row_stack(emb))
    emb_df.columns = ['x' + str(x) for x in emb_df.columns]
    return emb_df
def fetch_emb(lenWindow, minOccur, emb_path, vocab_path, RESET):
    if not os.path.exists(emb_path) or RESET:
        with open('../data/cub/text_trainvalclasses.txt', 'r') as file:
            text = file.read()
        sentences = sent_tokenize(text)
        texts = []
        for i, line in enumerate(sentences):
            words = word_tokenize(line)
            texts.append(words)
        model = FastText(size=300, window=lenWindow, min_count=minOccur)
        model.build_vocab(sentences=texts)
        model.train(sentences=texts, total_examples=len(texts), epochs=10)
        with open(vocab_path, 'rb') as file:
            vocab = json.load(file)
        i2w = vocab['i2w']
        base = np.ones((300,), dtype=np.float32)
        emb = [base * (i - 1) for i in range(3)]
        for word in list(i2w.values())[3:]:
            emb.append(model.wv[word])
        emb = np.array(emb)
        with open(emb_path, 'wb') as file:
            pickle.dump(emb, file)
    else:
        with open(emb_path, 'rb') as file:
            emb = pickle.load(file)
    return emb
def train_fasttext(sentences, embedding_size=100, window=5, sg=1, hs=0, min_ct=2,
                   min_n=2, max_n=4, ns_exponent=0.75, negative=15, epoch=50,
                   sample=1e-5):
    ftmodel = FastText(size=embedding_size, window=window, sg=sg, hs=hs,
                       negative=negative, sample=sample, ns_exponent=ns_exponent,
                       min_n=min_n, max_n=max_n, min_count=min_ct, workers=8, seed=7)
    ftmodel.build_vocab(sentences=sentences)
    ftmodel.train(sentences=sentences, epochs=epoch,
                  total_examples=ftmodel.corpus_count)
    return ftmodel
def main(): """ script to training fastText word embedding model """ parser = argparse.ArgumentParser(description='') parser.add_argument('-i', '--input-file', required=False, default=config.DATA_FILE, help='input data file for training') parser.add_argument('-m', '--model-file', required=False, default=config.MODEL_FILE, help='model output name') parser.add_argument('-s', '--embedding-size', required=False, type=int, default=config.MODEL_FILE, help='model output name') args = parser.parse_args() model = FastText(size=args.embedding_size, sg=1) model.build_vocab(corpus_file=args.input_file) total_words = model.corpus_total_words model.train(corpus_file=args.input_file, total_words=total_words, epochs=5) model.save(args.model_file)
def loadDevblogModel(self, embedding_dim, epochs, window, min_count):
    """
    Train a FastText word embedding model on the Devblog data
    - input : embedding_dim / int / dimensionality of the word vectors
            : epochs / int / number of training epochs
            : window / int / n-gram window used for training
            : min_count / int / minimum number of occurrences for a word to be used
    - return : we_model
    """
    model = None
    if not os.path.isfile(CONST.devblog_model_path):
        print('🐈 No trained word embedding model found.')
        dc = Document()
        docs = dc.getDocs(labeled_only=False)  # fetch the full dataset
        print('🐈 Starting word embedding model training.')
        sentences = docs.text.apply(lambda x: [han2Jamo(s) for s in x.split(' ')])
        model = FastText(size=embedding_dim, window=window, min_count=min_count)
        model.build_vocab(sentences=sentences)
        model.train(sentences=sentences, total_examples=len(sentences), epochs=epochs)
        print('🐈 Saving the word embedding model.')
        model.save(CONST.devblog_model_path)
    else:
        model = FastText.load(CONST.devblog_model_path)
    return model
def getWordVec(corpus: list, type=1) -> object:
    '''
    Args:
        corpus: list[list[str]], each sublist indicates a sentence
        type: 1 = word2vec, 2 = fasttext
    '''
    coherenceMetric = Callback(CoherenceMetric())
    convergenceMetric = Callback(ConvergenceMetric())
    diffMetric = Callback(DiffMetric())
    if type == 1:
        model = Word2Vec(corpus)
        model.save('word2vec.model')
        return model
    else:
        model = FastText(min_count=1)
        logging.info('Starting building vocabulary table')
        model.build_vocab(corpus)
        logging.info('Starting training')
        model.train(corpus, total_examples=model.corpus_count,
                    epochs=model.epochs, callbacks=[epochLogger])
        model.save('FastText.model')
        return model
def recipe_to_ingre2vec(filepath='../recipe_data/'):
    recipe_sentences = []  # documents read from the csv files
    recipe_folder = os.listdir(filepath)  # list of files (folders) inside recipe_data
    for i, folder in enumerate(recipe_folder):
        csv_filepath = os.listdir(filepath + folder)  # list of csv files in that folder
        for j, csv_file in enumerate(csv_filepath):
            fi = open(filepath + folder + '/' + csv_file, 'rt', encoding='UTF8')
            rdr = csv.reader(fi)
            for k, row in enumerate(rdr):
                if k == 0:
                    continue
                elif k % 2 == 0:
                    recipe_sentences.append(row)
            fi.close()
    model_ingredient = FastText(
        sg=1, window=10 * 1000000, vector_size=100,
        min_count=3)  # use a very large window so the model behaves like item2vec
    model_ingredient.build_vocab(recipe_sentences)
    model_ingredient.train(recipe_sentences, epochs=10,
                           total_examples=model_ingredient.corpus_count)
    print("length of ingre2vec: %i" % (len(model_ingredient.wv.index_to_key)))
    print("Ingre2vec embedding finished!")
    return model_ingredient
def create_embedding(caption_file: str, vocab_file: str, embed_size: int,
                     output: str, **fasttext_kwargs):
    caption_df = pd.read_json(caption_file)
    caption_df["tokens"] = caption_df["tokens"].apply(
        lambda x: ["<start>"] + [token for token in x] + ["<end>"])
    sentences = list(caption_df["tokens"].values)
    vocabulary = torch.load(vocab_file, map_location="cpu")
    epochs = fasttext_kwargs.pop("epochs", 10)  # keep `epochs` out of the constructor kwargs
    model = FastText(size=embed_size, min_count=1, **fasttext_kwargs)
    model.build_vocab(sentences=sentences)
    model.train(sentences=sentences, total_examples=len(sentences), epochs=epochs)
    word_embeddings = np.zeros((len(vocabulary), embed_size))
    with tqdm(total=len(vocabulary), ascii=True) as pbar:
        for word, idx in vocabulary.word2idx.items():
            if word == "<pad>" or word == "<unk>":
                continue
            word_embeddings[idx] = model.wv[word]
            pbar.update()
    np.save(output, word_embeddings)
    print("Finish writing fasttext embeddings to " + output)
def fast_text_word_model(data, col_name):
    '''
    Create word vectors using gensim's FastText implementation.

    Args:
        data (DataFrame): The first parameter.
        col_name (string): The second parameter.

    Returns:
        model (word embedding model)
    '''
    sentences_list = list()
    delimiter = '|'
    for index, row in data.iterrows():
        if row[col_name] is not None:
            # Creating a list of text
            sentences_list.extend(row[col_name].split(delimiter))
    strip_list = [item.strip() for item in sentences_list]
    # Removing duplicate entries from strip_list
    sentences = list(set(strip_list))
    # Creating a FastText object
    model = FastText(size=100, window=10, min_count=1, workers=4, sg=1)
    # Adding the list of sentences to the model
    model.build_vocab(sentences)
    total_words = model.corpus_total_words
    # Training the model to generate the vectors
    model.train(sentences, total_words=total_words, epochs=100)
    return model
def fasttext_model(sentences, size=100, min_count=5, negative=5, window=5,
                   cbow=True, iterations=5, seed=0, workers=1):
    """
    Creates and trains a gensim fastText model:
    - sentences: list of sentences to be trained on
    - size: dimensionality of the embedding layer
    - min_count: minimum number of occurrences of a word for use in training
    - window: maximum distance between the current and predicted word within a sentence
    - negative: size of negative sampling
    - cbow: boolean to determine the training type; True is for CBOW; False is for Skip-gram
    - iterations: number of iterations to train over
    - seed: seed for the random number generator
    - workers: number of worker threads to train the model

    Returns: the trained model
    """
    skip = 1
    if cbow:
        skip = 0
    model = FastText(size=size, window=window, min_count=min_count, workers=workers,
                     sg=skip, negative=negative, seed=seed)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=iterations)
    return model
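# Hedged usage sketch (assumption, not part of the original source): assumes a
# gensim 3.x environment where FastText accepts the `size=` keyword used above.
# The toy corpus below is illustrative only.
toy_corpus = [["machine", "learning", "is", "fun"],
              ["fasttext", "handles", "subword", "information"]]
toy_model = fasttext_model(toy_corpus, size=50, min_count=1, cbow=False, iterations=10)
print(toy_model.wv.most_similar("fasttext", topn=3))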
def test_check_pre_train_dtypes(self):
    ft = FastText(min_count=1, size=5)
    ft.build_vocab(SENTENCES)
    se = BaseSentence2VecModel(ft)

    se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float64)
    with self.assertRaises(TypeError):
        se._check_pre_training_sanity(1, 1, 1)
    se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32)

    se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float16)
    with self.assertRaises(TypeError):
        se._check_pre_training_sanity(1, 1, 1)
    se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float32)

    se.wv.vectors_vocab = np.ones(len(se.wv.vocab), dtype=np.float16)
    with self.assertRaises(TypeError):
        se._check_pre_training_sanity(1, 1, 1)
    se.wv.vectors_vocab = np.ones(len(se.wv.vocab), dtype=np.float32)

    se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=int)
    with self.assertRaises(TypeError):
        se._check_pre_training_sanity(1, 1, 1)
    se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32)

    se.word_weights = np.ones(len(se.wv.vocab), dtype=bool)
    with self.assertRaises(TypeError):
        se._check_pre_training_sanity(1, 1, 1)
    se.word_weights = np.ones(len(se.wv.vocab), dtype=np.float32)
class FastTextMeanEmbeddingVectorizer(object):

    def __init__(self, size=100, window=5, min_count=2, workers=4, epochs=100):
        super().__init__()
        self.model = FastText(size=size, window=window, min_count=min_count,
                              workers=workers, sg=1)
        self.size = size
        self.epochs = epochs

    def fit(self, X, y=None):
        self.model.build_vocab(sentences=X)
        self.model.train(sentences=X, total_examples=len(X), epochs=self.epochs)  # train
        return self

    def transform(self, X):
        result = []
        for words in X:
            if len(words) > 0:
                mean = np.mean([self.model.wv[w] for w in words], axis=0)
            else:
                mean = np.zeros(self.size)
            result.append(mean)
        return np.array(result)

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)
def test_check_pre_train_san_no_sv_vecs(self):
    ft = FastText(min_count=1, size=5)
    ft.build_vocab(SENTENCES)
    se = BaseSentence2VecModel(ft)
    se.sv.vectors = None
    with self.assertRaises(RuntimeError):
        se._check_pre_training_sanity(1, 1, 1)
def test_check_pre_train_san_incos_len(self):
    ft = FastText(min_count=1, size=5)
    ft.build_vocab(SENTENCES)
    se = BaseSentence2VecModel(ft)
    se.word_weights = np.ones(20)
    with self.assertRaises(RuntimeError):
        se._check_pre_training_sanity(1, 1, 1)
def test_cy_equal_np_ft_random(self):
    ft = FastText(size=20, min_count=1)
    ft.build_vocab(SENTENCES)

    m1 = Average(ft)
    m1.prep.prepare_vectors(sv=m1.sv, total_sentences=len(self.sentences), update=False)
    m1._pre_train_calls()
    from fse.models.average_inner import MAX_NGRAMS_IN_BATCH
    m1.batch_ngrams = MAX_NGRAMS_IN_BATCH
    mem1 = m1._get_thread_working_mem()
    o1 = train_average_np(m1, self.sentences[:2], m1.sv.vectors, mem1)

    m2 = Average(ft)
    m2.prep.prepare_vectors(sv=m2.sv, total_sentences=len(self.sentences), update=False)
    m2._pre_train_calls()
    mem2 = m2._get_thread_working_mem()
    from fse.models.average_inner import train_average_cy
    o2 = train_average_cy(m2, self.sentences[:2], m2.sv.vectors, mem2)

    self.assertEqual(o1, o2)
    self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6))
def test_save_load_with_memmap(self):
    ft = FastText(min_count=1, size=5)
    ft.build_vocab(SENTENCES)
    shape = (1000, 1000)
    ft.wv.vectors = np.zeros(shape, np.float32)

    p = Path("fse/test/test_data/test_emb")
    p_vecs = Path("fse/test/test_data/test_emb_wv.vectors")
    p_ngrams = Path("fse/test/test_data/test_emb_ngrams.vectors")
    p_vocab = Path("fse/test/test_data/test_emb_vocab.vectors")
    p_not_exists = Path("fse/test/test_data/test_emb.wv.vectors.npy")

    se = BaseSentence2VecModel(ft, wv_mapfile_path=str(p))
    self.assertTrue(p_vecs.exists())
    self.assertTrue(p_ngrams.exists())
    self.assertTrue(p_vocab.exists())

    se.save(str(p.absolute()))
    self.assertTrue(p.exists())
    self.assertFalse(p_not_exists.exists())

    se = BaseSentence2VecModel.load(str(p.absolute()))
    self.assertFalse(se.wv.vectors_vocab.flags.writeable)
    self.assertEqual(shape, se.wv.vectors.shape)
    self.assertEqual((2000000, 5), se.wv.vectors_ngrams.shape)

    for p in [p, p_vecs, p_ngrams, p_vocab]:
        p.unlink()
def train_fasttext(vec_op=utils.average):
    model = FastText()
    # build the vocabulary
    model.build_vocab(corpus_file='data/corpus/sec_corpus.txt')
    # train the model
    model.train(corpus_file='data/corpus/sec_corpus.txt', epochs=model.epochs,
                total_examples=model.corpus_count, total_words=model.corpus_total_words)
    documents_tokens = io_manager.read_documents_for_word2vec()
    if not os.path.exists('./data/fasttext'):
        os.makedirs('./data/fasttext')
    with open('./data/fasttext/document_vectors.txt', 'w', encoding="utf8") as f:
        for doc in documents_tokens:
            doc_vec = vec_op([model.wv[word] for word in doc[1]])
            f.write(doc[0].strip().replace('data_by_sect', 'data_orig_by_sect') +
                    ' ' + ' '.join(map(str, doc_vec)) + '\n')
    return model
def create_new_model(model_name):
    word_vec_num_dim = 4
    model = FastText(size=word_vec_num_dim, window=3, min_count=1)
    model.build_vocab(sentences=common_texts)
    model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)
    return model
def create_model(skip_gram, tokenized_sentences, model_path):
    # vector_size belongs in the constructor, not in train()
    model = FastText(vector_size=5, min_count=1, window=5, sg=skip_gram)
    model.build_vocab(sentences=tokenized_sentences)
    model.train(sentences=tokenized_sentences,
                total_examples=len(tokenized_sentences), epochs=100)
    model.save(model_path)
    return model
def embedding(corpus, model_type):
    if model_type == "fasttext":
        model = FastText(workers=10)
    elif model_type == "word2vec":
        model = word2vec.Word2Vec(workers=10)
    else:
        raise ValueError("unsupported model_type: " + model_type)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=len(corpus), epochs=10)
    return model
def train_model(sentences: Collection[str], save_path=MODEL_PATH):
    model = FastText(size=VECTOR_SIZE)
    model.build_vocab(sentences=sentences)
    model.train(sentences=sentences, total_examples=model.corpus_count, epochs=50)
    model.save(save_path)
    return model
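# Hedged usage sketch (assumption, not from the original source): MODEL_PATH and
# VECTOR_SIZE are module-level constants in the original snippet, and `size=` implies
# a gensim 3.x FastText. Although the annotation says Collection[str], gensim expects
# each sentence to be a list of tokens, so this demo passes token lists; the words are
# repeated so they clear FastText's default min_count of 5.
if __name__ == "__main__":
    demo_sentences = [["hello", "fasttext", "world"]] * 10
    demo_model = train_model(demo_sentences, save_path="demo_fasttext.model")
    print(demo_model.wv["hello"][:5])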