import os

from gensim.models import FastText


def train_fasttext(corpus_folder='assets/corpus/', target='models/ft/',
                   size=300, window=5, mincount=100):
    sentences = CorpusSentences(corpus_folder)
    model = FastText(window=window, min_count=mincount, size=size)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=5)

    # store model
    if not os.path.exists(target):
        os.makedirs(target)
    model_fn = target + 'med_model_dim{}_win{}_min{}.bin'.format(size, window, mincount)
    model.save(model_fn)

    # test model
    print(model.most_similar('transglutaminase'))
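# NOTE: hypothetical sketch of the CorpusSentences iterator assumed by
# train_fasttext() above (the real class lives elsewhere in the project).
# gensim needs a restartable iterable -- one pass for build_vocab() and one
# per training epoch -- so the files are re-opened on every iteration.
import os

class CorpusSentences:
    def __init__(self, folder):
        self.folder = folder

    def __iter__(self):
        for fname in os.listdir(self.folder):
            with open(os.path.join(self.folder, fname), encoding='utf8') as f:
                for line in f:
                    if line.strip():
                        yield line.split()  # whitespace-tokenised sentence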
def build_and_save_word2vec_embeddings_model():
    print('Connecting to the database...')
    sentences = SentencesIterator(tokens_generator)
    print('Calculating the embeddings...')
    model = FastText(sentences, size=100, window=10, min_count=3, workers=4)
    print('Saving the model...')
    model.save(EMBEDDINGS_WORD2VEC_MODEL_FILE)
    print('WORD2VEC Model saved. Examples:')
    interesting_words = [
        'ciao', 'salutare', 'motorino', 'simpatia',
        'milano', 'roma', 'sgargapuffoparolainventata'
    ]
    for w in interesting_words:
        print('Words most similar to', w)
        print([sw[0] for sw in model.most_similar(w)])
    return model
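# NOTE: hypothetical sketch of the SentencesIterator wrapper used above.
# A plain generator can only be consumed once, but gensim iterates the corpus
# several times, so the wrapper re-invokes the generator factory on every
# __iter__ call instead of holding an exhausted generator.
class SentencesIterator:
    def __init__(self, generator_factory):
        self.generator_factory = generator_factory

    def __iter__(self):
        return self.generator_factory()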
model = FastText(words_to_embed, window=2, negative=10, iter=50, sg=1,
                 workers=4, alpha=0.005, size=300, seed=100)
model.wv.save_word2vec_format("data/training_word_vectors.bin",
                              fvocab="data/training_word_vocab.txt",
                              binary=False)


# In[69]:

model.most_similar("diabetes")


# In[70]:

from sklearn.manifold import TSNE

vocab = list(model.wv.vocab)
X = model.wv[vocab]
tsne = TSNE(n_components=2)
tsne_df = pd.DataFrame(tsne.fit_transform(X), index=vocab,
                       columns=["comp_1", "comp_2"])


# In[71]:

fig, ax = plt.subplots()
fig.set_size_inches(13, 11)
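# NOTE: the original cell breaks off after the figure is created. A plausible
# continuation (an assumption, not the author's code) scatters the two t-SNE
# components and labels a sample of points with their words:
ax.scatter(tsne_df["comp_1"], tsne_df["comp_2"], s=10)
# label a random subset; annotating the full vocab is usually too dense
for word, row in tsne_df.sample(min(200, len(tsne_df)), random_state=0).iterrows():
    ax.annotate(word, (row["comp_1"], row["comp_2"]), fontsize=8)
plt.show()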
from gensim.models import FastText
import pandas as pd

article = pd.read_excel('Cut_Finish_jieba.xlsx')
sentences = article['內容'].tolist()  # '內容' = the "content" column
split_sentences = []
for i in sentences:
    split_sentences.append(i.split(' '))

print('訓練開始')  # "training started"

# build a FastText model
model = FastText(split_sentences, size=500, window=10, min_count=5, workers=4)

# save model to file
model.save("fastText_stock.model")

# load model back into Python
# model = FastText.load("fastText_stock.model")

print(model.most_similar("台積電", topn=5))    # TSMC
print(model.most_similar("鴻海", topn=5))      # Foxconn
print(model.most_similar("中華電信", topn=5))  # Chunghwa Telecom
print(model.most_similar("仁寶", topn=5))      # Compal
print(model.most_similar("兆豐金", topn=5))    # Mega Financial
import logging
from argparse import ArgumentParser

from gensim.models import FastText, Word2Vec

import data_manager

if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('-c', '--corpus', help='Corpus file', required=True)
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    corpus = data_manager.ExampleCorpus(args.corpus)
    for line in corpus:
        print(line)

    # Gets most frequent words
    topk = corpus.get_topk_words(topk=100)
    print('Most frequent words:')
    for k in topk:
        print(k)

    ft = FastText(size=100, window=5, min_count=3, sentences=corpus, iter=10)
    for a, b in ft.most_similar('felltithio'):
        print(a, b)
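# NOTE: hypothetical sketch of data_manager.ExampleCorpus as it is used
# above: a restartable iterator over tokenised lines, plus a word-frequency
# helper matching the get_topk_words(topk=100) call.
from collections import Counter

class ExampleCorpus:
    def __init__(self, path):
        self.path = path

    def __iter__(self):
        with open(self.path, encoding='utf8') as f:
            for line in f:
                yield line.split()

    def get_topk_words(self, topk=100):
        counts = Counter(w for sentence in self for w in sentence)
        return [w for w, _ in counts.most_common(topk)]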
import os
from glob import glob
from zipfile import BadZipFile, ZipFile

import numpy as np
from gensim.models import FastText, KeyedVectors, Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile


class WordEmbedding():
    def __init__(self, embedding_type="w2v", embedding_size=100, ngram=(3, 6),
                 window_size=5, architecture="sg"):
        self.embedding_type = embedding_type
        self.window = window_size
        self.size = embedding_size
        self.model = None
        if architecture == "sg":
            self.skip_gram = True
        else:
            self.skip_gram = False
        if ngram is None:
            ngram = (3, 6)
        self.min_gram = ngram[0]
        self.max_gram = ngram[1]

    def train_embedding(self, sentences, n_iter=100, workers=1, min_count=3,
                        negative_sample=1):
        if self.embedding_type == "w2v":
            train_corpus = sentences
            if self.model is None:
                self.model = Word2Vec(size=self.size, window=self.window,
                                      min_count=min_count,
                                      negative=negative_sample,
                                      workers=workers,
                                      sg=int(self.skip_gram))
                self.model.build_vocab(train_corpus)
            else:
                self.model.build_vocab(train_corpus, update=True)
        elif self.embedding_type == "ft":
            train_corpus = sentences
            if self.model is None:
                self.model = FastText(sg=int(self.skip_gram), size=self.size,
                                      window=self.window, min_count=min_count,
                                      min_n=self.min_gram,
                                      max_n=self.max_gram,
                                      workers=workers,
                                      negative=negative_sample)
                self.model.build_vocab(train_corpus)
            else:
                self.model.build_vocab(train_corpus, update=True)
        elif self.embedding_type == "glove":
            raise ValueError("GloVe training not supported; use the official repo")
        else:
            raise ValueError("Invalid embedding type")
        train_corpus = sentences
        self.model.train(train_corpus, epochs=n_iter,
                         total_examples=self.model.corpus_count)

    def retrieve_vector(self, word):
        try:
            return self.model.wv[word]
        except KeyError:
            # Fall back to a random vector for fully out-of-vocabulary words
            return np.random.random(self.size)

    def find_similar_word(self, word, n=10):
        try:
            return self.model.most_similar(positive=[word], topn=n)
        except KeyError:
            return []

    def save_model(self, file_name):
        # Save the gensim model, then bundle all of its on-disk parts into a
        # single zip archive and remove the loose files.
        self.model.save("{}.model".format(file_name))
        we_model_files = glob("{}.model*".format(file_name))
        with ZipFile(file_name, "w") as zipf:
            for we_file in we_model_files:
                zipf.write(we_file)
                os.remove(we_file)

    def load_model(self, file_name):
        try:
            with ZipFile(file_name, "r") as zipf:
                zipf.extractall("/tmp/")
                nl = zipf.namelist()
                fn = [name for name in nl if name.endswith(".model")][0]
                path = "/tmp/" + fn
        except BadZipFile:
            # Not one of our zip bundles: treat it as a raw model file
            path = file_name
        if self.embedding_type == "w2v":
            self.model = KeyedVectors.load_word2vec_format(path)
        elif self.embedding_type == "ft":
            self.model = FastText.load_fasttext_format(path)
        elif self.embedding_type == "glove":
            # path name: .txt file
            try:
                glove_file = datapath(os.path.abspath(path))
                tmp_file = get_tmpfile("/tmp/g2w2v.txt")
                glove2word2vec(glove_file, tmp_file)
                self.model = KeyedVectors.load_word2vec_format(tmp_file)
            except UnicodeDecodeError:
                self.model = KeyedVectors.load(os.path.abspath(path))
        self.size = self.model.wv.vector_size

    def remove_from_vocab(self, word_list):
        # Rebuild the vocabulary without the words in word_list, keeping the
        # indices, vectors and normalised vectors consistent with each other.
        new_vectors = []
        new_vocab = {}
        new_index2entity = []
        new_vectors_norm = []
        if self.embedding_type == "ft":
            self.model.wv.init_sims()
            for i in range(len(self.model.wv.vocab)):
                word = self.model.wv.index2entity[i]
                vec = self.model.wv.vectors[i]
                vocab = self.model.wv.vocab[word]
                vec_norm = self.model.wv.vectors_norm[i]
                if word not in word_list:
                    vocab.index = len(new_index2entity)
                    new_index2entity.append(word)
                    new_vocab[word] = vocab
                    new_vectors.append(vec)
                    new_vectors_norm.append(vec_norm)
            self.model.wv.vocab = new_vocab
            self.model.wv.vectors = np.array(new_vectors)
            self.model.wv.index2entity = new_index2entity
            self.model.wv.index2word = new_index2entity
            self.model.wv.vectors_norm = new_vectors_norm
        else:
            self.model.init_sims()
            for i in range(len(self.model.vocab)):
                word = self.model.index2entity[i]
                vec = self.model.vectors[i]
                vocab = self.model.vocab[word]
                vec_norm = self.model.vectors_norm[i]
                if word not in word_list:
                    vocab.index = len(new_index2entity)
                    new_index2entity.append(word)
                    new_vocab[word] = vocab
                    new_vectors.append(vec)
                    new_vectors_norm.append(vec_norm)
            self.model.vocab = new_vocab
            self.model.vectors = np.array(new_vectors)
            self.model.index2entity = new_index2entity
            self.model.index2word = new_index2entity
            self.model.vectors_norm = new_vectors_norm
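# NOTE: minimal usage sketch for the WordEmbedding class above (the toy
# corpus and the "ft_demo" file name are illustrative, not from the
# original project):
if __name__ == "__main__":
    corpus = [["the", "cat", "sat"], ["the", "dog", "barked"]]
    we = WordEmbedding(embedding_type="ft", embedding_size=100,
                       architecture="sg")
    we.train_embedding(corpus, n_iter=10, workers=2, min_count=1)
    print(we.find_similar_word("cat", n=5))
    we.save_model("ft_demo")  # writes a zip bundle named ft_demo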
# Save the FastText model
model.save("FastText.model")

val_data = json.load(open("data\\val.json", "rb"))

# Tags inferred from the val data (problem Type 2)
playlist_to_tags = {}
for data in val_data:
    # Only handle entries where just the playlist title is given (Type 2)
    if data['plylst_title'] != "" and data['tags'] == [] and data['songs'] == []:
        playlist_to_tags[data['plylst_title']] = []
        # For Type-2 entries in val.json, fetch the 50 words jointly closest
        # to all space-separated elements of the playlist title
        recommends = model.most_similar(
            positive=[word for word in data['plylst_title'].split(" ")],
            topn=50)
        # Of the words fetched above (recommends), add only the tags to
        # playlist_to_tags
        for recommend in recommends:
            if recommend[0] in tags:
                if len(playlist_to_tags[data['plylst_title']]) == 0:
                    playlist_to_tags[data['plylst_title']].append(recommend)
                else:
                    for begin_tag in playlist_to_tags[data['plylst_title']]:
                        # compare tag strings (stored entries are
                        # (word, score) tuples)
                        sim1, sim2 = diff_2gram(begin_tag[0], recommend[0])
                        if sim1 <= 0.5 and sim2 <= 0.5:
                            playlist_to_tags[data['plylst_title']].append(recommend)
                        else:
                            pass
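# NOTE: diff_2gram() is defined elsewhere in the original project. A plausible
# reading, matching the two returned similarities and the <= 0.5 thresholds
# above (an assumption, not the original code): character-bigram overlap of
# each tag relative to the other, used to avoid collecting near-duplicate tags.
def diff_2gram(a, b):
    grams_a = {a[i:i + 2] for i in range(len(a) - 1)}
    grams_b = {b[i:i + 2] for i in range(len(b) - 1)}
    if not grams_a or not grams_b:
        return 0.0, 0.0
    overlap = len(grams_a & grams_b)
    return overlap / len(grams_a), overlap / len(grams_b)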
### Create FastText model w/ merged vocab
t = time()
new_wv = FastText(size=30, window=5, min_count=1, workers=3, sg=0, hs=1,
                  negative=10, sample=0.001, alpha=0.1)
new_wv.build_vocab(sentences)
'''
total_examples = new_wv.corpus_count
new_wv.build_vocab([list(pubmed_wv.vocab.keys())], update=True)
new_wv.intersect_word2vec_format(preTrainedPath, binary=True, lockf=1.0)
'''

### Train for 2 epochs
new_wv.train(sentences, total_examples=new_wv.corpus_count, epochs=2)
print('Time to train the model 2 epochs: {} mins'.format(round((time() - t) / 60, 2)))
print('----------------------------')
print(new_wv.most_similar(positive=['treatment']))
print(new_wv.most_similar(positive=['female']))
print(new_wv.most_similar(positive=['history']))
print(new_wv.most_similar(positive=['disease']))
print(new_wv.most_similar(positive=['brain']))
new_wv.wv.save_word2vec_format('mimic-pubmed_2.bin', binary=True)
print('----------------------------')

# Train for 8 more epochs (10 in total)
new_wv.train(sentences, total_examples=new_wv.corpus_count, epochs=8)
print('Time to train the model 10 epochs: {} mins'.format(round((time() - t) / 60, 2)))
print('----------------------------')
print(new_wv.most_similar(positive=['treatment']))
print(new_wv.most_similar(positive=['female']))
print(new_wv.most_similar(positive=['history']))
# print(len(text))
ct = 0
text_vocab = []
# for i, ctext in enumerate(df1["ctext"]):
for text in x:
    text = str(text).lower()
    articles_tokens = []
    for token in word_tokenize(text):
        articles_tokens.append(token)
    ct += 1
    print(ct)
    text_vocab.append(articles_tokens)

print(len(x))
print(len(text_vocab))

model_ted = FastText(text_vocab, size=100, window=5, min_count=5, workers=4)

print(model_ted.most_similar('china'))
print(model_ted.most_similar('bjp'))
print(model_ted.most_similar('timeswarner'))
# model_ted.save('bbc_ft.bin')

print(model_ted['china'])
print(model_ted['bjp'])
# print(model_ted[''])

# articles_tokens = []
# for text in x:
#     articles_tokens.append([word for word in word_tokenize(str(x).lower().replace(".", " "))])
# print('')
# print(x[0])
# print(articles_tokens[0])
from gensim.models import FastText
import pandas as pd

article = pd.read_excel('../article set/All_File/Cut_Finish_jieba.xlsx')
sentences = article['內容'].tolist()  # '內容' = the "content" column
split_sentences = []
for i in sentences:
    split_sentences.append(i.split(' '))

print('訓練開始')  # "training started"

# build a FastText model
model = FastText(split_sentences, size=500, window=10, min_count=5,
                 workers=4, iter=10)

# save model to file
model.save("../Word_Embedding_model/fastText_stock.model")

# load model back into Python
# model = FastText.load("fastText_stock.model")

print(model.wv['台積電'])
print('台積電', model.most_similar("台積電", topn=5))      # TSMC
print('鴻海', model.most_similar("鴻海", topn=5))          # Foxconn
print('中華電信', model.most_similar("中華電信", topn=5))  # Chunghwa Telecom
print('仁寶', model.most_similar("仁寶", topn=5))          # Compal
print('兆豐金', model.most_similar("兆豐金", topn=5))      # Mega Financial
class FeatureBuilder:
    def __init__(self, ordering=['company', 'location', 'goods']):
        self.feature_encoder = None
        self.sizes = []
        self.train = None
        self.validation = None
        self.ordering = ordering
        self.word_mapping = {}
        self.company_feature_encoder = None
        self.location_feature_encoder = None
        self.goods_feature_encoder = None

    def load(self):
        self.load_data()

    def load_model(self):
        try:
            # self.feature_encoder = FastText.load('./models/fasttext.model')
            self.company_feature_encoder = FastText.load(
                './models/company_fasttext.model')
            self.location_feature_encoder = FastText.load(
                './models/location_fasttext.model')
            self.goods_feature_encoder = FastText.load(
                './models/goods_fasttext.model')
        except Exception:
            print('Existing model does not exist. Training from scratch')
            self.classType_fasttext_train('company')
            self.classType_fasttext_train('location')
            self.classType_fasttext_train('goods')
            # self.train_fasttext_encoder()
            # self.validate_encoder()

    def load_data(self):
        data = []
        datasets = [get_company_data(), get_location_data(), get_items_cat()]
        for idx, dataset in enumerate(datasets):
            print('Is any entry Null?:', dataset.isnull().values.any())
            for idx2, row in dataset.iterrows():
                if row['name'] not in self.word_mapping:
                    self.word_mapping[row['name']] = []
                self.word_mapping[row['name']].append(self.ordering[idx])
            self.sizes.append(dataset.shape[0])
            data += list(dataset['name'].values)
        # data = shuffle(data, random_state=0)
        self.train, self.validation = train_test_split(data, random_state=0,
                                                       test_size=0.2)
        print('Train Test Constructed')

    def classType_fasttext_train(self, classType):
        train_sentences = []
        for word in self.train:
            sentence = []
            mappings = self.word_mapping[word]
            for mapping in mappings:
                if mapping == classType:
                    sentence.append(word)
            if len(sentence) > 0:
                train_sentences.append(sentence)
        feature_encoder = FastText(size=50, window=2, min_count=1, min_n=2,
                                   max_n=6)
        feature_encoder.build_vocab(sentences=train_sentences)
        feature_encoder.train(sentences=train_sentences,
                              total_examples=feature_encoder.corpus_count,
                              epochs=1000)
        feature_encoder.save('./models/' + classType + '_fasttext.model')
        if classType == 'company':
            self.company_feature_encoder = feature_encoder
        elif classType == 'location':
            self.location_feature_encoder = feature_encoder
        elif classType == 'goods':
            self.goods_feature_encoder = feature_encoder
        else:
            raise Exception('Allowed arguments are company, location and goods')
        # self.feature_encoder = FastText(size=25, window=1, min_count=1,
        #                                 sentences=train_sentences, iter=50)

    def train_fasttext_encoder(self):
        train_sentences = []
        for word in self.train:
            mappings = self.word_mapping[word]
            for mapping in mappings:
                sentence = [word]
                train_sentences.append(sentence)
        self.feature_encoder = FastText(size=50, window=2, min_count=1,
                                        min_n=2, max_n=6)
        self.feature_encoder.build_vocab(sentences=train_sentences)
        self.feature_encoder.train(
            sentences=train_sentences,
            total_examples=self.feature_encoder.corpus_count,
            epochs=1000)
        self.feature_encoder.save('./models/fasttext.model')
        # self.feature_encoder = FastText(size=25, window=1, min_count=1,
        #                                 sentences=train_sentences, iter=50)

    def validate_encoder(self):
        test_words = self.validation

        ## Finding the closest cluster center (Company, Location or Good)
        tp = 0
        for word in test_words:
            distances = []
            encoding = self.feature_encoder[word]
            for order in self.ordering:
                category_encoding = self.feature_encoder[order]
                distances.append(np.linalg.norm(encoding - category_encoding))
            idx = distances.index(min(distances))
            gt_categories = self.word_mapping[word]
            for gt_category in gt_categories:
                if self.ordering[idx] == gt_category:
                    tp += 1
                    break
        print('Closest cluster center validation approach accuracy:',
              str(tp / len(test_words)))

        ## Doing the K-nearest analysis
        tp = 0
        order_idx = {}
        for idx, order in enumerate(self.ordering):
            order_idx[order] = idx
        for word in test_words:
            encoding = self.feature_encoder[word]
            nearest_neighbours = self.feature_encoder.most_similar(word,
                                                                   topn=15)
            votes = [0, 0, 0]
            for neighbour in nearest_neighbours:
                mappings = self.word_mapping[neighbour[0]]
                for mapping in mappings:
                    votes[order_idx[mapping]] += 1
            assigned_idx = votes.index(max(votes))
            gt_categories = self.word_mapping[word]
            for gt_category in gt_categories:
                if self.ordering[assigned_idx] == gt_category:
                    tp += 1
                    break
        print('Nearest 15-Neighbour accuracy:', str(tp / len(test_words)))

    def one_vs_rest_generator(self, positive_index=None):
        assert positive_index is not None, \
            "Requires index for the positive class (see ordering)"
        if self.ordering[positive_index] == 'company':
            feature_encoder = self.company_feature_encoder
        elif self.ordering[positive_index] == 'location':
            feature_encoder = self.location_feature_encoder
        elif self.ordering[positive_index] == 'goods':
            feature_encoder = self.goods_feature_encoder
        else:
            raise Exception('Marked positive class not in the set {0,1,2}')
        X_train = []
        y_train = []
        X_test = []
        y_test = []
        for word in self.train:
            try:
                X_train.append(feature_encoder[word])
                if self.ordering[positive_index] in self.word_mapping[word]:
                    y_train.append(1)
                else:
                    y_train.append(0)
            except KeyError:
                print('all ngrams for word %s absent from model. Skipping for %s'
                      % (word, self.ordering[positive_index]))
        for word in self.validation:
            try:
                X_test.append(feature_encoder[word])
                if self.ordering[positive_index] in self.word_mapping[word]:
                    y_test.append(1)
                else:
                    y_test.append(0)
            except KeyError:
                print('all ngrams for word %s absent from model. Skipping for %s'
                      % (word, self.ordering[positive_index]))
        return np.asarray(X_train, dtype=np.float64), \
            np.asarray(y_train, dtype=np.float64), \
            np.asarray(X_test, dtype=np.float64), \
            np.asarray(y_test, dtype=np.float64)

    def get_encoding(self, word):
        return self.feature_encoder[word]
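# NOTE: hypothetical usage sketch for FeatureBuilder (assumes the
# get_company_data()/get_location_data()/get_items_cat() helpers and a
# ./models/ directory are available):
fb = FeatureBuilder()
fb.load()        # builds the train/validation split
fb.load_model()  # loads the three per-class encoders, or trains from scratch
X_tr, y_tr, X_te, y_te = fb.one_vs_rest_generator(positive_index=0)  # 'company' vs rest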
def main():
    input_dir = args.input_dir
    output_dir = args.output_dir
    embeddings = args.embeddings
    embed_type = args.embed_type
    similarity_check = args.similarity_check

    if args.emb_path:
        # Eval-only mode: load pre-trained vectors, report similarities, exit
        model_check = KeyedVectors.load_word2vec_format(args.emb_path,
                                                        binary=False)
        print("Checking word similarity from: ", embeddings)
        for every in similarity_check:
            print("Most similar words for ", every)
            print(model_check.most_similar(every, topn=10))
        print("Exiting program")
        sys.exit(0)

    if embeddings == 'word2vec':
        output_dir = os.path.join(output_dir, 'nep2vec')
    else:
        output_dir = os.path.join(output_dir, 'nep2ft')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_file = os.path.join(output_dir, 'embeddings.vec')

    # Training mode
    if not args.eval_mode:
        print("Training {0} model".format(embeddings))
        sents = []
        for root, dirs, files in os.walk(input_dir):
            for f in files:
                input_file = os.path.join(root, f)
                print("Processing {0}".format(input_file))
                with open(input_file, 'r', encoding='utf8') as i_f:
                    for line in i_f:
                        if len(line) > 0:
                            sents.append(line.split())
        if embeddings == 'word2vec':
            model = Word2Vec(sents, size=300, sg=embed_type, workers=10)
        elif embeddings == 'fasttext':
            model = FastText(size=300, window=5, min_count=1)
            model.build_vocab(sentences=sents)
            total_examples = model.corpus_count
            model.train(sentences=sents, total_examples=total_examples,
                        epochs=5)
        model.wv.save_word2vec_format(output_file, binary=False)

    # Eval mode
    model = KeyedVectors.load_word2vec_format(output_file, binary=False)
    print("Checking word similarity from: ", embeddings)
    for every in similarity_check:
        print("Most similar words for ", every)
        print(model.most_similar(every, topn=10))

    # Print info
    print("Length of vocabulary ", model.vectors.shape[0])
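# NOTE: main() reads a module-level `args`. A plausible argparse setup
# (hypothetical -- the original parser is defined elsewhere, and only the
# args.* fields referenced in main() are known) would be:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--input_dir', required=True)
parser.add_argument('--output_dir', required=True)
parser.add_argument('--embeddings', choices=['word2vec', 'fasttext'],
                    default='fasttext')
parser.add_argument('--embed_type', type=int, default=1,
                    help='sg flag for Word2Vec: 1=skip-gram, 0=CBOW')
parser.add_argument('--similarity_check', nargs='+', default=[])
parser.add_argument('--emb_path', default=None)
parser.add_argument('--eval_mode', action='store_true')
args = parser.parse_args()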
def preProcessText(txt):
    # (def line reconstructed from the bag1.pluck(...).map(preProcessText)
    # call below; the original fragment starts mid-function)
    cleanedUp = textacy.preprocess.preprocess_text(txt,
                                                   lowercase=True,
                                                   transliterate=True,
                                                   no_punct=True,
                                                   no_contractions=True)
    sentenceAsList = textacy.preprocess.normalize_whitespace(cleanedUp).split(' ')
    filteredSentence = [w for w in sentenceAsList if w not in stopWords]
    return filteredSentence


titles = bag1.pluck('title').map(preProcessText).compute()
# titles = list([t.split(' ') for t in titles])
client.close()
print(time.clock() - startTime)

#%% Base model
startTime = time.clock()
model = None
model = FastText(titles[:100000], min_count=1, workers=4, sg=0)
print(time.clock() - startTime)
model.most_similar(positive=['cognitive'])

#%% experiments
startTime = time.clock()
model2 = None
model2 = FastText(titles, min_count=10, workers=4, sg=1, window=10,
                  size=100)  # size=300 for transfer
# r1 = model.most_similar(positive=['cognitive'])
# r2 = model2.most_similar(positive=['cognitive'])
# print(tabulate([[r2[i][0], r1[i][0]] for i, x in enumerate(r1)],
#                headers=['Model2', 'Model1'],
#                tablefmt='orgtbl'))
print(time.clock() - startTime)

#%% TSNE
words = ['cognitive', 'sensemaking', 'comprehension', 'reading', 'articles',
         'perception', 'notetaking', 'annotation', 'foraging',