def __init__(self, model_path=None):
    if model_path[-3:] == 'vec':  # a pre-trained word vector file
        ft = KeyedVectors.load_word2vec_format(model_path)
        self.model = SIF(ft, components=10)
    elif model_path[-6:] == 'pickle':  # an already trained sentence-vector model
        self.model = BaseSentence2VecModel.load(model_path)
def sentence2vec(self, parentmodel=None, save=True):
    # Map the requested parent model to its trainer; keep callables so only
    # the selected word model is trained (fasttext is the fallback).
    model_map = {
        "word2vec": self.word2vec,
        "glove": self.glove,
    }
    model = model_map.get(parentmodel, self.fasttext)(save=False)
    sentence_model = SIF(model)
    sentence_model.train(self.data)  # train() returns counts, not the model
    logging.info("Training complete. Saving model")
    if save:
        model_name = f"sentence2vec_{'cbow' if self.cbow else 'skipgram'}_{self.dim}.vec"
        model_path = get_path(f'/models/sentence2vec/{model_name}')
        sentence_model.save(model_path)
    return sentence_model
class SIF_embeddings:
    def __init__(self, model_path=None):
        if model_path[-3:] == 'vec':  # a pre-trained word vector file
            ft = KeyedVectors.load_word2vec_format(model_path)
            self.model = SIF(ft, components=10)
        elif model_path[-6:] == 'pickle':  # an already trained sentence-vector model
            self.model = BaseSentence2VecModel.load(model_path)

    def fit(self, data):
        inp = CSplitIndexedList(data, custom_split=split_func)
        self.model.train(inp)

    def __call__(self, transcript):
        return self.model.infer([(transcript.split(), 0)])
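# A minimal usage sketch for SIF_embeddings above. The vector path, the
# training sentences, and the transcript are illustrative assumptions; the
# class also relies on an external split_func that is not shown here.
embedder = SIF_embeddings(model_path='fasttext_300.vec')
embedder.fit(["first training sentence", "second training sentence"])
vector = embedder("an unseen transcript to embed")  # ndarray of shape (1, dim)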
def _train(self):
    self.sens = IndexedList(self.concepts)
    print('training SIF...')
    self.se = SIF(self.w2v_model)
    self.se.train(self.sens)
class MatchingModel:
    def __init__(self, all_concepts, bionlp_model, rewrite=False):
        self.corpus = []
        self.sens = []
        self.w2v_model = None
        self.se = None
        # self.file_path = file_path
        self.concepts = all_concepts
        # self.model_path = model_path
        self.w2v_model = bionlp_model
        self.rewrite_mode = rewrite
        self._read()
        self._train()

    def _read(self):
        # txt_or_csv = 'csv'
        # if txt_or_csv == 'txt':
        #     with open(self.file_path, 'r', encoding='utf-8') as file:
        #         for row in file:
        #             self.concepts.append(row.lstrip(' ').rstrip('\n'))
        # elif txt_or_csv == 'csv':
        #     with open(self.file_path, 'r', encoding='utf-8') as file:
        #         reader = csv.reader(file)
        #         next(reader)
        #         for row in reader:
        #             self.corpus.append(row[0].split())
        #             self.concepts.append(row[1])
        # self.concepts = [c.split() for c in list(set(self.concepts))]
        if self.rewrite_mode:
            # For the procedure data set, the following two file paths should be replaced.
            self.con_dic = dict(pd.read_csv('top200-con-dic-pro.csv'))
            self.dic = dict(pd.read_csv('top200-dic-pro.csv'))
            for key, value in self.con_dic.items():
                self.con_dic[key] = np.array(value)
            for key, value in self.dic.items():
                self.dic[key] = np.array(value)
            length = len(self.con_dic)
            X = np.ndarray(shape=(length, 200))
            self.words = []
            for key, i in zip(self.con_dic.keys(), range(length)):
                self.words.append(key)
                X[i] = self.con_dic[key]  # shape --> (length, 200)
            self.tree = KDTree(X)
        print('loading model...')
        # self.w2v_model = gensim.models.KeyedVectors.load_word2vec_format(self.model_path, binary=True)
        print('loading data...')

    def _train(self):
        self.sens = IndexedList(self.concepts)
        print('training SIF...')
        self.se = SIF(self.w2v_model)
        self.se.train(self.sens)

    def query(self, query_sen, topk=25):
        new_sen = query_sen
        if self.rewrite_mode:
            new_sen = []
            for w in query_sen:
                if w in self.dic and w not in self.con_dic:
                    q_emb = self.dic[w].reshape(1, 200)
                    dist, ind = self.tree.query(q_emb, k=1)
                    if dist[0][0] < 3.6:
                        # Experiments show that rewriting within a distance of 3.6
                        # performs better than no rewriting.
                        index = ind.tolist()[0]
                        new_sen.append(self.words[index[0]])
                    else:
                        new_sen.append(w)
                else:
                    new_sen.append(w)
        # print(new_sen)
        cands = self.se.sv.similar_by_sentence(new_sen, model=self.se,
                                               topn=topk, indexable=self.sens.items)
        most_sim = [[x[0], x[2]] for x in cands]
        return most_sim
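# A hedged usage sketch for MatchingModel above. The concept list, the
# BioNLP word-vector file, and the query string are placeholders, not part
# of the original snippet.
from gensim.models import KeyedVectors

bionlp_vectors = KeyedVectors.load_word2vec_format('BioNLP_PubMed_w2v.bin', binary=True)
concepts = [c.split() for c in ['myocardial infarction', 'chest pain', 'renal failure']]
matcher = MatchingModel(concepts, bionlp_vectors, rewrite=False)
print(matcher.query('acute chest pain'.split(), topk=2))  # [[concept_tokens, score], ...]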
def __init__(self, jokes_path, model_path):
    self.jokes = pd.read_csv(jokes_path)
    self.model = SIF.load(model_path)
    self.prev_jokes = []
            or row['role'] == '<sub-heading>' \
            or row['role'] == '<separator>' \
            or row['role'] == '<new-case>':
        continue
    # text = ""
    # tp_text = tuple(text.join(row["text"]))
    # text = text.join(row["text"])
    text = row["text"].split()
    sentences.append(text)
    count += 1
    if count == 20:
        break
# tp_sentences = tuple(sentences)

from fse.models import SIF
from fse import IndexedList

model = SIF(wvecs)
sents = IndexedList(sentences)
model.train(sents)

import numpy as np

f = open("sent_embed.csv", "w")
array = []
for i in range(len(model.sv)):
    # for n in model.sv[i]:        # debug print of individual vector components
    #     print(round(n, 7))
    array.append(model.sv[i])
np.savetxt(f, array, delimiter=",")
f.close()
def _init(self):
    self.model = SIF.load(self.args.model_path)
    self._dim = self.model.wv.vector_size
from fse import IndexedList
from fse.models.average import FAST_VERSION, MAX_WORDS_IN_BATCH
from fse.models import SIF
from gensim.models import FastText
import logging

logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO)

w2v_model = "H:/Vietnamese word representations/Word_vector_data/VnNewsWord2Vec/VnNewsWord2Vec.bin"
lookup = FastText.load_fasttext_format(w2v_model, encoding='utf-8')

sentences = []
s = IndexedList(sentences)
print(len(s))

title_file = 'H:/Vietnamese word representations/News-titles-embedding/Data/tokenized_titles_cleaned'
with open(title_file, 'r', encoding='utf-8') as file:
    for line in file:
        sentences.append(line.split())

s = IndexedList(sentences)
model = SIF(lookup, workers=2)
model.train(s)
model.save('sent2vec')
#!/usr/bin/env python3
import argparse
import logging
import pathlib

from fse import IndexedLineDocument
from fse.models import SIF

from lib import data, utils, model
import gensim.downloader as api

log = logging.getLogger("train_model")

EXPECTED_LINES = 66836199

if __name__ == "__main__":
    utils.setup_logging()
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--data", default=data.DEFAULT_OUTPUT_PREFIX,
                        help="Prefix of input data to read, default=" + data.DEFAULT_OUTPUT_PREFIX)
    parser.add_argument("-o", "--output", default=model.DEFAULT_MODEL_FILE,
                        help="File name to save model, default=" + model.DEFAULT_MODEL_FILE)
    args = parser.parse_args()

    glove = api.load("glove-wiki-gigaword-100")
    input_path = pathlib.Path(args.data).with_suffix(".txt")
    sents = IndexedLineDocument(str(input_path))
    sif_model = SIF(glove, workers=2)  # avoid shadowing the imported lib.model module
    sif_model.train(sents)
    sif_model.save(args.output)
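# A minimal follow-up sketch (not part of the original script): load the model
# saved above and infer a vector for a new sentence. The file name "sif.model"
# is only an assumption for whatever was passed as --output.
from fse.models import SIF

loaded = SIF.load("sif.model")
vec = loaded.infer([("an unseen sentence to embed".split(), 0)])
print(vec.shape)  # (1, 100) for glove-wiki-gigaword-100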
                         chunks=(2048, None), dtype="f4")
else:
    z_embs = z[emb_path]

# encode & save
if "bert" in model_name:
    for i, batch in enumerate(tqdm(loader)):
        # encode
        embs = batch_encode(batch, tokenizer, model).cpu().numpy()
        # save
        start = i * args.batch_size
        end = start + embs.shape[0]
        z_embs[start:end] = embs[:]
elif "fse" in model_name:
    sent_model = SIF(model, workers=8, lang_freq="en")
    # train
    for i, batch in enumerate(loader):
        sentences = IndexedList([TextBlob(s).tokens for s in batch])
        sent_model.train(sentences)
    sent_model.save(fpath.parent / "fse.model")
    # infer
    for i, batch in enumerate(loader):
        sentences = IndexedList([TextBlob(s).tokens for s in batch])
        # encode
        embs = batch_encode(sentences, sent_model)
        # save
        start = i * args.batch_size
        end = start + embs.shape[0]
        z_embs[start:end] = embs[:]
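# The fse branch above calls a batch_encode helper that is not shown; a
# plausible stand-in (an assumption, not the original implementation) simply
# runs SIF inference over the already tokenised, indexed batch.
def fse_batch_encode(sentences, sent_model):
    # `sentences` is an IndexedList of (tokens, index) pairs; infer returns an
    # array of shape (len(sentences), vector_size).
    return sent_model.infer([sentences[i] for i in range(len(sentences))])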
for fileitem in filelist:
    print("Reading " + fileitem + "...")
    filepath = os.path.join(dirpath, fileitem)
    with open(filepath + ".txt") as f:
        temps = list()
        for a in map(lambda x: x.split(), f.read().split("\n")):
            temps.extend(a)
        sentences.append(temps)  # one token list per file, so file index == sentence index
    print("Read " + fileitem)

wvmod = gensim.downloader.load("word2vec-google-news-300")

avg = Average(wvmod)
avg.train(IndexedList(sentences))

sif = SIF(wvmod)
sif.train(IndexedList(sentences))

simMat = [[0 for a in filelist] for b in filelist]
for a in range(len(filelist)):
    for b in range(len(filelist)):
        sim1 = avg.sv.similarity(a, b)
        sim2 = sif.sv.similarity(a, b)
        simMat[a][b] = sim2
        # simMat[a][b] = scaled_sim(sim1, sim2)

print(' '.join([" "] + [str(a).center(7, ' ') for a in range(len(filelist))]))
for i in range(len(filelist)):
    print(str(i).center(4, " "), end=" ")
    for j in range(len(filelist)):
def prep_sentence(sentence):
    tokens = []
    for token in word_tokenize(sentence):
        if not_punc.match(token):
            tokens = tokens + prep_token(token)
    return tokens

sentences = CSplitIndexedList(sent_a, sent_b, custom_split=prep_sentence)
sentences[0]

models, results = {}, {}

word2vec = KeyedVectors.load("C:/Users/Kamil/Downloads/word2vec_300_3_polish.bin")
models["CBOW-W2V"] = Average(word2vec, lang_freq="pl")
models["SIF-W2V"] = SIF(word2vec, components=10)
models["uSIF-W2V"] = uSIF(word2vec, length=11)

from gensim.scripts.glove2word2vec import glove2word2vec
glove = KeyedVectors.load_word2vec_format("C:/Users/Kamil/Downloads/glove_300_3_polish2.txt")
models["CBOW-Glove"] = Average(glove, lang_freq="pl")
print(f"After memmap {sys.getsizeof(glove.vectors)}")
models["SIF-Glove"] = SIF(glove, components=15)
models["uSIF-Glove"] = uSIF(glove, length=11)

ft = FastTextKeyedVectors.load("D:/fasttext_300_3_polish.bin")
models["CBOW-FT"] = Average(ft, lang_freq="pl")
models["SIF-FT"] = SIF(ft, components=10)
models["uSIF-FT"] = uSIF(ft, length=11)
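# A hedged evaluation sketch for the models dictionary above: train each model
# on the shared CSplitIndexedList and score one illustrative sentence pair by
# cosine similarity (the pair indices 0 and 1 are an assumption).
for name, m in models.items():
    m.train(sentences)
    results[name] = m.sv.similarity(0, 1)
print(results)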