def create_fse_model(sentences):
    """Build and train a SIF sentence-embedding model.

    Parameters
    ----------
    sentences : iterable of dict
        Records each carrying a "sentence" key with the raw text.

    Returns
    -------
    tuple
        (trained SIF model, the IndexedList the model was trained on).
    """
    texts = [record["sentence"] for record in sentences]
    print("SIF create indexes for embeddings")
    # SIF wraps the pre-loaded fastText word vectors.
    sif_model = SIF(load_fasttext_model())
    indexed = IndexedList(texts)
    sif_model.train(indexed)
    return sif_model, indexed
def sentence_similarity(text1, text2, similarity_threshold=0.50):
    """Return True when the two texts' sentence vectors are similar enough.

    Trains a small throwaway FastText/Average model on just the two
    (pre-processed) texts, then compares their sentence vectors.

    Parameters
    ----------
    text1, text2 : str
        Raw input texts; `modify` is the project's preprocessing step.
    similarity_threshold : float
        Minimum cosine-style similarity to count as "similar".

    Returns
    -------
    bool
    """
    sentences = [modify(text1), modify(text2)]
    # NOTE(review): `size=` is the gensim<4 keyword (gensim 4 renamed it to
    # `vector_size`) — confirm the pinned gensim version.
    ft = FastText(sentences, min_count=1, size=12, workers=4)
    model = Average(ft)
    model.train(IndexedList(sentences), update=True, report_delay=10, queue_factor=4)
    sim = model.sv.similarity(0, 1)
    # Direct boolean return replaces the redundant if/else True/False branch.
    return sim >= similarity_threshold
def sentence_similarity(los):
    """Return the similarity between each consecutive pair of sentences.

    Parameters
    ----------
    los : sequence of str
        List of sentences; each is preprocessed with `modify`.

    Returns
    -------
    list of float
        Element i is the similarity between sentence i and sentence i+1
        (length len(los) - 1; empty for 0 or 1 input sentences).
    """
    sentences = [modify(i) for i in los]
    # NOTE(review): `size=` is the gensim<4 keyword — confirm gensim version.
    ft = FastText(sentences, min_count=1, size=12, workers=4)
    model = Average(ft)
    model.train(IndexedList(sentences), update=True, report_delay=10, queue_factor=4)
    # Comprehension replaces the manual append loop (same pairwise order).
    return [model.sv.similarity(i, i + 1) for i in range(len(los) - 1)]
def sentence_similarity(self, text1, text2, similarity_threshold=0.35):
    """Return the raw similarity score between two texts.

    Parameters
    ----------
    text1, text2 : str
        Raw input texts; preprocessed via ``self.modify``.
    similarity_threshold : float
        Unused — kept for backward compatibility with callers that still
        pass it (the thresholding branch was deliberately disabled and the
        raw score is returned instead).

    Returns
    -------
    float
        Similarity between the two sentence vectors.
    """
    sentences = [self.modify(text1), self.modify(text2)]
    # NOTE(review): `size=` is the gensim<4 keyword — confirm gensim version.
    ft = FastText(sentences, min_count=1, size=12, workers=4)
    model = Average(ft)
    try:
        model.train(IndexedList(sentences), update=True, report_delay=10, queue_factor=4)
    except ZeroDivisionError:
        # Deliberate best-effort: training can divide by zero on degenerate
        # inputs, but the Average vectors are still usable for similarity.
        pass
    return model.sv.similarity(0, 1)
def sentence_similarity(self, los, percent=0.6):
    """Return consecutive-pair similarities, damped when uniformly high.

    Parameters
    ----------
    los : sequence of str
        Sentences; each is preprocessed with ``self.modify``.
    percent : float
        Fraction by which to reduce every score when the mean similarity
        exceeds 0.9 (i.e. scores become score * (1 - percent)).

    Returns
    -------
    list of float
        Pairwise similarities, length len(los) - 1.
    """
    sentences = [self.modify(i) for i in los]
    # NOTE(review): `size=` is the gensim<4 keyword — confirm gensim version.
    ft = FastText(sentences, min_count=1, size=12, workers=4)
    model = Average(ft)
    model.train(IndexedList(sentences), update=True, report_delay=10, queue_factor=4)
    # Comprehension replaces the manual append loop.
    res_similar = [model.sv.similarity(i, i + 1) for i in range(len(los) - 1)]
    # Guard the empty case: np.mean([]) emits a RuntimeWarning and yields
    # nan (which compares False anyway); skipping it is behavior-identical.
    if res_similar and np.mean(res_similar) > 0.9:
        # Equivalent to the original s -= percent * s in-place mutation.
        res_similar = [s * (1.0 - percent) for s in res_similar]
    return res_similar
def forward(self, batch):
    """Infer sentence embeddings for a batch of tokenized sentences.

    Parameters
    ----------
    batch : list of list of str
        Pre-tokenized sentences (assumed token lists — the first element
        is checked; TODO confirm all elements are lists upstream).

    Returns
    -------
    torch.Tensor
        float32 tensor of the inferred sentence vectors.
    """
    # isinstance replaces the `type(x) == list` anti-idiom (and accepts
    # list subclasses, which is backward-compatible).
    assert isinstance(batch[0], list)
    indexed = IndexedList(batch)
    return torch.tensor(self.model.infer(indexed), dtype=torch.float32)
# NOTE(review): mid-loop fragment (begins with `continue`; the enclosing loop,
# `row`, `wvecs`, and `count` are defined outside this view) whose original
# line breaks were lost — everything after the first `#` below was separate
# lines in the source. Observations to verify against the full file:
#   - `np.savetxt(f, ...)` references `f`, but the only `f = open(...)` is
#     commented out — presumably a NameError if this path runs; confirm.
#   - `exit()` sits inside the inner loop, so at most one rounded value is
#     printed before the process exits; looks like leftover debug code.
#   - Imports (`from fse.models import SIF`, `import numpy as np`) appear
#     mid-script rather than at the top of the file.
continue # text = "" # tp_text = tuple(text.join(row["text"])) # text = text.join(row["text"]) text = row["text"].split() sentences.append(text) count += 1 if count == 20: break # tp_sentences = tuple(sentences) from fse.models import SIF from fse import IndexedList model = SIF(wvecs) sents = IndexedList(sentences) model.train(sents) # f = open("sent_embed.csv", "w") import numpy as np array = [] for i in range(len(model.sv)): for n in model.sv[i]: tmp = n print(round(tmp, 7)) exit() array.append(model.sv[i]) np.savetxt(f, array, delimiter=",") # [f.write(i) for i in model.sv]
# NOTE(review): mid-script fragment with original line breaks lost; it relies
# on `short_input_text`, `short_bot_text`, `value`, `sentences`, and the
# accumulator lists defined outside this view. What it visibly does:
#   - wraps the input/bot texts into 50-char chunks joined with "-<br>"
#     (presumably for HTML display); note int(len/50) drops any trailing
#     partial chunk — confirm that truncation is intended;
#   - masks entity/action strings containing "scontent.xx.fbcdn.net" as "url";
#   - trains a FastText + fse Average model on `sentences` (`size=` is the
#     gensim<4 keyword — verify the pinned version);
#   - projects the 10-dim sentence vectors to 2-D with TSNE; the trailing
#     commented block is an old KMeans/silhouette sweep.
n = int(len(short_input_text) / 50) short_input_text = " ".join([short_input_text[50 * x:50 * (x + 1)] + "-" + "<br>" for x in range(n)]) if len(short_bot_text) > 50: n = int(len(short_bot_text) / 50) short_bot_text = " ".join([short_bot_text[50 * x:50 * (x + 1)] + "-" + "<br>" for x in range(n)]) short_entity = str(value['entities']) if "scontent.xx.fbcdn.net" not in str(value['entities']) else "url" short_actions = str(value['action_1']) if "scontent.xx.fbcdn.net" not in str(value['action_1']) else "url" short_input_texts.append(short_input_text) short_bot_texts.append(short_bot_text) short_entities.append(short_entity) ft = FastText(sentences, min_count=1, size=10) model = Average(ft) model.train(IndexedList(sentences)) vectors_list = model.sv.vectors.tolist() # 10 dimensions vectors # tsne = TSNE(n_components=3) tsne = TSNE(n_components=2) tsne_vectors = tsne.fit_transform(vectors_list) # scores = [] # for k in range(2,20): # x = k # kmeans = KMeans(n_clusters=x, random_state=0) # kmeans = kmeans.fit(tsne_vectors) # labels = kmeans.labels_ # score = silhouette_score(tsne_vectors, labels) # inertia = kmeans.inertia_ # scores.append((k, score,inertia))
def encode_batch(self, texts):
    """Tokenize each text and infer its sentence embedding.

    Parameters
    ----------
    texts : iterable of str

    Returns
    -------
    The array of sentence vectors produced by ``self.model.infer``.
    """
    tokenized = [self._tokenize(txt) for txt in texts]
    return self.model.infer(IndexedList(tokenized))
def encode(self, text):
    """Infer the sentence embedding for a single text.

    Returns the inferred vector with the batch dimension squeezed out.
    """
    tokens = self._tokenize(text)
    # infer() expects a batch, so wrap the single sentence in a list.
    batch_embs = self.model.infer(IndexedList([tokens]))
    return batch_embs.squeeze()
def calculate_embeddings(self, list):
    """Infer embeddings for a list of (pre-tokenized) sentences.

    NOTE(review): the parameter name shadows the builtin ``list``; kept
    as-is so keyword callers don't break — consider renaming project-wide.
    """
    from fse import IndexedList
    return self.model.infer(IndexedList(list))
def fit(self, list):
    """Train the underlying fse model on the given sentences.

    NOTE(review): the parameter name shadows the builtin ``list``; kept
    as-is so keyword callers don't break — consider renaming project-wide.
    """
    from fse import IndexedList
    self.model.train(IndexedList(list))
# NOTE(review): fragment starting mid-call (the opening of the chunked-array
# creation — presumably zarr, given `z[emb_path]` — is outside this view) with
# original line breaks lost. Visible flow:
#   - "bert" branch: batch-encode with a transformer and write each batch's
#     embeddings into `z_embs` at offsets computed from args.batch_size;
#   - "fse" branch: train a SIF sentence model over TextBlob-tokenized
#     batches, save it, then re-iterate the loader to infer and store
#     embeddings the same way.
#   - The offset math assumes every batch except the last has exactly
#     args.batch_size items — confirm the loader guarantees that.
chunks=(2048, None), dtype="f4") else: z_embs = z[emb_path] # encode & save if "bert" in model_name: for i, batch in enumerate(tqdm(loader)): # encode embs = batch_encode(batch, tokenizer, model).cpu().numpy() # save start = i * args.batch_size end = start + embs.shape[0] z_embs[start:end] = embs[:] elif "fse" in model_name: sent_model = SIF(model, workers=8, lang_freq="en") # train for i, batch in enumerate(loader): sentences = IndexedList([TextBlob(s).tokens for s in batch]) sent_model.train(sentences) sent_model.save(fpath.parent / "fse.model") # infer for i, batch in enumerate(loader): sentences = IndexedList([TextBlob(s).tokens for s in batch]) # encode embs = batch_encode(sentences, sent_model) # save start = i * args.batch_size end = start + embs.shape[0] z_embs[start:end] = embs[:]
# NOTE(review): script fragment (relies on `filelist`, `dirpath`, and
# `scaled_sim` defined outside this view; ends mid-way through printing the
# matrix) with original line breaks lost. Review observations to confirm:
#   - `sentences.append(a)` appends only the final token list from the inner
#     map loop while `temps` accumulates all of them unused — presumably
#     `sentences.append(temps)` was intended;
#   - bare `train(IndexedList(sentences))` is not `avg.train(...)` — looks
#     like a NameError waiting to happen;
#   - the word2vec model is downloaded twice (once for Average, once onto
#     `avg.wvmod`);
#   - `sim1` (Average similarity) is computed but discarded; only the SIF
#     similarity fills `simMat` (the scaled_sim combination is commented out).
sentences = list() for fileitem in filelist: print("Reading " + fileitem + "...") filepath = os.path.join(dirpath, fileitem) with open(filepath + ".txt") as f: temps = list() for a in map(lambda x: x.split(), f.read().split("\n")): temps.extend(a) sentences.append(a) print("Read " + fileitem) wvmod = gensim.downloader.load("word2vec-google-news-300") avg = Average(wvmod) avg.wvmod = gensim.downloader.load("word2vec-google-news-300") train(IndexedList(sentences)) sif = SIF(wvmod) sif.train(IndexedList(sentences)) simMat = [[0 for a in filelist] for b in filelist] for a in range(len(filelist)): for b in range(len(filelist)): sim1 = avg.sv.similarity(a, b) sim2 = sif.sv.similarity(a, b) simMat[a][b] = sim2 # simMat[a][b] = scaled_sim(sim1, sim2) for i in range(len(filelist)): print(' '.join([" "] + [str(a).center(7, ' ') for a in range(len(filelist))])) print(str(i).center(4, " "), end=" ")