# Shared setup for the snippets below: train a small FastText model on the fly,
# average the word vectors with fse, and compare the two sentence embeddings.
from gensim.models import FastText
from fse import IndexedList
from fse.models import Average


def sentence_similarity(text1, text2, similarity_threshold=0.50):
    sentences = [modify(text1), modify(text2)]  # modify() is the caller's preprocessor/tokenizer
    ft = FastText(sentences, min_count=1, size=12, workers=4)  # size= is the gensim 3.x name (vector_size in 4.x)
    model = Average(ft)
    model.train(IndexedList(sentences), update=True, report_delay=10, queue_factor=4)
    sim = model.sv.similarity(0, 1)
    return sim >= similarity_threshold
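# Example usage, a minimal sketch: `modify` is never defined in these snippets,
# so a plain whitespace tokenizer stands in here (an assumption, not the original
# code). A FastText model trained on only two sentences gives noisy scores.
def modify(text):
    return text.lower().split()  # hypothetical stand-in for the real preprocessor

print(sentence_similarity("the quick brown fox", "a quick brown fox"))  # -> True or False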
# Variant over a list of strings: score each sentence against the next one.
def sentence_similarity(los):
    sentences = [modify(i) for i in los]
    ft = FastText(sentences, min_count=1, size=12, workers=4)
    model = Average(ft)
    model.train(IndexedList(sentences), update=True, report_delay=10, queue_factor=4)
    res_similar = []
    for i in range(len(los) - 1):
        res_similar.append(model.sv.similarity(i, i + 1))
    return res_similar
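# Sketch of a call: N input strings yield N - 1 adjacent-pair scores, so the two
# paraphrases below should score higher together than against the unrelated line.
scores = sentence_similarity([
    "the cat sat on the mat",
    "a cat was sitting on a mat",
    "stock prices fell sharply today",
])
print(scores)  # two floats in [-1, 1]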
# Method variant: returns the raw score; thresholding was disabled, so the
# similarity_threshold parameter is kept only for API compatibility.
def sentence_similarity(self, text1, text2, similarity_threshold=0.35):
    sentences = [self.modify(text1), self.modify(text2)]
    ft = FastText(sentences, min_count=1, size=12, workers=4)
    model = Average(ft)
    try:
        model.train(IndexedList(sentences), update=True, report_delay=10, queue_factor=4)
    except ZeroDivisionError:
        # fse training can divide by zero on degenerate input, e.g. when
        # preprocessing leaves no tokens to train on; ignore and score anyway.
        pass
    sim = model.sv.similarity(0, 1)
    return sim
# Method variant over a list, with damping for suspiciously uniform scores.
import numpy as np


def sentence_similarity(self, los, percent=0.6):
    sentences = [self.modify(i) for i in los]
    ft = FastText(sentences, min_count=1, size=12, workers=4)
    model = Average(ft)
    model.train(IndexedList(sentences), update=True, report_delay=10, queue_factor=4)
    res_similar = []
    for i in range(len(los) - 1):
        res_similar.append(model.sv.similarity(i, i + 1))
    # Tiny self-trained models tend to make everything look alike; if the mean
    # similarity is very high, reduce every score by `percent`.
    if np.mean(res_similar) > 0.9:
        res_similar = [s - percent * s for s in res_similar]
    return res_similar
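# The two methods above expect a class that provides `modify`. A minimal
# hypothetical wrapper (SimilarityChecker is not from the original code) showing
# how the `self.modify` calls resolve:
class SimilarityChecker:
    def modify(self, text):
        return text.lower().split()  # stand-in preprocessor

checker = SimilarityChecker()
# Plain functions can be called with an explicit instance in place of `self`:
print(sentence_similarity(checker, ["the cat sat", "a cat sat down", "rain is due tomorrow"]))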
# Excerpt from a chart-preparation step: wrap long texts with <br> breaks for
# display, mask Facebook CDN URLs, then embed the sentences and project to 2-D.
import math

from sklearn.manifold import TSNE

if len(short_input_text) > 50:
    n = math.ceil(len(short_input_text) / 50)  # ceil keeps the tail chunk; int(...) would drop it
    short_input_text = " ".join([short_input_text[50 * x:50 * (x + 1)] + "-" + "<br>" for x in range(n)])
if len(short_bot_text) > 50:
    n = math.ceil(len(short_bot_text) / 50)
    short_bot_text = " ".join([short_bot_text[50 * x:50 * (x + 1)] + "-" + "<br>" for x in range(n)])

short_entity = str(value['entities']) if "scontent.xx.fbcdn.net" not in str(value['entities']) else "url"
short_actions = str(value['action_1']) if "scontent.xx.fbcdn.net" not in str(value['action_1']) else "url"

short_input_texts.append(short_input_text)
short_bot_texts.append(short_bot_text)
short_entities.append(short_entity)

ft = FastText(sentences, min_count=1, size=10)
model = Average(ft)
model.train(IndexedList(sentences))
vectors_list = model.sv.vectors.tolist()  # 10-dimensional sentence vectors

# tsne = TSNE(n_components=3)
tsne = TSNE(n_components=2)
tsne_vectors = tsne.fit_transform(vectors_list)

# Earlier exploration (left commented out): sweep k for KMeans and record
# silhouette score and inertia.
# scores = []
# for k in range(2, 20):
#     kmeans = KMeans(n_clusters=k, random_state=0).fit(tsne_vectors)
#     labels = kmeans.labels_
#     score = silhouette_score(tsne_vectors, labels)
#     inertia = kmeans.inertia_
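# A runnable version of the commented-out sweep above (a sketch, assuming
# scikit-learn's KMeans and silhouette_score): pick the k with the best silhouette.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def best_k(points, k_range=range(2, 20)):
    scores = {}
    for k in k_range:
        labels = KMeans(n_clusters=k, random_state=0).fit_predict(points)
        scores[k] = silhouette_score(points, labels)
    return max(scores, key=scores.get)

# e.g. n_clusters = best_k(tsne_vectors)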
dirpath = "./data" sentences = list() for fileitem in filelist: print("Reading " + fileitem + "...") filepath = os.path.join(dirpath, fileitem) with open(filepath + ".txt") as f: temps = list() for a in map(lambda x: x.split(), f.read().split("\n")): temps.extend(a) sentences.append(a) print("Read " + fileitem) wvmod = gensim.downloader.load("word2vec-google-news-300") avg = Average(wvmod) avg.wvmod = gensim.downloader.load("word2vec-google-news-300") train(IndexedList(sentences)) sif = SIF(wvmod) sif.train(IndexedList(sentences)) simMat = [[0 for a in filelist] for b in filelist] for a in range(len(filelist)): for b in range(len(filelist)): sim1 = avg.sv.similarity(a, b) sim2 = sif.sv.similarity(a, b) simMat[a][b] = sim2 # simMat[a][b] = scaled_sim(sim1, sim2) for i in range(len(filelist)): print(' '.join([" "] +
# Benchmark setup: one custom splitter plus CBOW/SIF/uSIF models over Polish
# word2vec, GloVe, and fastText vectors. `not_punc`, `prep_token`, `sent_a`,
# and `sent_b` are defined elsewhere in the surrounding code.
import sys

from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from gensim.models.fasttext import FastTextKeyedVectors
from fse import CSplitIndexedList
from fse.models import Average, SIF, uSIF


def prep_sentence(sentence):
    tokens = []
    for token in word_tokenize(sentence):
        if not_punc.match(token):
            tokens = tokens + prep_token(token)
    return tokens


sentences = CSplitIndexedList(sent_a, sent_b, custom_split=prep_sentence)
sentences[0]  # sanity check: inspect the first preprocessed sentence

models, results = {}, {}

word2vec = KeyedVectors.load("C:/Users/Kamil/Downloads/word2vec_300_3_polish.bin")
models["CBOW-W2V"] = Average(word2vec, lang_freq="pl")
models["SIF-W2V"] = SIF(word2vec, components=10)
models["uSIF-W2V"] = uSIF(word2vec, length=11)

glove = KeyedVectors.load_word2vec_format("C:/Users/Kamil/Downloads/glove_300_3_polish2.txt")
models["CBOW-Glove"] = Average(glove, lang_freq="pl")
print(f"After memmap {sys.getsizeof(glove.vectors)}")
models["SIF-Glove"] = SIF(glove, components=15)
models["uSIF-Glove"] = uSIF(glove, length=11)

ft = FastTextKeyedVectors.load("D:/fasttext_300_3_polish.bin")
models["CBOW-FT"] = Average(ft, lang_freq="pl")
models["SIF-FT"] = SIF(ft, components=10)
models["uSIF-FT"] = uSIF(ft, length=11)
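# A minimal sketch of the missing helpers (assumed shapes, not the author's
# exact code), plus the usual fse train-then-score loop over the models dict:
import re

not_punc = re.compile('.*[A-Za-z0-9].*')  # keep tokens containing at least one alphanumeric

def prep_token(token):
    t = token.lower().strip("';.:()").strip('"')
    return re.split(r'[-]', t)  # split hyphenated tokens into parts

for name, model in models.items():
    model.train(sentences)
    # CSplitIndexedList concatenates sent_a and sent_b, so pair i is
    # (i, len(sent_a) + i); score the first pair as an example.
    results[name] = model.sv.similarity(0, len(sent_a))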