def run(sub_task=1):
    documents = [
        "Éste texto no tiene nada que ver con los demás",
        "La plata fue entregada en camiones color plata",
        "El cargamento de oro llegó en un camión. El cargamento de oro llegó en un camión. El cargamento de oro llegó en un camión",
        "Cargamentos de oro dañados por el fuego",
        "El cargamento de oro llegó en un camión"
    ]
    query = ["oro plata camión"]
    if sub_task <= 2:
        text_vectorizer, text_vector = vectorizer.vectorize(documents)
        query_vector = text_vectorizer.transform(query)
        if sub_task == 1:
            distances = np.array([np.linalg.norm(text_vector[i].toarray() - query_vector.toarray())
                                  for i in range(text_vector.shape[0])])
        elif sub_task == 2:
            distances = np.array([cosine_distance(text_vector[i].toarray()[0], query_vector.toarray()[0])
                                  for i in range(text_vector.shape[0])])
    elif sub_task >= 3:
        if sub_task == 3:
            text_vectorizer, text_vector = vectorizer.vectorize(documents, stop_words=stopwords.spanish)
        elif sub_task == 4:
            text_vectorizer, text_vector = vectorizer.vectorize(documents, stop_words=stopwords.spanish,
                                                                tokenizer=SpanishTokenizer())
        elif sub_task == 5:
            text_vectorizer, text_vector = vectorizer.tf_idf_vectorize(documents, stop_words=stopwords.spanish,
                                                                       tokenizer=SpanishTokenizer())
        query_vector = text_vectorizer.transform(query)
        distances = np.array([cosine_distance(text_vector[i].toarray()[0], query_vector.toarray()[0])
                              for i in range(text_vector.shape[0])])
    min_distance = np.argmin(distances)
    print("Documento mas parecido: {0}.\nDistancia: {1}\nTexto del documento:\n{2}".format(
        min_distance, np.amin(distances), documents[min_distance]))
def Clustering(orig, minclusters, maxclusters):
    '''returns (distortion score, number of clusters, cluster assignment)'''
    # perform clustering
    clusterer = GAAClusterer()
    clusterer.cluster(orig)
    vrc = []
    # calculate distortions
    wb = len(orig)
    centroid = numpy.mean(orig, axis=0)
    for vector in orig:
        wb -= cosine_distance(vector, centroid)
    lowerbound = minclusters
    if lowerbound < 2:
        lowerbound = 2
    for k in range(lowerbound, maxclusters + 1):
        clusterer.update_clusters(k)
        gaac = []
        ww = len(orig)
        for vector in orig:
            maxcos = None
            for j in range(k):
                clust = clusterer._centroids[j]
                cdist = cosine_distance(vector, clust)
                if not maxcos or cdist > maxcos[0]:
                    maxcos = (cdist, j)
            ww -= maxcos[0]
            gaac.append(maxcos[1])
        vrc.append(((wb / (k - 1)) / (ww / (len(orig) - k)), k, gaac))
    khat = (float("inf"), vrc[0][1], vrc[0][2])
    for k in range(1, len(vrc) - 1):
        dist = (vrc[k + 1][0] - vrc[k][0]) - (vrc[k][0] - vrc[k - 1][0])
        if dist < khat[0]:
            khat = (dist, vrc[k][1], vrc[k][2])
    return khat
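# Hypothetical usage sketch for Clustering() above (not part of the original source):
# it assumes numpy, nltk.cluster.GAAClusterer, and cosine_distance are already imported
# at module level, and that `orig` is a list of dense numpy feature vectors.
import numpy

rng = numpy.random.RandomState(0)
sample_vectors = [rng.rand(8) for _ in range(20)]  # 20 random 8-dimensional vectors
score, n_clusters, assignment = Clustering(sample_vectors, minclusters=2, maxclusters=5)
print(n_clusters, assignment)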
def classify_vectorspace(self, vector):
    best = None
    for i in range(self._num_clusters):
        centroid = self._centroids[i]
        dist = cosine_distance(vector, centroid)
        if not best or dist < best[0]:
            best = (dist, i)
    return best[1]
def sentence_similarity(sent1, sent2, stopwords):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)
def sentence_similarity(s1, s2, stopwords=None):
    if stopwords is None:
        stopwords = []
    s1 = [a.lower() for a in s1]
    s2 = [a.lower() for a in s2]
    all_words = list(set(s1 + s2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    for w in s1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
    for w in s2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)
def __sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)
def finding_similarity(line1, line2, stop=None):
    if stop is None:
        stop = list()
    line1 = [word.lower() for word in line1]
    line2 = [word.lower() for word in line2]
    all_words = list(set(line1 + line2))
    check1 = [0] * len(all_words)
    check2 = [0] * len(all_words)
    for word in line1:
        if word in stop:
            continue
        check1[all_words.index(word)] += 1
    for word in line2:
        if word in stop:
            continue
        check2[all_words.index(word)] += 1
    return 1 - cosine_distance(check1, check2)
def sentence_similarity(self, vector1, vector2, id_1, id_2, text_words_count, stopwords=None):
    r = vector1.shape[1]
    vector1 = np.array(np.reshape(vector1, (r, 1)))
    vector2 = np.array(np.reshape(vector2, (r, 1)))
    v1 = [vector1[i][0] for i in range(r)]
    v2 = [vector2[i][0] for i in range(r)]
    wc = text_words_count[-1]
    # if either vector is all zeros, fall back to the length-difference term only
    if sum(v1) == 0. or sum(v2) == 0.:
        return abs(text_words_count[id_2] - text_words_count[id_1]) * 0.25 / wc
    else:
        return (1 - cosine_distance(v1, v2)) * 0.75 + abs(
            text_words_count[id_2] - text_words_count[id_1]) * 0.25 / wc
def similar_sentence(self, sent1, sent2):
    if self.stop_words is None:
        self.stop_words = list()
    sent1 = list(map(lambda d: d.lower(), sent1))
    sent2 = list(map(lambda m: m.lower(), sent2))
    total_sentence = list(set(sent1 + sent2))
    vect1 = [0] * len(total_sentence)
    vect2 = [0] * len(total_sentence)
    for e in sent1:
        if e in self.stop_words:
            continue
        vect1[total_sentence.index(e)] += 1
    for m in sent2:
        if m in self.stop_words:
            continue
        vect2[total_sentence.index(m)] += 1
    return 1 - cosine_distance(vect1, vect2)
def sentence_similarity(sent1, sent2):
    """
    Compute the similarity between two sentences.
    :param sent1: first sentence as a list of words
    :param sent2: second sentence as a list of words
    :return: cosine similarity score
    """
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    for word in sent1:
        vector1[all_words.index(word)] += 1
    for word in sent2:
        vector2[all_words.index(word)] += 1
    # the larger the cosine_distance, the less similar the sentences
    return 1 - cosine_distance(vector1, vector2)
def cluster_vectorspace(self, vectors, trace=False):
    # variables describing the initial situation
    N = len(vectors)
    cluster_len = [1] * N
    cluster_count = N
    index_map = numpy.arange(N)

    # construct the pairwise cosine-distance matrix
    dims = (N, N)
    dist = numpy.ones(dims, dtype=float) * numpy.inf
    for i in range(N):
        for j in range(i + 1, N):
            dist[i, j] = cosine_distance(vectors[i], vectors[j])

    while cluster_count > max(self._num_clusters, 1):
        i, j = numpy.unravel_index(dist.argmin(), dims)
        if trace:
            print("merging %d and %d" % (i, j))

        # update similarities for merging i and j
        self._merge_similarities(dist, cluster_len, i, j)

        # remove j
        dist[:, j] = numpy.inf
        dist[j, :] = numpy.inf

        # merge the clusters
        cluster_len[i] = cluster_len[i] + cluster_len[j]
        self._dendrogram.merge(index_map[i], index_map[j])
        cluster_count -= 1

        # update the index map to reflect the indexes if we
        # had removed j
        index_map[j + 1:] -= 1
        index_map[j] = N

    self.update_clusters(self._num_clusters)
def sentence_similarity(sentence1, sentence2, stopwords=None):
    if stopwords is None:
        stopwords = []
    sentence1 = [word.lower() for word in sentence1]
    sentence2 = [word.lower() for word in sentence2]
    all_words = list(set(sentence1 + sentence2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # building vector for the first sentence
    for word in sentence1:
        if word in stopwords:
            continue
        vector1[all_words.index(word)] += 1
    # building vector for the second sentence
    for word in sentence2:
        if word in stopwords:
            continue
        vector2[all_words.index(word)] += 1
    return 1 - cosine_distance(vector1, vector2)
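# Hypothetical usage sketch for sentence_similarity() above (not part of the original
# source): it assumes cosine_distance comes from nltk.cluster.util and that the
# sentences are already tokenized into word lists.
from nltk.cluster.util import cosine_distance

sent_a = ["The", "gold", "shipment", "arrived", "in", "a", "truck"]
sent_b = ["A", "truck", "delivered", "the", "gold", "shipment"]
print(sentence_similarity(sent_a, sent_b, stopwords=["the", "a", "in"]))  # prints 0.75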
def get_sentence_similarity(s1, s2):
    stop_words = stopwords.words('english')
    s1 = [word.lower() for word in s1]
    s2 = [word.lower() for word in s2]
    all_words = list(set(s1 + s2))
    v1 = [0] * len(all_words)
    v2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in s1:
        if w in stop_words:
            continue
        v1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in s2:
        if w in stop_words:
            continue
        v2[all_words.index(w)] += 1
    return 1 - cosine_distance(v1, v2)
def sentence_similarity(sent1, sent2, stopwords=None):
    # use an empty list if no stopwords are available for the given language
    if stopwords is None:
        stopwords = []
    # convert the words to lowercase to match nltk's stopword lists
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    # remove redundant words
    all_words = list(set(sent1 + sent2))
    # initialize vectors for sent1 and sent2
    # (a numpy array could also be used with minor changes to the code)
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        # skip stopwords
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        # skip stopwords
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
    # cosine similarity = 1 - cosine distance
    return 1 - cosine_distance(vector1, vector2)
def findSimilarity():
    import os
    import numpy as np

    list_of_files = []
    for (dirpath, dirnames, filenames) in os.walk(categoryPath):
        for filename in filenames:
            target_file = os.path.join(dirpath, filename)
            list_of_files.append(target_file)
    for i in range(0, 50):
        ref = list_of_files[i]
        refDoc = preprocess(ref)
        refDict = buildDict(refDoc)
        for j in range(i + 1, 50, 1):
            candidate = list_of_files[j]
            candidateDoc = preprocess(candidate)
            candidateDict = buildDict(candidateDoc)
            combineDict = refDict.copy()
            combineDict.update(candidateDict)
            refWordList = getWordCountList(combineDict.keys(), refDict)
            candidateWordList = getWordCountList(combineDict.keys(), candidateDict)
            refArray = np.asarray(refWordList, dtype=int).reshape(-1)
            candidateArray = np.asarray(candidateWordList, dtype=int).reshape(-1)
            sim = cosine_distance(refArray, candidateArray)
            outputfile = getOutputFileName(ref, candidate)
            with open(outputfile, 'w') as writer:
                writer.write(str(float(sim)))
def sentence_similarity(self, sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
    stemmer = PorterStemmer()
    sent1 = [stemmer.stem(w.lower()) for w in sent1]
    sent2 = [stemmer.stem(w.lower()) for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)
def sentence_similarity(self, sent1, sent2):
    '''Calculate cosine similarity between two sentences'''
    sent1 = sent1.split(' ')
    sent2 = sent2.split(' ')
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)
def sent_similarity_calculation(s1, s2):
    # keep only alphabetic tokens and lowercase them
    s1_tokens = nltk.word_tokenize(s1)
    s2_tokens = nltk.word_tokenize(s2)
    s1 = ' '.join([w.lower() for w in s1_tokens if re.fullmatch(r'[a-zA-Z]*', w)])
    s2 = ' '.join([w.lower() for w in s2_tokens if re.fullmatch(r'[a-zA-Z]*', w)])
    # tokenization with stopword removal
    s1_tokens = list(filter(remove_stopwords, nltk.word_tokenize(s1)))
    s2_tokens = list(filter(remove_stopwords, nltk.word_tokenize(s2)))
    all_words = list(set(s1_tokens + s2_tokens))
    v1 = [0] * len(all_words)
    v2 = [0] * len(all_words)
    for x in s1_tokens:
        v1[all_words.index(x)] += 1
    for x in s2_tokens:
        v2[all_words.index(x)] += 1
    return 1 - cosine_distance(v1, v2)
def sentence_similarity(sent1, sent2):
    wakati = MeCab.Tagger('-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
    node1, node2 = wakati.parseToNode(sent1), wakati.parseToNode(sent2)
    sent1, sent2 = set(), set()
    # Exclude blanks and specific parts of speech (adverbs, particles, conjunctions, auxiliary verbs)
    while node1:
        word = node1.surface
        hinshi = node1.feature.split(",")[0]
        if word == " " or hinshi in ["副詞", "助詞", "接続詞", "助動詞"]:
            node1 = node1.next
            continue
        sent1.add(word)
        node1 = node1.next
    while node2:
        word = node2.surface
        hinshi = node2.feature.split(",")[0]
        if word == " " or hinshi in ["副詞", "助詞", "接続詞", "助動詞"]:
            node2 = node2.next
            continue
        sent2.add(word)
        node2 = node2.next
    # Bag of words
    all_words = list(sent1 | sent2)
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    for word in sent1:
        vector1[all_words.index(word)] += 1
    for word in sent2:
        vector2[all_words.index(word)] += 1
    # cosine similarity equals 1 - cosine distance
    return 1 - cosine_distance(vector1, vector2)
def sentence_similarity_calculator(self, vectorized_sentence_1, vectorized_sentence_2):
    return 1 - cosine_distance(vectorized_sentence_1, vectorized_sentence_2)
def similarity(s1, s2):
    s1_av = sentence_vector(tokenize_sentence(s1))
    s2_av = sentence_vector(tokenize_sentence(s2))
    return 1 - cosine_distance(s1_av, s2_av)
# fragment: runs inside the idx1/idx2 loops that fill similarity_matrix
# build word count vector for the first sentence
for w in lstWordsSent1:
    vecWordCount1[lstWordsInSents.index(w)] += 1
# build word count vector for the second sentence
for w in lstWordsSent2:
    vecWordCount2[lstWordsInSents.index(w)] += 1

# cosine distance
similarity_matrix[idx1][idx2] = 1 - cosine_distance(vecWordCount1, vecWordCount2)
print(similarity_matrix[idx1][idx2])
    # tail of get_samples(): accumulate each batch of embeddings into a single array
    if len(tweet_embeds) == 0:
        tweet_embeds = embeds
    else:
        tweet_embeds = np.vstack([tweet_embeds, embeds])
    return tweet_embeds


tweet_embeds = get_samples()
print(tweet_embeds.shape)
print(NUM_CLUSTERS)

kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance,
                             repeats=1, rng=RNG)
print('clustering...')
assigned_clusters = kclusterer.cluster(tweet_embeds, assign_clusters=True)
means = np.array(kclusterer.means())

print('calculating sum of distances...')
sum_dists = []
for i, c in enumerate(assigned_clusters):
    sum_dists.append(cosine_distance(means[c], tweet_embeds[i]))
print(np.mean(sum_dists))  # the smaller, the better

print('saving...')
np.save(OUTPUT_DIR + '/cluster_' + str(NUM_CLUSTERS) + '_means.npy', means)
def compute_text_similarity(text1, text2, text1tags, text2tags):
    """ Compute text similarity using cosine """
    # WordNetLemmatizer reduces inflected (or sometimes derived) words to their base or dictionary form
    stemmer = nltk.stem.WordNetLemmatizer()
    sentences_text1 = split_sentences(text1)
    sentences_text2 = split_sentences(text2)
    tokens_text1 = []
    tokens_text2 = []
    for element in text1tags:
        tokens_text1.extend(split_into_tokens(element))
    for element in text2tags:
        tokens_text2.extend(split_into_tokens(element))
    for sentence in sentences_text1:
        tokenstemp = split_into_tokens(sentence.lower())
        tokens_text1.extend(tokenstemp)
    for sentence in sentences_text2:
        tokenstemp = split_into_tokens(sentence.lower())
        tokens_text2.extend(tokenstemp)
    if len(text1tags) > 0:
        tokens_text1.extend(text1tags)
    if len(text2tags) > 0:
        tokens_text2.extend(text2tags)
    tokens1Filtered = [stemmer.lemmatize(x) for x in tokens_text1 if x not in stopWords]
    tokens2Filtered = [stemmer.lemmatize(x) for x in tokens_text2 if x not in stopWords]
    # remove duplicate tokens
    tokens1Filtered = set(tokens1Filtered)
    tokens2Filtered = set(tokens2Filtered)
    print("final tokens1 ", tokens_text1)
    print("final tokens2 ", tokens_text2)
    # build binary presence vectors over the smaller token set
    tokensList = []
    text1vector = []
    text2vector = []
    if len(tokens1Filtered) < len(tokens2Filtered):
        tokensList = tokens1Filtered
    else:
        tokensList = tokens2Filtered
    for token in tokensList:
        if token in tokens1Filtered:
            text1vector.append(1)
        else:
            text1vector.append(0)
        if token in tokens2Filtered:
            text2vector.append(1)
        else:
            text2vector.append(0)
    cosine_similarity = 1 - cosine_distance(text1vector, text2vector)
    if numpy.isnan(cosine_similarity):
        cosine_similarity = 0
    return cosine_similarity
def dist_bw_sent_doc_cos(vec1, vec2):
    # cosine distance between each sentence vector in vec1 and the document vector vec2
    dist_arr = {}
    for num, sent in vec1.items():
        dist = cosine_distance(vec2, sent)
        dist_arr[num] = dist
    return dist_arr
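# Hypothetical usage sketch for dist_bw_sent_doc_cos() above (not part of the original
# source): it assumes numpy and cosine_distance are imported, vec1 maps sentence ids to
# dense vectors, and vec2 is a document vector of the same dimensionality.
import numpy as np

sentence_vectors = {0: np.array([1.0, 0.0, 1.0]), 1: np.array([0.0, 1.0, 1.0])}
document_vector = np.array([1.0, 1.0, 1.0])
print(dist_bw_sent_doc_cos(sentence_vectors, document_vector))  # {0: ..., 1: ...}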
def compute_text_similarity(text1, text2, text1tags, text2tags):
    """ Compute text similarity using cosine """
    # stemming reduces inflected (or sometimes derived) words to their stem, base or root form
    tokens_text1 = []
    tokens_text2 = []
    stemmer = nltk.stem.porter.PorterStemmer()
    # build token lists from the tag lists only
    for element in text1tags:
        tokens_text1.extend(split_into_tokens(element))
    for element in text2tags:
        tokens_text2.extend(split_into_tokens(element))
    tokens1Filtered = [stemmer.stem(x) for x in tokens_text1 if x not in stopWords]
    tokens2Filtered = [stemmer.stem(x) for x in tokens_text2 if x not in stopWords]
    # remove duplicate tokens
    tokens1Filtered = set(tokens1Filtered)
    tokens2Filtered = set(tokens2Filtered)
    # build binary presence vectors over the smaller token set
    tokensList = []
    text1vector = []
    text2vector = []
    if len(tokens1Filtered) < len(tokens2Filtered):
        tokensList = tokens1Filtered
    else:
        tokensList = tokens2Filtered
    for token in tokensList:
        if token in tokens1Filtered:
            text1vector.append(1)
        else:
            text1vector.append(0)
        if token in tokens2Filtered:
            text2vector.append(1)
        else:
            text2vector.append(0)
    cosine_similarity = 1 - cosine_distance(text1vector, text2vector)
    if numpy.isnan(cosine_similarity):
        cosine_similarity = 0
    return cosine_similarity
def sentence_similarity(sent1, sent2, method, stop_words):
    if method == "glove":
        full_vect_1 = []
        full_vect_2 = []
        for word in preprocess_sentence(sent1, stop_words):
            try:
                full_vect_1 += [model_glove[word]]
            except KeyError:
                print(word)  # word not in the GloVe vocabulary
        for word in preprocess_sentence(sent2, stop_words):
            try:
                full_vect_2 += [model_glove[word]]
            except KeyError:
                print(word)
        vector_1 = np.mean(full_vect_1, axis=0)
        vector_2 = np.mean(full_vect_2, axis=0)
        return 1 - cosine_distance(vector_1, vector_2)
    elif method == "word2vec":
        full_vect_1 = []
        full_vect_2 = []
        for word in preprocess_sentence(sent1, stop_words):
            try:
                full_vect_1 += [model_word2vec[word]]
            except KeyError:
                print(word)  # word not in the word2vec vocabulary
        for word in preprocess_sentence(sent2, stop_words):
            try:
                full_vect_2 += [model_word2vec[word]]
            except KeyError:
                print(word)
        vector_1 = np.mean(full_vect_1, axis=0)
        vector_2 = np.mean(full_vect_2, axis=0)
        return 1 - cosine_distance(vector_1, vector_2)
    elif method == 'countvectorizer':
        preprocessed_sent1 = preprocess_sentence(sent1, stop_words)
        preprocessed_sent2 = preprocess_sentence(sent2, stop_words)
        all_words = list(set(preprocessed_sent1 + preprocessed_sent2))
        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)
        # build the vector for the first sentence (approach: count vectorizer)
        for w in preprocessed_sent1:
            vector1[all_words.index(w)] += 1
        # build the vector for the second sentence
        for w in preprocessed_sent2:
            vector2[all_words.index(w)] += 1
        return 1 - cosine_distance(vector1, vector2)
    elif method == 'tfidfvectorizer':
        preprocessed_sent1 = ' '.join(preprocess_sentence(sent1, stop_words))
        preprocessed_sent2 = ' '.join(preprocess_sentence(sent2, stop_words))
        tfidfvectorizer = TfidfVectorizer()
        X = tfidfvectorizer.fit_transform([preprocessed_sent1, preprocessed_sent2]).toarray()
        vector1 = X[0]
        vector2 = X[1]
        return 1 - cosine_distance(vector1, vector2)
def similar_sent(s1, s2):
    # similarity between two sentence vectors via cosine distance
    return 1 - cosine_distance(s1, s2)
sentences = raw.splitlines()

# get unique words
unique = list(set(words))

# vectorize the sentences with binary word-presence features
vectorized = []
n = len(sentences)
for i in range(n):
    vector = []
    for j in range(len(unique)):
        if unique[j] in sentences[i]:
            vector.append(1)
        else:
            vector.append(0)
    vectorized.append(vector)

# create the cosine distance matrix
dist = np.zeros(n**2).reshape((n, n))
for i in range(n):
    for j in range(i):
        dist[i][j] = cosine_distance(np.asarray(vectorized[i]), np.asarray(vectorized[j]))
        dist[j][i] = dist[i][j]

# plot it
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(dist, interpolation='nearest')
fig.colorbar(cax)
plt.show()
def getDsim(similarities, pairWiseSimilarityMatrix, index):
    # cosine similarity between the given row and every row of the matrix
    for i in range(0, pairWiseSimilarityMatrix.shape[0]):
        similarities[index][i] = 1 - cosine_distance(
            pairWiseSimilarityMatrix[index], pairWiseSimilarityMatrix[i])
def sentence_similarity(dfRow1, dfRow2):
    # each DataFrame row is expected to hold a sentence vector in its first column
    v1 = dfRow1.values[0]
    v2 = dfRow2.values[0]
    return 1 - cosine_distance(v1, v2)
print("K-Medias: {0} s".format(time.time()- t)) pickle.dump(clusters, open("clusters_test.pickle", "wb")) else: clusters = pickle.load(open("clusters_test.pickle", "rb")) i=1 print("Porcentaje de grupos similares entre los documentos") for query in newsgroups[:10]: query_vector = text_vectorizer.transform([query]) query_cluster = clusters.predict(query_vector) documents = text_vector[np.where(query_cluster == clusters.labels_)[0], :] distances = np.array([[j, cosine_distance(documents[j].toarray()[0], query_vector.toarray()[0])] for j in range(documents.shape[0])]) distances = distances[distances[:,-1].argsort()][:5] groups = [newsgroups_obj.target[int(j[0])] for j in distances] groups, unique_counts = np.unique(groups, return_counts=True) percentages = [j / np.sum(unique_counts) for j in unique_counts] print("Documento {0}, Grupo mas frecuente: {1}-{2}%".format(i, groups[np.argmax(percentages)], np.amax(percentages)*100)) i += 1 i= 1 print("Porcentaje de grupos similares al de la consulta") for query in newsgroups[:10]: query_vector = text_vectorizer.transform([query])
def sentence_simmilarity(s1, s2):
    return 1 - cosine_distance(s1, s2)
def sentence_similarity(self, vector1, vector2):
    return abs(1 - cosine_distance(vector1, vector2))