def get_clusters(subgraphs, threshold, init_gohe=True):
    if init_gohe:
        _init_gohe(subgraphs)
    assigned_clusters = {}
    cluster_id = 0
    for graph_id in tqdm(GRAPHS, desc='Clustering'):
        if graph_id in assigned_clusters:
            continue
        all_sim = np.array(
            [(key, cs(ENCODINGS[graph_id], ENCODINGS[key])[0][0]) for key in ENCODINGS],
            dtype=[('key', int), ('sim', float)])
        all_sim = np.sort(all_sim, order='key')
        for i in range(len(all_sim)):
            if all_sim[i][1] > threshold:
                if i not in assigned_clusters:
                    assigned_clusters[i] = cluster_id
        cluster_id += 1
    clusters = {}
    for k, v in assigned_clusters.items():
        if v in clusters:
            clusters[v].append(k)
        else:
            clusters[v] = [k]
    return clusters
def compare_to_optimum(df, optimum_dic):
    results = {}
    for instance in df[INSTANCE].unique():
        instance_subset = df[df[INSTANCE] == instance]
        optimal_value = optimum_dic[instance]
        algorithm_results = {}
        for algorithm in instance_subset[ALGORITHM].unique():
            algorithm_subset = instance_subset[instance_subset[ALGORITHM] == algorithm]
            permutations = [[int(h) for h in x.split(' ')]
                            for x in algorithm_subset[PERMUTATION].to_list()]
            temp_results = []
            for permutation in permutations:
                temp_results.append(
                    cs(np.asarray(permutation).reshape(1, -1),
                       np.asarray(optimal_value).reshape(1, -1)))
            algorithm_results[algorithm] = [x[0][0] for x in temp_results]
        results[instance] = algorithm_results
    pd.DataFrame.from_dict(results).to_csv('similarities_to_optimal.csv')
    plot(results, 'Similarity [%]', 'similarity_to_optimal.png')
    plot_best(results, 'Similarity [%]', 'similarity_to_optimal_best.png')
def compare_solutions(df):
    results = {}
    for instance in df[INSTANCE].unique():
        instance_subset = df[df[INSTANCE] == instance]
        algorithm_results = {}
        for algorithm in instance_subset[ALGORITHM].unique():
            algorithm_subset = instance_subset[instance_subset[ALGORITHM] == algorithm]
            permutations = [[int(h) for h in x.split(' ')]
                            for x in algorithm_subset[PERMUTATION].to_list()]
            temp_results = []
            for i in range(len(permutations)):
                permutation_1 = np.asarray(permutations[i]).reshape(1, -1)
                for j in range(len(permutations)):
                    if i != j:
                        permutation_2 = np.asarray(permutations[j]).reshape(1, -1)
                        temp_results.append(cs(permutation_1, permutation_2))
            algorithm_results[algorithm] = [x[0][0] for x in temp_results]
            # print(temp_results)
        results[instance] = algorithm_results
    results_df = pd.DataFrame.from_dict(results)
    results_df.to_csv('solution_similarities.csv')
    plot(results, 'Similarity [%]', 'similarities.png')
def get_similarity_values(q1_csc, q2_csc):
    cosine_sim = []
    manhattan_dis = []
    eucledian_dis = []
    jaccard_dis = []
    minkowsk_dis = []
    for i, j in zip(q1_csc, q2_csc):
        sim = cs(i, j)
        cosine_sim.append(sim[0][0])
        sim = md(i, j)
        manhattan_dis.append(sim[0][0])
        sim = ed(i, j)
        eucledian_dis.append(sim[0][0])
        i_ = i.toarray()
        j_ = j.toarray()
        try:
            sim = jsc(i_, j_)
            jaccard_dis.append(sim)
        except:
            jaccard_dis.append(0)
        sim = minkowski_dis.pairwise(i_, j_)
        minkowsk_dis.append(sim[0][0])
    return cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis
def vectorToSentence(matrix, map, V):  # Array of word_embeddings
    s = ""
    dim = len(matrix)
    for w in V:
        cosine_list = list(np.asarray(cs(matrix, w)).flatten())
        s = s + " " + map[cosine_list.index(max(cosine_list))]
    return s
def findSimilarSongs(songIDs):
    indices = []
    for ID in songIDs:
        indices.append(searchbyID(ID))
    sum = 0
    for index in indices:
        sum += x[index]
    meanvector = sum / len(indices)
    meanvector = np.array(meanvector).reshape(1, -1)
    similarity_vals = cs(x, meanvector)
    combined = []
    for i in range(len(similarity_vals)):
        if similarity_vals[i] > 0.9:
            combined.append((i, similarity_vals[i]))
    combined.sort(key=lambda x: x[1])
    if len(combined) > 12:
        combined = combined[:12]
    sim = [i for i, s in combined]
    similar_songs_IDs = []
    for song_id in full_data.iloc[sim]['id']:
        similar_songs_IDs.append(song_id)
    return similar_songs_IDs
def get_top_k_cosine_sim(graph_id, k):
    all_sim = np.array(
        [(key, cs(ENCODINGS[graph_id], ENCODINGS[key])[0][0])
         for key in tqdm(ENCODINGS, desc="Calculating Cosine Similarity for Graph " + str(graph_id))],
        dtype=[('key', int), ('sim', float)])
    all_sim = np.sort(all_sim, order='sim')[::-1]
    return all_sim[1:k + 1]
def _create_weighted_distance_features(self, df):
    q1_matrix = self.tfidf_vectorizer.transform(df['spn_1'].values.tolist())
    q2_matrix = self.tfidf_vectorizer.transform(df['spn_2'].values.tolist())
    df['weighted_cosine_sim'] = np.concatenate([
        cs(q1_matrix[i], q2_matrix[i]).flatten()
        for i in range(q1_matrix.shape[0])
    ])
def predict_labels(self, embeddings):
    dis_cs = cs(embeddings, self.arr_embeddings)
    index_list = np.argmax(dis_cs, axis=-1)
    label_pred = []
    for i, index in enumerate(index_list):
        if dis_cs[i][index] > 0.6:
            label_pred.append(self.labels[index])
        else:
            label_pred.append("unknown")
    return label_pred
def getFile(url, pid, spid, isview):
    lst = []
    export = Workbook()
    export_sheet = export.add_sheet('match')
    book = open_workbook(ur(filebaseURL + url)[0])
    if isinstance(book, Book):
        sheet = book.sheet_by_index(0)
        for i in range(sheet.nrows):
            lst.append(sheet.cell_value(i, 0))
    instance = TfidfVectorizer()
    matrix = instance.fit_transform(lst)
    cosine_matrix = cs(matrix, matrix)
    k = 0
    outer_arr = []
    for i in range(len(cosine_matrix)):
        fl = list(cosine_matrix[i])
        incr = 0
        n_lst = lst[:i] + lst[i + 1:]
        dic = {}
        for j in fl[:i] + fl[i + 1:]:
            if j * 100 > 80:
                dic['string'] = lst[i]
                dic['matched_with'] = n_lst[incr]
                dic['percent'] = str(j * 100)[:6]
                k += 1
                outer_arr.append(dic)
            print i, incr
            incr += 1
    if len(outer_arr) == 0:
        retval = pushBulk(lst, pid, spid)
        if retval == -1:
            return dumps({
                'Response Code': '200',
                'Response Message': 'Unsuccessful.',
                'Response Data': ''
            })
        else:
            return dumps({
                'Response Code': 200,
                'Response Message': 'Success',
                'Response Data': retval
            })
    else:
        try:
            return dumps({
                'Response Code': 200,
                'Response Message': 'Success',
                'Response Data in file': outer_arr
            })
        except:
            return dumps({
                'Response Code': 500,
                'Response Message': 'Unsuccessful',
                'Response Data': []
            })
def score(self, fake_audio_features):
    total_score = 0.0
    for index_video, fake_audio_feature in enumerate(fake_audio_features):
        similarity = list()
        for index_audio, audio_feature in enumerate(self.audio_features):
            fake_audio_feature = np.array(fake_audio_feature).reshape(1, -1)
            audio_feature = np.array(audio_feature).reshape(1, -1)
            sim = cs(fake_audio_feature, audio_feature).tolist()
            similarity += sim[0]
        ranking = np.argsort(similarity)
        return ranking.tolist().index(0)
def symsearch():
    query = request.args.get("query")
    result = emb([query])
    possible = []
    trained_data = db.embedding.find()
    for value in trained_data:
        out = cs(result, [value["result"]])
        if out[0][0] >= 0.2:
            similar = {"text": value["text"], "similarity": out[0][0]}
            possible.append(similar)
    searchout = sorted(possible, key=itemgetter('similarity'), reverse=True)
    return jsonify(result=searchout)
def cossim(doc1, doc2):
    from sklearn.metrics.pairwise import cosine_similarity as cs
    from sklearn.feature_extraction.text import CountVectorizer as cv
    x = [doc1, doc2]
    vectorizer = cv().fit_transform(x)
    vectors = vectorizer.toarray()
    a = vectors[0].reshape(1, -1)
    b = vectors[1].reshape(1, -1)
    similarity_score = cs(a, b)
    return similarity_score
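# A minimal usage sketch for cossim above; the two example strings are
# illustrative and not taken from the original source.
example_score = cossim("the cat sat on the mat", "a cat sat on a mat")
print(example_score)  # a 1x1 similarity matrix, e.g. roughly [[0.71]]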
def predict_labels(self, embeddings, embeddings_source, labels_index, labels_name):
    dis_cs = cs(embeddings, embeddings_source)
    index_list = np.argmax(dis_cs, axis=-1)
    label_pred = []
    for i, index in enumerate(index_list):
        if dis_cs[i][index] > 0.6:
            label_index = labels_index[index]
            for i, (index_tmp, name_tmp) in enumerate(labels_name):
                if label_index == index_tmp:
                    label_pred.append(labels_name[i])
        else:
            label_pred.append([-1, "unknown"])
    return label_pred
def score(self, fake_audio_features):
    total_score = 0.0
    for index_video, fake_audio_feature in enumerate(fake_audio_features):
        similarity = list()
        for index_audio, audio_feature in enumerate(self.audio_features):
            fake_audio_feature = np.array(fake_audio_feature).reshape(1, -1)
            audio_feature = np.array(audio_feature).reshape(1, -1)
            sim = cs(fake_audio_feature, audio_feature).tolist()
            similarity += sim[0]
        ranking = np.argsort(similarity)
        total_score += float(ranking[index_video]) / float(self.dataset_length)
    return total_score / float(self.dataset_length)
def get_similarity_values(res_csc, jd_csc):
    cosine_sim = []
    manhattan_dis = []
    eucledian_dis = []
    j = jd_csc
    for i in res_csc:
        sim = cs(i, j)
        cosine_sim.append(sim[0][0])
        sim = md(i, j)
        manhattan_dis.append(sim[0][0])
        sim = ed(i, j)
        eucledian_dis.append(sim[0][0])
    return cosine_sim, manhattan_dis, eucledian_dis
def bm25_dist(row, dist_type, bm25_model, average_idf, feature_dim):
    assert dist_type in ['cs', 'ed', 'md'], 'dist type error'
    q1 = row['q1_w'].split()
    q2 = row['q2_w'].split()
    q1_bm25 = bm25_model.get_scores(q1, average_idf)
    q2_bm25 = bm25_model.get_scores(q2, average_idf)
    q1_bm25 = np.reshape(np.array(q1_bm25), (-1, feature_dim))
    q2_bm25 = np.reshape(np.array(q2_bm25), (-1, feature_dim))
    if dist_type == 'cs':
        score = cs(q1_bm25, q2_bm25).flatten()[0]
    elif dist_type == 'ed':
        score = ed(q1_bm25, q2_bm25).flatten()[0]
    elif dist_type == 'md':
        score = md(q1_bm25, q2_bm25).flatten()[0]
    return score
def conversacion(respuesta):
    resRob = ''
    sent_token.append(respuesta)
    fv = tfd(tokenizer=LemNormalizacion)
    fid = fv.fit_transform(sent_token)
    valores = cs(fid[-1], fid)
    index = valores.argsort()[0][-2]
    flat = valores.flatten()
    flat.sort()
    request = flat[-2]
    if (request == 0):
        resRob += random.choice(Desentendido)
        return resRob
    else:
        resRob += sent_token[index]
        return resRob
def score_library(self, fake_audio_features):
    total_score = 0.0
    for index_video, fake_audio_feature in enumerate(fake_audio_features):
        similarity = list()
        # print(len(self.library_features))
        tmp_library_features = copy.deepcopy(self.library_features)
        tmp_library_features.append(self.audio_features[index_video])
        for index_audio, audio_feature in enumerate(tmp_library_features):
            fake_audio_feature = np.array(fake_audio_feature).reshape(1, -1)
            audio_feature = np.array(audio_feature).reshape(1, -1)
            sim = cs(fake_audio_feature, audio_feature).tolist()
            similarity += sim[0]
        ranking = np.argsort(similarity)
        total_score += float(ranking[-1]) / float(self.library_length)
    return total_score / float(self.dataset_length)
def cos_sim(filename, extracted, questions, tags):
    index, questions_matrix = get_question_matrix(filename, questions, tags)
    token_matrix = np.zeros((1, len(tags)))
    for token in extracted:
        j = index[token[0]]  # token.lemma
        token_matrix[0, j] = token[1]  # token.depth
    values = cs(token_matrix, questions_matrix)[0]
    position = np.argmax(values)
    # print(position, values[position], [t.text for t in question], [t.text for t in questions[position]])
    return values[position], position + 1
def __get_topic_sim(self, seg_list, dictionary, tfidf_model, model, wordtopic_dic):
    sentcorpus = tfidf_model[dictionary.doc2bow(seg_list)]
    senttopic = model[sentcorpus]
    sim_dict = {}
    for word in seg_list:
        if word in wordtopic_dic:
            word_topic = wordtopic_dic[word]
            sim = cs([[item[1] for item in word_topic]],
                     [[item[1] for item in senttopic]])
            sim_dict[word] = sim[0][0]
    return [
        k for k, _ in sorted(
            sim_dict.items(), key=operator.itemgetter(1), reverse=True)
    ]
def score_library(self, fake_audio_features):
    total_score = 0.0
    for index_video, fake_audio_feature in enumerate(fake_audio_features):
        similarity = list()
        tmp_library_features = copy.deepcopy(self.library_features)
        tmp_library_features.append(self.audio_features[index_video])
        for index_audio, audio_feature in enumerate(tmp_library_features):
            fake_audio_feature = np.array(fake_audio_feature).reshape(1, -1)
            audio_feature = np.array(audio_feature).reshape(1, -1)
            sim = cs(fake_audio_feature, audio_feature).tolist()
            similarity += sim[0]
        ranking = np.argsort(similarity)
        output = ranking.tolist().index(0)
        if output == 100:
            return ranking.tolist().index(1)
        else:
            return output
def get_cossim(self, sent1, sent2):
    """
    Compute the cosine similarity between two sentences.
    :param sent1: sentence 1
    :param sent2: sentence 2
    :return: cosine similarity of the two sentences
    """
    if isinstance(sent1, str):
        sent1 = self.cleaner(sent1,
                             stopwords=self.stopwords,
                             specialwords=self.specialwords,
                             remove_alphas=self.remove_alphas,
                             remove_numbers=self.remove_numbers,
                             remove_urls=self.remove_urls,
                             remove_punctuation=self.remove_punctuation,
                             remove_email=self.remove_email,
                             remove_ip_address=self.remove_ip_address,
                             keep_chinese_only=self.keep_chinese_only)
        seg_sent1 = [" ".join(self.seg(sent1, pos=False))]
    else:
        raise ValueError('Please input a str format sentence (´▽`)ノ ')
    if isinstance(sent2, str):
        sent2 = self.cleaner(sent2,
                             stopwords=self.stopwords,
                             specialwords=self.specialwords,
                             remove_alphas=self.remove_alphas,
                             remove_numbers=self.remove_numbers,
                             remove_urls=self.remove_urls,
                             remove_punctuation=self.remove_punctuation,
                             remove_email=self.remove_email,
                             remove_ip_address=self.remove_ip_address,
                             keep_chinese_only=self.keep_chinese_only)
        seg_sent2 = [" ".join(self.seg(sent2, pos=False))]
    else:
        raise ValueError('Please input a str format sentence (´▽`)ノ ')
    if self.tfidf_vectorizer is None:
        raise ValueError("Please build tfidf_vectorizer with corpus...")
    s1_matrix = self.tfidf_vectorizer.transform(seg_sent1)
    s2_matrix = self.tfidf_vectorizer.transform(seg_sent2)
    return cs(s1_matrix, s2_matrix).flatten()[0]
def cosine_similarity(p, q, transpose_p=False, transpose_q=False):
    """
    Computes the cosine similarity of two d-dimensional matrices
    :param p: d-dimensional vector (np.ndarray) of shape (p_samples, d)
    :param q: d-dimensional vector (np.ndarray) of shape (q_samples, d)
    :param transpose_p: whether to transpose p or not
    :param transpose_q: whether to transpose q or not
    :return - cosine similarity matrix S of shape (p_samples, q_samples) where S[i, j] = s(p[i], q[j])
    """
    # If it is a vector, consider it as a single sample matrix
    if len(p.shape) == 1:
        p = p.reshape(1, -1)
    if len(q.shape) == 1:
        q = q.reshape(1, -1)
    # cosine similarity: sum(p_i*q_i) / (sqrt(sum(p_i^2)) * sqrt(sum(q_i^2)))
    '''if transpose_p:
        p = np.transpose(p)
    if transpose_q:
        q = np.transpose(q)
    '''
    '''
    matrix = scipy.sparse.lil_matrix((p.shape[0], q.shape[0]))
    for i, pi in enumerate(p):
        for j, qj in enumerate(q):
            n = sum([a*b for a, b in zip(pi, qj)])
            d1 = sqrt(sum(np.array(list(map(lambda x: x*x, pi)))))
            d2 = sqrt(sum(np.array(list(map(lambda x: x*x, qj)))))
            matrix[i, j] = n/(d1*d2)
    '''
    matrix = cs(p, q)
    return matrix
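# A minimal usage sketch for cosine_similarity above (the arrays are made up
# for illustration; numpy is assumed to be imported as np and sklearn's
# cosine_similarity as cs, as in the surrounding module).
p_example = np.array([1.0, 0.0, 1.0])        # 1-D input, treated as a single sample
q_example = np.array([[1.0, 0.0, 1.0],
                      [0.0, 1.0, 0.0]])      # two samples of dimension 3
sim_example = cosine_similarity(p_example, q_example)
print(sim_example.shape)  # (1, 2)
print(sim_example)        # [[1. 0.]]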
def document_cluster(self, entity_dict):
    vector_sample = []
    for j in entity_dict:
        str_ = entity_dict[j]
        if len(str_) > 1:
            vector_sample.append(str_)
    split_list = list(map(generate_ngram, vector_sample))
    abbv_list = list(map(abbv, vector_sample))
    tfidf_vecorizor = TfidfVectorizer(stop_words=[])
    split_list_tf_idf = tfidf_vecorizor.fit_transform(split_list)
    pw = cs(split_list_tf_idf, split_list_tf_idf)
    edge_set = set()
    node_set = set()
    for ii in range(pw.shape[0]):
        node_set.add(ii)
        for j in range(ii, pw.shape[0]):
            if pw[ii][j] > 0.5:
                edge_set.add((ii, j))
            elif pw[ii][j] > 0.3 and abbv_list[ii] == abbv_list[j]:
                edge_set.add((ii, j))
    G = nx.Graph()
    for ii in node_set:
        G.add_node(ii, attribute=vector_sample[ii])
    G.add_edges_from(list(edge_set))
    cp = sorted(nx.connected_components(G), key=len, reverse=True)
    inner_cluster = []
    for j in range(len(cp)):
        clu = []
        for n in cp[j]:
            clu.append(vector_sample[n])
        inner_cluster.append(clu)
    return inner_cluster
cols = df.columns.values
cols1 = df1.columns.values
# df3 = pd.read_csv("cor3.csv", encoding='utf-8')
# for i in xrange(len(df3.index.values)):
#     feature1 = df3.iloc[i, 0].strip()
#     feature2 = df3.iloc[i, 1].strip()
#     # print [feature1, feature2]
#     if (feature1 in cols and feature2 in cols1) or (feature1 in cols1 and feature2 in cols):
#         print str(feature1) + ' & ' + str(feature2) + ' & ' + str(round(df3.iloc[i, 2], 2)) + ' \\\\ \\hline'
features_edu = [i for i in xrange(1, len(cols))]
features_health = [i for i in xrange(1, len(cols1))]
dic = {}  # assumed initialization; not shown in the original snippet
for i in features_health:
    for j in features_edu:
        if cols1[i] in cols:
            # print cols1[i]
            continue
        lis1 = np.array([df1[cols1[i]].values])
        lis2 = np.array([df[cols[j]].values])
        dic[(i, j)] = cs(lis1, lis2)[0][0]
lis = sorted(dic.keys(), key=lambda x: dic[x])[::-1]
# for i in features_health:
#     for j in features_health:
#         if i < j:
#             lis1 = np.array([df1[cols1[i]].values])
#             lis2 = np.array([df1[cols1[j]].values])
#             dic[(i, j)] = cs(lis1, lis2)[0][0]
# lis = sorted(dic.keys(), key=lambda x: abs(dic[x]))[::-1]
for i in lis:
    if abs(dic[i]) > 0.98:
        if cols1[i[0]].split('_')[0] != cols[i[1]].split('_')[0] and re.sub('[0-9]', '', cols1[i[0]]) != re.sub('[0-9]', '', cols[i[1]]):
            print str(cols1[i[0]]) + ' & ' + str(cols[i[1]]) + ' & ' + str(round(dic[i], 3)) + ' \\\\ \\hline'
def global_connected_component(self):
    min_hash_table = {}
    G_all = nx.Graph()
    node_set_all = set()
    edge_set_all = set()
    node_id = 0
    id_to_node = {}
    for doc in self.inner_cluster:
        for clu in self.inner_cluster[doc]:
            node_set_doc = []
            max_count = 0
            mem = None
            node_id_ = None
            for ent in clu:
                node_set_all.add(node_id)
                node_set_doc.append(node_id)
                id_to_node[node_id] = (ent, doc)
                count = self.clean_count[doc][ent]
                if count > max_count:
                    if mem and len(mem) < 3:
                        continue
                    max_count = count
                    mem = ent
                    node_id_ = node_id
                node_id += 1
            for i in range(len(node_set_doc) - 1):
                edge_set_all.add((node_set_doc[i], node_set_doc[i + 1]))
            if mem:
                hash_code = getminHash(mem, 1) * 100 + getminHash(mem, 0)
                if hash_code not in min_hash_table:
                    min_hash_table[hash_code] = []
                min_hash_table[hash_code].append((mem, doc, node_id_))
    # print(len(min_hash_table))
    for h in min_hash_table:
        hash_node = {}
        for n in min_hash_table[h]:
            word = n[0]
            if word not in hash_node:
                hash_node[word] = []
            hash_node[word].append(n[2])
        check_cluster = list(set(map(lambda x: x[0], min_hash_table[h])))
        split_list = map(generate_ngram, check_cluster)
        tfidf_vecorizor = TfidfVectorizer(stop_words=[])
        split_list_tf_idf = tfidf_vecorizor.fit_transform(split_list)
        pw = cs(split_list_tf_idf, split_list_tf_idf)
        edge_set = set()
        node_set = set()
        for ii in range(pw.shape[0]):
            node_set.add(ii)
            for j in range(ii, pw.shape[0]):
                if pw[ii][j] > 0.5:
                    edge_set.add((ii, j))
        G = nx.Graph()
        for ii in node_set:
            G.add_node(ii, attribute=check_cluster[ii])
        G.add_edges_from(list(edge_set))
        cp = sorted(nx.connected_components(G), key=len, reverse=True)
        for j in range(len(cp)):
            clu_node = []
            for n in cp[j]:
                for nn in hash_node[check_cluster[n]]:
                    clu_node.append(nn)
            for nod_no in range(len(clu_node) - 1):
                edge_set_all.add((clu_node[nod_no], clu_node[nod_no + 1]))
    G_all = nx.Graph()
    G_all.add_edges_from(list(edge_set_all))
    cp = sorted(nx.connected_components(G_all), key=len, reverse=True)
    for clu_node in cp:
        name_count = {}
        mention = []
        for ii in clu_node:
            ent_tuple = id_to_node[ii]
            name = ent_tuple[0]
            doc = ent_tuple[1]
            if name not in name_count:
                name_count[name] = 0
            name_count[name] += self.clean_count[doc][name]
            for ori_mem in self.clean_to_unclean[doc][name]:
                mention.append({"mention": ori_mem, "doc": doc})
        enitiy = {"mention": mention,
                  "name": sorted(name_count.items(), key=lambda x: x[1], reverse=True)[0][0]}
        self.res.append(enitiy)
def extract_tfidf_feature(self, df):
    q1_w_vec = self.tfidf_vectorizer.transform(df['q1_w'].values.tolist())
    q2_w_vec = self.tfidf_vectorizer.transform(df['q2_w'].values.tolist())
    df['tfidf_cs'] = np.concatenate([
        cs(q1_w_vec[i], q2_w_vec[i]).flatten()
        for i in range(q1_w_vec.shape[0])
    ])
    df['tfidf_ed'] = np.concatenate([
        ed(q1_w_vec[i], q2_w_vec[i]).flatten()
        for i in range(q1_w_vec.shape[0])
    ])
    df['tfidf_md'] = np.concatenate([
        md(q1_w_vec[i], q2_w_vec[i]).flatten()
        for i in range(q1_w_vec.shape[0])
    ])
    corpus_tfidf = np.concatenate([q1_w_vec.toarray(), q2_w_vec.toarray()], axis=0)
    svd_model = TruncatedSVD(n_components=5)
    svd_model.fit(corpus_tfidf)
    svd_topic = svd_model.transform(corpus_tfidf)
    q1_w_svd_feature = svd_topic[:q1_w_vec.shape[0]]
    q2_w_svd_feature = svd_topic[q1_w_vec.shape[0]:]
    df['svd_cs'] = np.concatenate([
        cs(q1_w_svd_feature[i].reshape(-1, 5),
           q2_w_svd_feature[i].reshape(-1, 5)).flatten()
        for i in range(q1_w_svd_feature.shape[0])
    ])
    df['svd_ed'] = np.concatenate([
        ed(q1_w_svd_feature[i].reshape(-1, 5),
           q2_w_svd_feature[i].reshape(-1, 5)).flatten()
        for i in range(q1_w_svd_feature.shape[0])
    ])
    df['svd_md'] = np.concatenate([
        md(q1_w_svd_feature[i].reshape(-1, 5),
           q2_w_svd_feature[i].reshape(-1, 5)).flatten()
        for i in range(q1_w_svd_feature.shape[0])
    ])
    lda_model = LatentDirichletAllocation(n_components=5, random_state=0)
    lda_model.fit(corpus_tfidf)
    lda_topic = lda_model.transform(corpus_tfidf)
    q1_w_lda_feature = lda_topic[:q1_w_vec.shape[0]]
    q2_w_lda_feature = lda_topic[q1_w_vec.shape[0]:]
    df['lda_cs'] = np.concatenate([
        cs(q1_w_lda_feature[i].reshape(-1, 5),
           q2_w_lda_feature[i].reshape(-1, 5)).flatten()
        for i in range(q1_w_lda_feature.shape[0])
    ])
    df['lda_ed'] = np.concatenate([
        ed(q1_w_lda_feature[i].reshape(-1, 5),
           q2_w_lda_feature[i].reshape(-1, 5)).flatten()
        for i in range(q1_w_lda_feature.shape[0])
    ])
    df['lda_md'] = np.concatenate([
        md(q1_w_lda_feature[i].reshape(-1, 5),
           q2_w_lda_feature[i].reshape(-1, 5)).flatten()
        for i in range(q1_w_lda_feature.shape[0])
    ])
# Import the necessary packages for performing similarity between texts.
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.feature_extraction.text import TfidfVectorizer as tv

# Load the texts - The original & The generated
gensis = open('../Genesis.txt', 'r').read().split('\r')
model_gensis = open('../Model_Genesis.txt', 'r').read().split('\n')

# Initialize
TfV = tv()
TfV.fit(gensis)
Y = TfV.transform(gensis)

# Check for every sentence the similarity
similaritySum = 0
for sentence in model_gensis:
    X = TfV.transform([sentence])
    print(sentence)
    print(gensis[cs(X, Y).argmax()])
    print(' ')
    similaritySum += cs(X, Y).max()  # accumulate the best-match score

# Calculate the similarity
similarity = similaritySum / 7
print('The similarity between the original text - Genesis - and the model is: ', similarity)
movies_file.set_index('show_id', drop=False)
maj_list = []
min_list = []
genre = movies_file['listed_in']
genre_list = list(genre)
genre_list_new = []
# modify the genre column by replacing commas with spaces
for i in genre_list:
    i = i.replace(',', ' ')
    genre_list_new.append(i)
count_matrix = cv.fit_transform(genre_list_new)
count_matrix_array = count_matrix.toarray()
# limits #################################
init_movieval = 0
final_movieval = 1960 + 1
##########################################
# compare 2 at a time (to increase the number of movies that can be used),
# use the movie if cs_value > 0.8, discard others
for i in range(1, 1960):
    for j in range(init_movieval, final_movieval):
        similarity_scores = cs([count_matrix_array[i], count_matrix_array[j]])
        # discard unnecessary movies
        # print(similarity_scores)
        if similarity_scores[0][1] > 0.82:
            tu = (j, similarity_scores[0][1])
            min_list.append(tu)
    maj_list.append(min_list)
    min_list = []
a.writerows(maj_list)
def weat_analysis(embedding, bias_weat_combinations, sets, steps=-1, print_weat=False, matrices_check=True):
    """
    For a given embedding and WEAT test combinations, generate experimental WEAT results.

    Parameters
    ----------
    embedding: Embedding | instance of class Embedding.
    bias_weat_combinations: dict | structure which contains subclass/target and attribute sets.
    sets: dict | structure containing all attribute and subclass/target sets of words.
    steps: int | scalar representing the number of iterations for generating the p value (if -1, all combinations are taken).
    print_weat: bool | whether to print results or not.
    matrices_check: bool | whether to compute cosine similarity values between sets.

    Returns
    -------
    final_values: dict | effect sizes and p values for all WEAT tests
    bias_levels_d: dict | bias levels for each class respectively
    d_values: list | list of all WEAT test effect sizes
    p_values: list | list of all WEAT test p values
    cs_matrix: dict | for each WEAT test, a matrix of cosine similarity values between all words of the mutual target-attribute sets
    """
    final_values = {}
    p_values, d_values = [], []
    cs_matrix = {}
    # category notation is used here instead of class notation (category = class)
    for category in bias_weat_combinations:
        final_values[category] = []
        d_values_category, p_values_category = [], []
        for category_target_pair in bias_weat_combinations[category]:
            for attribute_pair in bias_weat_combinations[category][category_target_pair]:
                p, d = WEAT(embedding,
                            sets[category_target_pair[0]],
                            sets[category_target_pair[1]],
                            sets[attribute_pair[0]],
                            sets[attribute_pair[1]],
                            steps).get_stats()
                if matrices_check == True:
                    a1t1 = cs([embedding.get_value(word) for word in sets[attribute_pair[0]]],
                              [embedding.get_value(word) for word in sets[category_target_pair[0]]])
                    a2t1 = cs([embedding.get_value(word) for word in sets[attribute_pair[1]]],
                              [embedding.get_value(word) for word in sets[category_target_pair[0]]])
                    a1t2 = cs([embedding.get_value(word) for word in sets[attribute_pair[0]]],
                              [embedding.get_value(word) for word in sets[category_target_pair[1]]])
                    a2t2 = cs([embedding.get_value(word) for word in sets[attribute_pair[1]]],
                              [embedding.get_value(word) for word in sets[category_target_pair[1]]])
                    cs_matrix[(category_target_pair[0], category_target_pair[1],
                               attribute_pair[0], attribute_pair[1])] = np.array([[a1t1, a1t2],
                                                                                  [a2t1, a2t2]])
                if print_weat == True:
                    if np.abs(d) > 0.7:
                        csm = cs_matrix[(category_target_pair[0], category_target_pair[1],
                                         attribute_pair[0], attribute_pair[1])]
                        cs_res = np.array([[np.mean(csm[0, 0]), np.mean(csm[0, 1])],
                                           [np.mean(csm[1, 0]), np.mean(csm[1, 1])]])
                        print(
                            f'\nBIAS: {attribute_pair[0]}, {attribute_pair[1]}, {category_target_pair[0]}, {category_target_pair[1]} : {p} ||| {"%.4f" % d} \n{cs_res}\n'
                        )
                    else:
                        print(
                            f'{attribute_pair[0]}, {attribute_pair[1]}, {category_target_pair[0]}, {category_target_pair[1]} : {p} ||| {"%.4f" % d}'
                        )
                final_values[category].append([
                    category_target_pair[0], category_target_pair[1],
                    attribute_pair[0], attribute_pair[1], p, d
                ])
                p_values.append(p)
                d_values.append(d)
                p_values_category.append(p)
                d_values_category.append(np.abs(d) / 2)
    bias_levels_d = average_bias_value(final_values)
    return final_values, dict(
        sorted(bias_levels_d.items(), key=lambda x: x[1], reverse=True)), d_values, p_values, cs_matrix