def get_clusters(subgraphs, threshold, init_gohe=True):
    if init_gohe:
        _init_gohe(subgraphs)

    assigned_clusters = {}
    cluster_id = 0

    for graph_id in tqdm(GRAPHS, desc='Clustering'):
        if graph_id in assigned_clusters:
            continue
        all_sim = np.array([(key, cs(ENCODINGS[graph_id], ENCODINGS[key])[0][0])
                                for key in ENCODINGS],
                                dtype=[('key', int), ('sim', float)])
        all_sim = np.sort(all_sim, order='key')
        for i in range(len(all_sim)):
            if all_sim[i][1] > threshold:
                if i not in assigned_clusters:
                    assigned_clusters[i] = cluster_id
        cluster_id += 1

    clusters = {}
    for k, v in assigned_clusters.items():
        if v in clusters:
            clusters[v].append(k)
        else:
            clusters[v] = [k]

    return clusters
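Both this function and get_top_k_cosine_sim further down rely on a NumPy structured array to keep (graph id, similarity) pairs sortable by either field; a minimal, self-contained illustration of that pattern (the values are made up):
import numpy as np

# (key, sim) pairs stored as a structured array, as in the clustering code above
sims = np.array([(3, 0.91), (1, 0.42), (2, 0.77)],
                dtype=[('key', int), ('sim', float)])
by_key = np.sort(sims, order='key')         # ascending by graph id
by_sim = np.sort(sims, order='sim')[::-1]   # descending by similarity
print(by_key['key'])   # [1 2 3]
print(by_sim['sim'])   # [0.91 0.77 0.42]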
Example #2
def compare_to_optimum(df, optimum_dic):
    results = {}
    for instance in df[INSTANCE].unique():
        instance_subset = df[df[INSTANCE] == instance]
        optimal_value = optimum_dic[instance]

        algorithm_results = {}
        for algorithm in instance_subset[ALGORITHM].unique():
            algorithm_subset = instance_subset[instance_subset[ALGORITHM] ==
                                               algorithm]
            permutations = [[int(h) for h in x.split(' ')]
                            for x in algorithm_subset[PERMUTATION].to_list()]

            temp_results = []
            for permutation in permutations:
                temp_results.append(
                    cs(
                        np.asarray(permutation).reshape(1, -1),
                        np.asarray(optimal_value).reshape(1, -1)))

            algorithm_results[algorithm] = [x[0][0] for x in temp_results]
        results[instance] = algorithm_results

    pd.DataFrame.from_dict(results).to_csv('similarities_to_optimal.csv')
    plot(results, 'Similarity [%]', 'similarity_to_optimal.png')
    plot_best(results, 'Similarity [%]', 'similarity_to_optimal_best.png')
Example #3
def compare_solutions(df):
    results = {}
    for instance in df[INSTANCE].unique():
        instance_subset = df[df[INSTANCE] == instance]

        algorithm_results = {}
        for algorithm in instance_subset[ALGORITHM].unique():
            algorithm_subset = instance_subset[instance_subset[ALGORITHM] ==
                                               algorithm]
            permutations = [[int(h) for h in x.split(' ')]
                            for x in algorithm_subset[PERMUTATION].to_list()]

            temp_results = []
            for i in range(len(permutations)):
                permutation_1 = np.asarray(permutations[i]).reshape(1, -1)
                for j in range(len(permutations)):
                    if i != j:
                        permutation_2 = np.asarray(permutations[j]).reshape(
                            1, -1)
                        temp_results.append(cs(permutation_1, permutation_2))

            algorithm_results[algorithm] = [x[0][0] for x in temp_results]
            # print(temp_results)
        results[instance] = algorithm_results

    results_df = pd.DataFrame.from_dict(results)
    results_df.to_csv('solution_similarities.csv')

    plot(results, 'Similarity [%]', 'similarities.png')
Example #4
        def get_similarity_values(q1_csc, q2_csc):
            cosine_sim = []
            manhattan_dis = []
            eucledian_dis = []
            jaccard_dis = []
            minkowsk_dis = []

            for i, j in zip(q1_csc, q2_csc):
                sim = cs(i, j)
                cosine_sim.append(sim[0][0])
                sim = md(i, j)
                manhattan_dis.append(sim[0][0])
                sim = ed(i, j)
                eucledian_dis.append(sim[0][0])
                i_ = i.toarray()
                j_ = j.toarray()
                try:
                    sim = jsc(i_, j_)
                    jaccard_dis.append(sim)
                except:
                    jaccard_dis.append(0)

                sim = minkowski_dis.pairwise(i_, j_)
                minkowsk_dis.append(sim[0][0])
            return cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis
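The snippet above leans on module-level aliases that are not shown here; a plausible set of imports, stated purely as an assumption, would be:
from sklearn.metrics.pairwise import (cosine_similarity as cs,
                                      manhattan_distances as md,
                                      euclidean_distances as ed)
from sklearn.metrics import jaccard_score as jsc   # assumption: jsc is a Jaccard score
from sklearn.neighbors import DistanceMetric        # assumption; lives in sklearn.metrics in newer releases
minkowski_dis = DistanceMetric.get_metric('minkowski')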
Example #5
def vectorToSentence(matrix, map, V):  #Array of word_embeddings
    s = ""
    dim = len(matrix)
    for w in V:
        cosine_list = list(np.asarray(cs(matrix, w)).flatten())
        s = s + " " + map[cosine_list.index(max(cosine_list))]
    return s
Example #6
def findSimilarSongs(songIDs):
    indices = []
    for ID in songIDs:
        indices.append(searchbyID(ID))

    sum = 0
    for index in indices:
        sum += x[index]

    meanvector = sum / len(indices)
    meanvector = np.array(meanvector).reshape(1, -1)

    similarity_vals = cs(x, meanvector)

    combined = []

    for i in range(len(similarity_vals)):
        if similarity_vals[i] > 0.9:
            combined.append((i, similarity_vals[i]))

    combined.sort(key=lambda x: x[1])

    if len(combined) > 12:
        combined = combined[:12]

    sim = [i for i, s in combined]

    similar_songs_IDs = []
    for song_id in full_data.iloc[sim]['id']:
        similar_songs_IDs.append(song_id)
    return similar_songs_IDs
Example #7
def get_top_k_cosine_sim(graph_id, k):
    all_sim = np.array([(key, cs(ENCODINGS[graph_id], ENCODINGS[key])[0][0])
                        for key in tqdm(ENCODINGS,
                                        desc="Calculating Cosine Similarity for Graph " +
                                             str(graph_id))],
                        dtype=[('key', int), ('sim', float)])
    all_sim = np.sort(all_sim, order='sim')[::-1]
    return all_sim[1:k + 1]
Example #8
 def _create_weighted_distance_features(self, df):
     q1_matrix = self.tfidf_vectorizer.transform(
         df['spn_1'].values.tolist())
     q2_matrix = self.tfidf_vectorizer.transform(
         df['spn_2'].values.tolist())
     df['weighted_cosine_sim'] = np.concatenate([
         cs(q1_matrix[i], q2_matrix[i]).flatten()
         for i in range(q1_matrix.shape[0])
     ])
Example #9
 def predict_labels(self, embeddings):
     dis_cs = cs(embeddings, self.arr_embeddings)
     index_list = np.argmax(dis_cs, axis=-1)
     label_pred = []
     for i, index in enumerate(index_list):
         if dis_cs[i][index] > 0.6:
             label_pred.append(self.labels[index])
         else:
             label_pred.append("unknown")
     return label_pred
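A self-contained sketch of the thresholded nearest-neighbour lookup this method performs, with toy reference embeddings and labels (all names and values below are illustrative):
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cs

reference = np.array([[1.0, 0.0], [0.0, 1.0]])   # known embeddings
labels = ["cat", "dog"]
queries = np.array([[0.9, 0.1], [0.1, 0.2]])

sims = cs(queries, reference)                    # shape (2, 2)
best = np.argmax(sims, axis=-1)
preds = [labels[j] if sims[i, j] > 0.6 else "unknown"
         for i, j in enumerate(best)]
print(preds)   # ['cat', 'dog']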
Example #10
def getFile(url, pid, spid, isview):
    lst = []
    export = Workbook()
    export_sheet = export.add_sheet('match')
    book = open_workbook(ur(filebaseURL + url)[0])
    if isinstance(book, Book):
        sheet = book.sheet_by_index(0)
        for i in range(sheet.nrows):
            lst.append(sheet.cell_value(i, 0))
        instance = TfidfVectorizer()
        matrix = instance.fit_transform(lst)
        cosine_matrix = cs(matrix, matrix)
        k = 0
        outer_arr = []
        for i in range(len(cosine_matrix)):
            fl = list(cosine_matrix[i])
            incr = 0
            n_lst = lst[:i] + lst[i + 1:]
            dic = {}
            for j in fl[:i] + fl[i + 1:]:
                if j * 100 > 80:
                    dic['string'] = lst[i]
                    dic['matched_with'] = n_lst[incr]
                    dic['percent'] = str(j * 100)[:6]
                    k += 1
                    outer_arr.append(dic)
                    print(i, incr)
                incr += 1
        if len(outer_arr) == 0:
            retval = pushBulk(lst, pid, spid)
            if retval == -1:
                return dumps({
                    "Reponse Code": "200",
                    "Response Message": "Unsuccessful.",
                    'Response Data': ''
                })
            else:
                return dumps({
                    'Response Code': 200,
                    'Response Message': 'Success',
                    'Response Data': retval
                })
        else:
            try:
                return dumps({
                    'Response Code': 200,
                    'Response Message': 'Success',
                    'Response Data in file': outer_arr
                })
            except:
                return dumps({
                    'Response Code': 500,
                    'Response Message': 'Unsuccessful',
                    'Response Data': []
                })
Example #11
 def score(self, fake_audio_features):
     total_score = 0.0
     for index_video, fake_audio_feature in enumerate(fake_audio_features):
         similarity = list()
         for index_audio, audio_feature in enumerate(self.audio_features):
             fake_audio_feature = np.array(fake_audio_feature).reshape(
                 1, -1)
             audio_feature = np.array(audio_feature).reshape(1, -1)
             sim = cs(fake_audio_feature, audio_feature).tolist()
             similarity += sim[0]
         ranking = np.argsort(similarity)
     return ranking.tolist().index(0)
Example #12
def symsearch():
    query = request.args.get("query")
    result = emb([query])
    possible = []
    trained_data = db.embedding.find()
    for value in trained_data:
        out = cs(result, [value["result"]])
        if out[0][0] >= 0.2:
            similar = {"text": value["text"], "similarity": out[0][0]}
            possible.append(similar)
    searchout = sorted(possible, key=itemgetter('similarity'), reverse=True)

    return jsonify(result=searchout)
Example #13
def cossim(doc1, doc2):
    from sklearn.metrics.pairwise import cosine_similarity as cs
    from sklearn.feature_extraction.text import CountVectorizer as cv

    x = [doc1, doc2]
    vectorizer = cv().fit_transform(x)
    vectors = vectorizer.toarray()

    a = vectors[0].reshape(1, -1)
    b = vectors[1].reshape(1, -1)

    similarity_score = cs(a, b)

    return similarity_score
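A quick usage sketch for cossim; the two sentences below are arbitrary, and the score comes back as a 1x1 array:
score = cossim("the cat sat on the mat", "a cat sat on a mat")
print(score[0][0])   # a single value between 0 and 1 for non-negative count vectors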
Example #14
 def predict_labels(self, embeddings, embeddings_source, labels_index,
                    labels_name):
     dis_cs = cs(embeddings, embeddings_source)
     index_list = np.argmax(dis_cs, axis=-1)
     label_pred = []
     for i, index in enumerate(index_list):
         if dis_cs[i][index] > 0.6:
             label_index = labels_index[index]
             for j, (index_tmp, name_tmp) in enumerate(labels_name):
                 if label_index == index_tmp:
                     label_pred.append(labels_name[j])
         else:
             label_pred.append([-1, "unknown"])
     return label_pred
Example #15
 def score(self, fake_audio_features):
     total_score = 0.0
     for index_video, fake_audio_feature in enumerate(fake_audio_features):
         similarity = list()
         for index_audio, audio_feature in enumerate(self.audio_features):
             fake_audio_feature = np.array(fake_audio_feature).reshape(
                 1, -1)
             audio_feature = np.array(audio_feature).reshape(1, -1)
             sim = cs(fake_audio_feature, audio_feature).tolist()
             similarity += sim[0]
         ranking = np.argsort(similarity)
         total_score += float(ranking[index_video]) / float(
             self.dataset_length)
     return total_score / float(self.dataset_length)
Example #16
def get_similarity_values(res_csc, jd_csc):
    cosine_sim = []
    manhattan_dis = []
    eucledian_dis = []

    j = jd_csc
    for i in res_csc:
        sim = cs(i, j)
        cosine_sim.append(sim[0][0])
        sim = md(i, j)
        manhattan_dis.append(sim[0][0])
        sim = ed(i, j)
        eucledian_dis.append(sim[0][0])

    return cosine_sim, manhattan_dis, eucledian_dis
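A hedged sketch of how inputs like res_csc and jd_csc could be produced with a TF-IDF vectorizer; the aliases cs, md and ed are assumed to be sklearn's cosine_similarity, manhattan_distances and euclidean_distances, and the resume/job-description reading of the names is likewise an assumption:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import (cosine_similarity as cs,
                                      manhattan_distances as md,
                                      euclidean_distances as ed)

resumes = ["python machine learning pandas", "java spring backend"]   # illustrative texts
job_desc = ["machine learning engineer with python"]

vec = TfidfVectorizer().fit(resumes + job_desc)
res_csc = vec.transform(resumes)     # one sparse row per resume
jd_csc = vec.transform(job_desc)     # a single sparse row

cosine_sim, manhattan_dis, eucledian_dis = get_similarity_values(res_csc, jd_csc)
print(cosine_sim)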
Example #17
        def bm25_dist(row, dist_type, bm25_model, average_idf, feature_dim):
            assert dist_type in ['cs', 'ed', 'md'], 'dist type error'
            q1 = row['q1_w'].split()
            q2 = row['q2_w'].split()
            q1_bm25 = bm25_model.get_scores(q1, average_idf)
            q2_bm25 = bm25_model.get_scores(q2, average_idf)
            q1_bm25 = np.reshape(np.array(q1_bm25), (-1, feature_dim))
            q2_bm25 = np.reshape(np.array(q2_bm25), (-1, feature_dim))

            if dist_type == 'cs':
                score = cs(q1_bm25, q2_bm25).flatten()[0]
            elif dist_type == 'ed':
                score = ed(q1_bm25, q2_bm25).flatten()[0]
            elif dist_type == 'md':
                score = md(q1_bm25, q2_bm25).flatten()[0]
            return score
Example #18
def conversacion(respuesta):
    resRob = ''
    sent_token.append(respuesta)
    fv = tfd(tokenizer=LemNormalizacion)
    fid = fv.fit_transform(sent_token)
    valores = cs(fid[-1], fid)
    index = valores.argsort()[0][-2]
    flat = valores.flatten()
    flat.sort()
    request = flat[-2]
    if (request == 0):
        resRob += random.choice(Desentendido)
        return resRob
    else:
        resRob += sent_token[index]
        return resRob
Example #19
 def score_library(self, fake_audio_features):
     total_score = 0.0
     for index_video, fake_audio_feature in enumerate(fake_audio_features):
         similarity = list()
         #print (len(self.library_features))
         tmp_library_features = copy.deepcopy(self.library_features)
         tmp_library_features.append(self.audio_features[index_video])
         for index_audio, audio_feature in enumerate(tmp_library_features):
             fake_audio_feature = np.array(fake_audio_feature).reshape(
                 1, -1)
             audio_feature = np.array(audio_feature).reshape(1, -1)
             sim = cs(fake_audio_feature, audio_feature).tolist()
             similarity += sim[0]
         ranking = np.argsort(similarity)
         total_score += float(ranking[-1]) / float(self.library_length)
     return total_score / float(self.dataset_length)
Example #20
def cos_sim(filename, extracted, questions, tags):

    index, questions_matrix = get_question_matrix(filename, questions, tags)

    token_matrix = np.zeros((1, len(tags)))
    for token in extracted:
        j = index[token[0]]  # token.lemma
        token_matrix[0, j] = token[1]  # token.depth

    values = cs(token_matrix, questions_matrix)[0]

    position = np.argmax(values)

    # print(position, values[position], [t.text for t in question], [t.text for t in questions[position]])

    return values[position], position + 1
Example #21
    def __get_topic_sim(self, seg_list, dictionary, tfidf_model, model,
                        wordtopic_dic):
        sentcorpus = tfidf_model[dictionary.doc2bow(seg_list)]
        senttopic = model[sentcorpus]

        sim_dict = {}
        for word in seg_list:
            if word in wordtopic_dic:
                word_topic = wordtopic_dic[word]
                sim = cs([[item[1] for item in word_topic]],
                         [[item[1] for item in senttopic]])
                sim_dict[word] = sim[0][0]

        return [
            k for k, _ in sorted(
                sim_dict.items(), key=operator.itemgetter(1), reverse=True)
        ]
Example #22
 def score_library(self, fake_audio_features):
     total_score = 0.0
     for index_video, fake_audio_feature in enumerate(fake_audio_features):
         similarity = list()
         tmp_library_features = copy.deepcopy(self.library_features)
         tmp_library_features.append(self.audio_features[index_video])
         for index_audio, audio_feature in enumerate(tmp_library_features):
             fake_audio_feature = np.array(fake_audio_feature).reshape(
                 1, -1)
             audio_feature = np.array(audio_feature).reshape(1, -1)
             sim = cs(fake_audio_feature, audio_feature).tolist()
             similarity += sim[0]
         ranking = np.argsort(similarity)
         output = ranking.tolist().index(0)
     if output == 100:
         return ranking.tolist().index(1)
     else:
         return output
Example #23
    def get_cossim(self, sent1, sent2):
        """
        Compute the cosine similarity between two sentences.
        :param sent1: sentence 1
        :param sent2: sentence 2
        :return: cosine similarity of the two sentences
        """
        if isinstance(sent1, str):
            sent1 = self.cleaner(sent1,
                                 stopwords=self.stopwords,
                                 specialwords=self.specialwords,
                                 remove_alphas=self.remove_alphas,
                                 remove_numbers=self.remove_numbers,
                                 remove_urls=self.remove_urls,
                                 remove_punctuation=self.remove_punctuation,
                                 remove_email=self.remove_email,
                                 remove_ip_address=self.remove_ip_address,
                                 keep_chinese_only=self.keep_chinese_only)

            seg_sent1 = [" ".join(self.seg(sent1, pos=False))]
        else:
            raise ValueError('Please input a str format sentence (´▽`)ノ ')
        if isinstance(sent2, str):
            sent2 = self.cleaner(sent2,
                                 stopwords=self.stopwords,
                                 specialwords=self.specialwords,
                                 remove_alphas=self.remove_alphas,
                                 remove_numbers=self.remove_numbers,
                                 remove_urls=self.remove_urls,
                                 remove_punctuation=self.remove_punctuation,
                                 remove_email=self.remove_email,
                                 remove_ip_address=self.remove_ip_address,
                                 keep_chinese_only=self.keep_chinese_only)

            seg_sent2 = [" ".join(self.seg(sent2, pos=False))]
        else:
            raise ValueError('Please input a str format sentence (´▽`)ノ ')
        if self.tfidf_vectorizer is None:
            raise ValueError("Please build tfidf_vectorizer with corpus...")
        s1_matrix = self.tfidf_vectorizer.transform(seg_sent1)
        s2_matrix = self.tfidf_vectorizer.transform(seg_sent2)
        return cs(s1_matrix, s2_matrix).flatten()[0]
Example #24
def cosine_similarity(p, q, transpose_p=False, transpose_q=False):
    """
    Computes the cosine similarity of two d-dimensional matrices

    :param p: np.ndarray of shape (p_samples, d), one d-dimensional vector per row
    :param q: np.ndarray of shape (q_samples, d), one d-dimensional vector per row
    :param transpose_p: whether to transpose p or not
    :param transpose_q: whether to transpose q or not

    :return
        - cosine similarity matrix S of shape (p_samples, q_samples)
          where S[i, j] = s(p[i], q[j])
    """

    # If it is a vector, consider it as a single sample matrix
    if len(p.shape) == 1:
        p = p.reshape(1, -1)
    if len(q.shape) == 1:
        q = q.reshape(1, -1)

    # cosine similarity: sum(p_i * q_i) / (sqrt(sum(p_i^2)) * sqrt(sum(q_i^2)))
    '''if transpose_p:
      p = np.transpose(p)
    if transpose_q:
      q = np.transpose(q)    
    '''
    '''
    matrix = scipy.sparse.lil_matrix((p.shape[0], q.shape[0]))

    for i, pi in enumerate(p):
      for j, qj in enumerate(q):
        n = sum([a*b for a,b in zip(pi,qj)])
        d1 = sqrt(sum(np.array(list(map(lambda x: x*x, pi)))))
        d2 = sqrt(sum(np.array(list(map(lambda x: x*x, qj)))))

        matrix[i,j] = n/(d1*d2)
    '''
    matrix = cs(p, q)

    return matrix
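A short usage sketch for the wrapper above, assuming numpy is imported as np and sklearn's cosine_similarity is aliased as cs, as elsewhere in these examples:
p = np.array([[1.0, 0.0],
              [0.0, 1.0]])        # two samples, d = 2
q = np.array([1.0, 1.0])          # a bare vector; reshaped to (1, 2) inside
S = cosine_similarity(p, q)       # shape (2, 1)
print(S)                          # both entries ~0.7071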
Example #25
	def document_cluster(self,entity_dict):
		vector_sample = []
		for j in entity_dict:
			str_ = entity_dict[j]
			if len(str_) > 1:
				vector_sample.append(str_)
		split_list = list(map(generate_ngram, vector_sample))
		abbv_list = list(map(abbv, vector_sample))

		tfidf_vecorizor = TfidfVectorizer(stop_words=[])
		split_list_tf_idf = tfidf_vecorizor.fit_transform(split_list)

		pw = cs(split_list_tf_idf, split_list_tf_idf)
		edge_set = set()
		node_set = set()

		for ii in range(pw.shape[0]):
			node_set.add(ii)
			for j in range(ii, pw.shape[0]):
				if pw[ii][j] > 0.5:
					edge_set.add((ii, j))
				elif pw[ii][j] > 0.3 and abbv_list[ii] == abbv_list[j]:
					edge_set.add((ii, j))
		G = nx.Graph()
		for ii in node_set:
			G.add_node(ii, attribute=vector_sample[ii])
		G.add_edges_from(list(edge_set))
		cp = sorted(nx.connected_components(G), key=len, reverse=True)

		inner_cluster = []
		for j in range(len(cp)):
			clu = []
			for n in cp[j]:
				clu.append(vector_sample[n])
			inner_cluster.append(clu)
		return inner_cluster
Example #26
cols = df.columns.values
cols1 = df1.columns.values
# df3 =pd.read_csv("cor3.csv",encoding='utf-8')
# for i in xrange(len(df3.index.values)):
# 	feature1 = df3.iloc[i,0].strip()
# 	feature2 = df3.iloc[i,1].strip()
# 	# print [feature1,feature2]
# 	if (feature1 in cols and feature2 in cols1) or (feature1 in cols1 and feature2 in cols):
# 		print str(feature1)+  '    &   '+  str(feature2)+'  &  '+str(round(df3.iloc[i,2],2)) +'  \\\\ \\hline'
dic = {}  # assumed missing from this fragment: maps (health, edu) column index pairs to similarity
features_edu = [i for i in range(1, len(cols))]
features_health = [i for i in range(1, len(cols1))]
for i in features_health:
	for j in features_edu:
		if cols1[i] in cols:
			# print(cols1[i])
			continue
		lis1 = np.array([df1[cols1[i]].values])
		lis2 = np.array([df[cols[j]].values])
		dic[(i, j)] = cs(lis1, lis2)[0][0]
lis =  sorted(dic.keys(),key=lambda x:dic[x])[::-1]
# for i in features_health:
# 	for j in features_health:
# 		if i<j:
# 			lis1 = np.array([df1[cols1[i]].values])
# 			lis2 = np.array([df1[cols1[j]].values])
# 			dic[(i,j)]= cs(lis1,lis2)[0][0]
# lis =  sorted(dic.keys(),key=lambda x:abs(dic[x]))[::-1]
for i in lis:
	if abs(dic[i]) > 0.98:
		if cols1[i[0]].split('_')[0] != cols[i[1]].split('_')[0] and re.sub('[0-9]', '', cols1[i[0]]) != re.sub('[0-9]', '', cols[i[1]]):
			print(str(cols1[i[0]]) + '    &   ' + str(cols[i[1]]) + '  &  ' + str(round(dic[i], 3)) + '  \\\\ \\hline')
Example #27
	def global_connected_component(self):
		min_hash_table = {}
		G_all = nx.Graph()
		node_set_all = set()
		edge_set_all = set()
		node_id = 0
		id_to_node = {}
		for doc in self.inner_cluster:
			for clu in  self.inner_cluster[doc]:
				node_set_doc = []
				max_count = 0
				mem = None
				node_id_ = None
				for ent in clu:
					node_set_all.add(node_id)
					node_set_doc.append(node_id)
					id_to_node[node_id]= (ent,doc)
					count = self.clean_count[doc][ent]
					if count > max_count:
						if mem and len(mem)<3:
							continue
						max_count = count
						mem = ent
						node_id_ = node_id
					node_id += 1
				for i in range(len(node_set_doc)-1):
					edge_set_all.add((node_set_doc[i],node_set_doc[i+1]))
				if mem:
					hash_code = getminHash(mem,1)*100 + getminHash(mem,0)
					if hash_code not in min_hash_table:
						min_hash_table[hash_code] = []
					min_hash_table[hash_code].append((mem,doc,node_id_))
		# print(len(min_hash_table))
		for h in min_hash_table:
			hash_node = {}
			for n in min_hash_table[h]:
				word = n[0]
				if word not in hash_node:
					hash_node[word] = []
				hash_node[word].append(n[2])
			check_cluster = list(set(map(lambda x:x[0],min_hash_table[h])))

			split_list = map(generate_ngram, check_cluster)
			tfidf_vecorizor = TfidfVectorizer(stop_words=[])
			split_list_tf_idf = tfidf_vecorizor.fit_transform(split_list)
			pw = cs(split_list_tf_idf, split_list_tf_idf)
			edge_set = set()
			node_set = set()

			for ii in range(pw.shape[0]):
				node_set.add(ii)
				for j in range(ii, pw.shape[0]):
					if pw[ii][j] > 0.5:
						edge_set.add((ii, j))
			G = nx.Graph()
			for ii in node_set:
				G.add_node(ii, attribute=check_cluster[ii])
			G.add_edges_from(list(edge_set))
			cp = sorted(nx.connected_components(G), key=len, reverse=True)
			for j in range(len(cp)):
				clu_node = []
				for n in cp[j]:
					for nn in hash_node[check_cluster[n]]:
						clu_node.append(nn)
				for nod_no in range(len(clu_node)-1):
					edge_set_all.add((clu_node[nod_no], clu_node[nod_no + 1]))

		G_all = nx.Graph()
		G_all.add_edges_from(list(edge_set_all))
		cp = sorted(nx.connected_components(G_all), key=len, reverse=True)
		for clu_node in cp:

			name_count = {}
			mention = []
			for ii in clu_node:
				ent_tuple = id_to_node[ii]
				name = ent_tuple[0]
				doc = ent_tuple[1]
				if name not in name_count:
					name_count[name] = 0
				name_count[name] += self.clean_count[doc][name]
				for ori_mem in self.clean_to_unclean[doc][name]:
					mention.append({"mention":ori_mem,"doc":doc})
			enitiy = {"mention": mention, "name":sorted(name_count.items(),key=lambda x:x[1],reverse=True)[0][0]}
			self.res.append(enitiy)
Example #28
    def extract_tfidf_feature(self, df):
        q1_w_vec = self.tfidf_vectorizer.transform(df['q1_w'].values.tolist())
        q2_w_vec = self.tfidf_vectorizer.transform(df['q2_w'].values.tolist())

        df['tfidf_cs'] = np.concatenate([
            cs(q1_w_vec[i], q2_w_vec[i]).flatten()
            for i in range(q1_w_vec.shape[0])
        ])
        df['tfidf_ed'] = np.concatenate([
            ed(q1_w_vec[i], q2_w_vec[i]).flatten()
            for i in range(q1_w_vec.shape[0])
        ])
        df['tfidf_md'] = np.concatenate([
            md(q1_w_vec[i], q2_w_vec[i]).flatten()
            for i in range(q1_w_vec.shape[0])
        ])

        corpus_tfidf = np.concatenate(
            [q1_w_vec.toarray(), q2_w_vec.toarray()], axis=0)

        svd_model = TruncatedSVD(n_components=5)
        svd_model.fit(corpus_tfidf)

        svd_topic = svd_model.transform(corpus_tfidf)
        q1_w_svd_feature = svd_topic[:q1_w_vec.shape[0]]
        q2_w_svd_feature = svd_topic[q1_w_vec.shape[0]:]

        df['svd_cs'] = np.concatenate([
            cs(q1_w_svd_feature[i].reshape(-1, 5),
               q2_w_svd_feature[i].reshape(-1, 5)).flatten()
            for i in range(q1_w_svd_feature.shape[0])
        ])
        df['svd_ed'] = np.concatenate([
            ed(q1_w_svd_feature[i].reshape(-1, 5),
               q2_w_svd_feature[i].reshape(-1, 5)).flatten()
            for i in range(q1_w_svd_feature.shape[0])
        ])
        df['svd_md'] = np.concatenate([
            md(q1_w_svd_feature[i].reshape(-1, 5),
               q2_w_svd_feature[i].reshape(-1, 5)).flatten()
            for i in range(q1_w_svd_feature.shape[0])
        ])

        lda_model = LatentDirichletAllocation(n_components=5, random_state=0)
        lda_model.fit(corpus_tfidf)

        lda_topic = lda_model.transform(corpus_tfidf)

        q1_w_lda_feature = lda_topic[:q1_w_vec.shape[0]]
        q2_w_lda_feature = lda_topic[q1_w_vec.shape[0]:]

        df['lda_cs'] = np.concatenate([
            cs(q1_w_lda_feature[i].reshape(-1, 5),
               q2_w_lda_feature[i].reshape(-1, 5)).flatten()
            for i in range(q1_w_lda_feature.shape[0])
        ])
        df['lda_ed'] = np.concatenate([
            ed(q1_w_lda_feature[i].reshape(-1, 5),
               q2_w_lda_feature[i].reshape(-1, 5)).flatten()
            for i in range(q1_w_lda_feature.shape[0])
        ])
        df['lda_md'] = np.concatenate([
            md(q1_w_lda_feature[i].reshape(-1, 5),
               q2_w_lda_feature[i].reshape(-1, 5)).flatten()
            for i in range(q1_w_lda_feature.shape[0])
        ])
Example #29
# Import the necessary packages for performing similarity between texts.
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.feature_extraction.text import TfidfVectorizer as tv

# Load the texts - The original & The generated
gensis = open('../Genesis.txt','r').read().split('\r')
model_gensis = open('../Model_Genesis.txt','r').read().split('\n')

# Initialize
TfV = tv()
TfV.fit(gensis)
Y = TfV.transform(gensis)

# Check for every sentence the similarity
similaritySum = 0
for sentence in model_gensis:
    X = TfV.transform([sentence])
    print(sentence)
    print(gensis[cs(X, Y).argmax()])
    print(' ')
    similaritySum += cs(X, Y).max()  # accumulate the best-match score for each generated sentence

# Calculate the similarity
similarity = similaritySum/7
print('The similarity between the original text - Genesis -  and the model is: ' , similarity)
Example #30
    movies_file.set_index('show_id', drop=False)
    maj_list = []
    min_list = []
    genre = movies_file['listed_in']
    genre_list = list(genre)
    genre_list_new = []
    #modify the genre column by replacing
    for i in genre_list:
        i = i.replace(',', ' ')
        genre_list_new.append(i)

    count_matrix = cv.fit_transform(genre_list_new)
    count_matrix_array = count_matrix.toarray()
    #limits##################################
    init_movieval = 0
    final_movieval = 1960 + 1
    #######################################
    # compare two at a time (to increase the number of movies that can be used); keep the movie if the cosine similarity > 0.82
    # discard the others
    for i in range(1, 1960):
        for j in range(init_movieval, final_movieval):
            similarity_scores = cs(
                [count_matrix_array[i], count_matrix_array[j]])
            #discard unnecessary movies
            #print(similarity_scores)
            if similarity_scores[0][1] > 0.82:
                tu = (j, similarity_scores[0][1])
                min_list.append(tu)
        maj_list.append(min_list)
        min_list = []
    a.writerows(maj_list)
Example #31
def weat_analysis(embedding,
                  bias_weat_combinations,
                  sets,
                  steps=-1,
                  print_weat=False,
                  matrices_check=True):
    """
    For a given embedding and set of WEAT test combinations, generate experimental WEAT results.

    Parameters
    ----------
    embedding: Embedding | instance of the Embedding class.
    bias_weat_combinations: dict | structure containing the subclass/target and attribute set combinations.
    sets: dict | structure containing all attribute and subclass/target sets of words.
    steps: int | number of iterations used to generate the p value (if -1, all combinations are used).
    print_weat: bool | whether to print results.
    matrices_check: bool | whether to compute cosine similarity values between sets.

    Returns
    -------
    final_values: dict | effect sizes and p values for all WEAT tests
    bias_levels_d: dict | bias levels for each class respectively
    d_values: list | list of all WEAT test effect sizes
    p_values: list | list of all WEAT test p values
    cs_matrix: dict | for each WEAT test, the mutual target-attribute matrix of cosine similarity values between all existing words
    """

    final_values = {}
    p_values, d_values = [], []
    cs_matrix = {}

    #used category notation here instead of class notation (category = class)
    for category in bias_weat_combinations:

        final_values[category] = []
        d_values_category, p_values_category = [], []

        for category_target_pair in bias_weat_combinations[category]:
            for attribute_pair in bias_weat_combinations[category][
                    category_target_pair]:
                p, d = WEAT(embedding, sets[category_target_pair[0]],
                            sets[category_target_pair[1]],
                            sets[attribute_pair[0]], sets[attribute_pair[1]],
                            steps).get_stats()

                if (matrices_check == True):
                    a1t1 = cs([
                        embedding.get_value(word)
                        for word in sets[attribute_pair[0]]
                    ], [
                        embedding.get_value(word)
                        for word in sets[category_target_pair[0]]
                    ])
                    a2t1 = cs([
                        embedding.get_value(word)
                        for word in sets[attribute_pair[1]]
                    ], [
                        embedding.get_value(word)
                        for word in sets[category_target_pair[0]]
                    ])
                    a1t2 = cs([
                        embedding.get_value(word)
                        for word in sets[attribute_pair[0]]
                    ], [
                        embedding.get_value(word)
                        for word in sets[category_target_pair[1]]
                    ])
                    a2t2 = cs([
                        embedding.get_value(word)
                        for word in sets[attribute_pair[1]]
                    ], [
                        embedding.get_value(word)
                        for word in sets[category_target_pair[1]]
                    ])
                    cs_matrix[(category_target_pair[0],
                               category_target_pair[1], attribute_pair[0],
                               attribute_pair[1])] = np.array([[a1t1, a1t2],
                                                               [a2t1, a2t2]])

                if print_weat == True:
                    if (np.abs(d) > 0.7):
                        csm = cs_matrix[(category_target_pair[0],
                                         category_target_pair[1],
                                         attribute_pair[0], attribute_pair[1])]
                        cs_res = np.array(
                            [[np.mean(csm[0, 0]),
                              np.mean(csm[0, 1])],
                             [np.mean(csm[1, 0]),
                              np.mean(csm[1, 1])]])
                        print(
                            f'\nBIAS: {attribute_pair[0]}, {attribute_pair[1]}, {category_target_pair[0]}, {category_target_pair[1]} : {p} ||| {"%.4f" % d} \n{cs_res}\n'
                        )
                    else:
                        print(
                            f'{attribute_pair[0]}, {attribute_pair[1]}, {category_target_pair[0]}, {category_target_pair[1]} : {p} ||| {"%.4f" % d}'
                        )
                final_values[category].append([
                    category_target_pair[0], category_target_pair[1],
                    attribute_pair[0], attribute_pair[1], p, d
                ])

                p_values.append(p)
                d_values.append(d)
                p_values_category.append(p)
                d_values_category.append(np.abs(d) / 2)

    bias_levels_d = average_bias_value(final_values)

    return final_values, dict(
        sorted(bias_levels_d.items(), key=lambda x: x[1],
               reverse=True)), d_values, p_values, cs_matrix
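The four cs(...) calls above each produce an attribute-by-target matrix of cosine similarities; a toy sketch of that block with made-up vectors in place of embedding.get_value (the Embedding and WEAT classes are not reproduced here):
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cs

attribute_vectors = [np.array([1.0, 0.0]), np.array([0.9, 0.1])]                      # one attribute set, toy values
target_vectors = [np.array([0.8, 0.2]), np.array([0.0, 1.0]), np.array([0.5, 0.5])]   # one target set, toy values

a1t1 = cs(attribute_vectors, target_vectors)   # shape (2, 3): attribute x target similarities
print(a1t1.shape, float(np.mean(a1t1)))        # np.mean(...) is what the printed report averages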