Code example #1
    def calculate_similarity(self, pair_key, lines):
        '''
        Sum components of each corating pair across all users who rated both
        item x and item y, then calculate the pairwise cosine similarity and
        corating counts.  The similarities are normalized to the [0,1] scale
        because we do a numerical sort.

        19,21   0.4,2,[user1, user2, ...]
        21,19   0.4,2,[user1, user2, ...]
        19,70   0.6,1,[user1, user2, ...]
        70,19   0.6,1,[user1, user2, ...]
        21,70   0.1,1,[user1, user2, ...]
        70,21   0.1,1,[user1, user2, ...]
        '''
        sum_xx, sum_xy, sum_yy, sum_x, sum_y, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)
        item_pair, co_ratings_id = pair_key, lines
        item_xname, item_yname = item_pair
        for item_x, item_y, user_id in lines:
            sum_xx += item_x * item_x
            sum_yy += item_y * item_y
            sum_xy += item_x * item_y
            sum_y += item_y
            sum_x += item_x
            n += 1

        cos_sim = cosine(sum_xy, sqrt(sum_xx), sqrt(sum_yy))

        yield (item_xname, item_yname), (cos_sim, n)
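This method relies on a `cosine` helper that takes the accumulated dot product together with the two vector norms. As a point of reference, here is a minimal sketch of a helper with that signature; the body illustrates the standard formula and is not the project's actual implementation:

def cosine(dot_product, norm_x, norm_y):
    # cos(x, y) = (x . y) / (|x| * |y|); return 0.0 for degenerate vectors
    denominator = norm_x * norm_y
    return dot_product / denominator if denominator else 0.0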
Code example #2
# Assumes `import networkx as nx` and `import pandas as pd`, and that `simi` is the
# project's similarity module providing a cosine() helper.
def topicProj(topicMatrix, threshold=0):
    G = nx.Graph()
    G.clear()
    # ----- Add nodes
    # print('\nAdding nodes for graph...')
    # Add one node per topic column of the topic matrix
    for topic in topicMatrix.columns:
        G.add_node(topic)
    # print('All nodes successfully added!')

    # ----- Add edges
    # print('\nAdding edges for graph...')
    for i, topic_i in enumerate(topicMatrix.columns):
        for j, topic_j in enumerate(topicMatrix.columns[(i + 1):]):
            similarity = simi.cosine(topicMatrix[topic_i],
                                     topicMatrix[topic_j])
            if similarity > threshold:
                G.add_edge(topic_i, topic_j, weight=similarity)
    # print('All edges successfully added!')

    cent = [G.degree(topic, weight='weight') for topic in topicMatrix.columns]
    cent = pd.DataFrame(cent,
                        columns=['edgeWeightSum'],
                        index=topicMatrix.columns)
    # Normalise by the maximum number of other nodes a node can be connected to
    cent['Degree'] = cent['edgeWeightSum'] / (len(topicMatrix.columns) - 1)

    # Add centrality information to node attribute
    nx.set_node_attributes(G, cent['Degree'], 'degreeCent')

    return G, cent
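`simi.cosine` above is called with two topic columns. A minimal sketch of a vector cosine consistent with that call, assuming the columns are numeric pandas Series or arrays (the helper name mirrors the call; the body is illustrative):

import numpy as np

def cosine(u, v):
    # Cosine similarity between two equal-length numeric vectors.
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom else 0.0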
Code example #3
# `similarity` is assumed to be the project's own similarity helper module.
def getItemRecommendation(matrix, item):
    # Item-based recommendation: estimate a rating of `item` for every user who
    # has not yet rated it, as a similarity-weighted average of their ratings
    # for the other items.
    totals = {}
    simSums = {}

    for other in matrix:
        if other == item:
            continue
        sim = similarity.cosine(matrix, item, other)
        # sim = similarity.pearson(matrix, item, other)

        if sim <= 0:
            continue

        for user in matrix[other]:
            if user not in matrix[item] or matrix[item][user] == 0:
                totals.setdefault(user, 0)
                totals[user] += matrix[other][user] * sim
                simSums.setdefault(user, 0)
                simSums[user] += sim

    itemEval = {user: total / simSums[user] for user, total in totals.items()}
    return itemEval
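Here `similarity.cosine` receives the whole ratings dict plus two keys (the same convention appears in examples #13 and #16). A minimal sketch of a dict-of-dicts cosine computed over the keys the two entries share, under that assumption; it is not the project's actual code:

from math import sqrt

def cosine(matrix, a, b):
    # Cosine similarity between matrix[a] and matrix[b] over their shared keys.
    shared = set(matrix[a]) & set(matrix[b])
    if not shared:
        return 0.0
    num = sum(matrix[a][k] * matrix[b][k] for k in shared)
    den = (sqrt(sum(matrix[a][k] ** 2 for k in shared))
           * sqrt(sum(matrix[b][k] ** 2 for k in shared)))
    return num / den if den else 0.0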
Code example #4
File: vecsum.py  Project: EthanJamesLew/pugspy
def basesdecomp(bases, vec):
    best = float('inf')
    bestans = []
    project = []
    cosineval = float('inf')
    for _ in range(1000):
        ans = getDecomp(bases, vec)
        try:
            cosineval = similarity.stddev(
                [similarity.cosine(x, y) for x, y in zip(ans, bases)])
        except TypeError:
            pass
        if cosineval < best:
            best = cosineval
            project = [similarity.cosine(x, y) for x, y in zip(ans, bases)]
            bestans = ans
    # Element-wise sum of all basis vectors, compared against the target vector.
    sum1 = bases[0]
    for i in range(1, len(bases)):
        sum1 = [x + y for x, y in zip(sum1, bases[i])]
    fidelity = similarity.cosine(sum1, vec)
    return best, bestans, fidelity, project
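basesdecomp treats the bases and the decomposition as plain Python sequences, so `similarity.cosine` presumably accepts two sequences and `similarity.stddev` a list of numbers. A rough sketch under that assumption (illustrative only):

from math import sqrt
from statistics import pstdev

def cosine(x, y):
    # Cosine similarity between two equal-length numeric sequences.
    num = sum(a * b for a, b in zip(x, y))
    den = sqrt(sum(a * a for a in x)) * sqrt(sum(b * b for b in y))
    return num / den if den else 0.0

def stddev(values):
    # Population standard deviation of a list of numbers.
    return pstdev(values)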
Code example #5
    def test_cosine(self):
        G = self.G
        cos = cosine(G)
        ns.assert_equal(len(cos), 7)
        for i in range(7):
            assert i in cos
        for i in self.G.cos_sim.keys():
            ns.assert_equal(len(self.G.cos_sim[i]), len(cos[i]))
            for j in self.G.cos_sim[i].keys():
                ns.assert_almost_equal(cos[i][j], self.G.cos_sim[i][j], places=4)
Code example #6
def average_similarity(products, features):
	sample_size = 500
	sample = products.sample(sample_size).index.values
	
	sim = 0
	
	for i1 in sample:
		for i2 in sample:
			if i1 != i2:
				sim += similarity.cosine(features, products.loc[i1, :], products.loc[i2, :])
	return sim / (sample_size * (sample_size - 1))
Code example #7
def average_similarity(products, features):
    sample_size = 500
    sample = products.sample(sample_size).index.values

    sim = 0

    for i1 in sample:
        for i2 in sample:
            if i1 != i2:
                sim += similarity.cosine(features, products.loc[i1, :],
                                         products.loc[i2, :])
    return sim / (sample_size * (sample_size - 1))
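Examples #6 to #11 all call `similarity.cosine(features, row_a, row_b)` with a collection of feature column names and two product rows. A minimal sketch of a helper with that signature, assuming `features` lists the numeric columns to compare (an assumption, not the original implementation):

import numpy as np

def cosine(features, p1, p2):
    # Cosine similarity between two product rows, restricted to the feature columns.
    u = np.asarray(p1[features], dtype=float)
    v = np.asarray(p2[features], dtype=float)
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom else 0.0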
Code example #8
def get_similarity(products, features, p1, all_articles):
    sim = 0
    no_others = len(all_articles) - 1

    for id in all_articles:
        # If product in product database, calculate similarity.
        if id in products['article_id'].values:
            p2 = products[products['article_id'] == id]

            if p1['article_id'].values != p2['article_id'].values:
                sim += similarity.cosine(features, p1.iloc[0], p2.iloc[0])
        # Otherwise, reduce the number of products used for the average by one.
        else:
            no_others -= 1

    if no_others > 0:
        return sim / float(no_others)
    else:
        return 0
Code example #9
def get_similarity(products, features):
	# No. of products
	m = len(products)
	# Save article numbers on first line
	article_ids = products['article_id'].values
	
	# Similarity is calculated per product and appended to similarity file.
	# The similarity matrix can not be made at once because of memory constraints.
	for i in range(m):
		p1 = article_ids[i]
	
		# Write header to file, overwriting the file if necessary
		save.array("Similarity/" + str(p1), ["article_id", "similarity"], "w+")
	
		for j in range(m):
			if i != j:
				p2 = article_ids[j]
				sim = similarity.cosine(features, products.loc[i, :], products.loc[j, :])
				save.array("Similarity/" + str(p1), [p2, sim])
Code example #10
def get_similarity(products, features, p1, all_articles):
	sim = 0
	no_others = len(all_articles) - 1
	
	for id in all_articles:
		# If product in product database, calculate similarity.
		if id in products['article_id'].values:
			p2 = products[products['article_id'] == id]
		
			if p1['article_id'].values != p2['article_id'].values:
				sim += similarity.cosine(features, p1.iloc[0], p2.iloc[0])
		# Otherwise, reduce the number of products used for the average by one.
		else:
			no_others -= 1
	
	if no_others > 0:
		return sim / float(no_others)
	else:
		return 0
Code example #11
def get_similarity(products, features):
    # No. of products
    m = len(products)
    # Save article numbers on first line
    article_ids = products['article_id'].values

    # Similarity is calculated per product and appended to similarity file.
    # The similarity matrix can not be made at once because of memory constraints.
    for i in range(m):
        p1 = article_ids[i]

        # Write header to file, overwriting the file if necessary
        save.array("Similarity/" + str(p1), ["article_id", "similarity"], "w+")

        for j in range(m):
            if i != j:
                p2 = article_ids[j]
                sim = similarity.cosine(features, products.loc[i, :],
                                        products.loc[j, :])
                save.array("Similarity/" + str(p1), [p2, sim])
Code example #12
File: attacks.py  Project: agoetschm/master-thesis
def guess_from_l(sketch, freq, k, ref_subset, candidate_subset, unknown_node,
                 dist_mat_knowledge=1):
    dist_candidate_subset = oracle.query_subset(sketch, candidate_subset, ref_subset)

    # only consider a partial distance matrix
    partial_mask = np.around(np.random.binomial(
        n=1, p=dist_mat_knowledge, size=(len(candidate_subset), len(ref_subset))))
    partial_dist = np.multiply(partial_mask, dist_candidate_subset)
    # set hidden values to avg
    avg = np.sum(partial_dist, axis=(0,1))/np.sum(partial_mask, axis=(0,1))
    partial_dist = partial_dist + np.multiply(avg, (np.ones((len(candidate_subset), len(ref_subset))) - partial_mask))

    # distance estimate vector
    dist_est = sketch_pattern.distance_estimate_subset(sketch, freq, k/2, np.array([unknown_node]), ref_subset)

    # inverse value and calculate similarity
    inv_partial_dist = np.power(partial_dist, -1)
    inv_dist_est = np.power(dist_est, -1)
    sim = similarity.cosine(inv_partial_dist, inv_dist_est)

    # check if the recovery succeeded
    guessed_right = (candidate_subset[np.argmax(sim)] == unknown_node)
    return guessed_right
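The attack compares every row of the inverted partial distance matrix against the inverted distance-estimate vector and takes `np.argmax`, so `similarity.cosine` here presumably returns one score per candidate row. A sketch of a row-wise cosine under that assumption (illustrative only):

import numpy as np

def cosine(matrix, vec):
    # Row-wise cosine similarity between a 2-D array and a single vector.
    matrix = np.atleast_2d(np.asarray(matrix, dtype=float))
    vec = np.asarray(vec, dtype=float).ravel()
    num = matrix @ vec
    den = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vec)
    return np.divide(num, den, out=np.zeros_like(num), where=den != 0)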
Code example #13
# `similarity` is assumed to be the project's own similarity helper module.
def getUserRecommendation(matrix, user):
    # User-based recommendation: estimate a rating for every item the target
    # user has not rated, using a similarity-weighted, mean-centred sum over
    # the other users' ratings.
    totals = {}
    simSums = {}

    u_average = sum(matrix[user][item] for item in matrix[user]) / len(matrix[user])

    for other in matrix:
        if other == user:
            continue
        sim = similarity.cosine(matrix, user, other)
        # sim = similarity.pearson(matrix, user, other)
        if sim <= 0:
            continue

        o_average = sum(matrix[other][item] for item in matrix[other]) / len(matrix[other])

        for item in matrix[other]:
            if item not in matrix[user] or matrix[user][item] == 0:
                totals.setdefault(item, 0)
                totals[item] += (matrix[other][item] - o_average) * sim
                simSums.setdefault(item, 0)
                simSums[item] += sim

    userEval = {item: u_average + total / simSums[item] for item, total in totals.items()}
    return userEval
Code example #14
  katz, lhn, rss2, dice, inverse_log_weighted, rsa
import pickle
import pandas

### VERTEX SIMILARITY

graphs = pickle.load(open('data/graphs_networkx.pkl','rb'))
results = dict()

for graph_type,G in graphs.iteritems():
    print "Calculating for %s" %(graph_type)
    results[graph_type] = dict()
    print "ASCOS ---------------------------"
    results[graph_type]["ascos"] = ascos(G)
    print "COSINE ---------------------------"
    results[graph_type]["cosine"] = cosine(G)
    print "JACCARD --------------------------"
    results[graph_type]["jaccard"] = jaccard(G)
    print "KATZ -----------------------------"
    results[graph_type]["katz"] = katz(G)
    print "LHN ------------------------------"
    results[graph_type]["lhn"] = lhn(G)
    print "RSS2 -----------------------------"
    results[graph_type]["rss2"] = rss2(G)
    print "DICE -----------------------------"
    results[graph_type]["dice"] = dice(G)
    print "INVERSE LOG WEIGHTED --------------"
    results[graph_type]["inverse_log_weighted"] = inverse_log_weighted(G)

pickle.dump(results,open("data/sim_metrics.pkl","wb"))
Code example #15
__author__ = 'John'

import pandas as pd
import numpy as np
import re
import cf
import similarity
import filter_demo_data
cosine = similarity.cosine()
similarity_helper = cosine
import sys

def read_input(filename, top_N=10): #just reads in X and category matrix so loading it will not take time
    shop_mall = pd.read_csv("Demographic Filtering/mall_store_list.csv", encoding="ISO-8859-1", index_col=False)  # This is used for obtaining unique malls

    stores_db = shop_mall[["store", "store_id"]].drop_duplicates(subset = ["store"]) #get unique stores
    stores_db.index = pd.Series(np.arange(stores_db.shape[0]))
    #stores_db.to_csv("command_line_files/store_list.csv", header="true")

    mall_demographic = pd.read_csv("Demographic Filtering/mall_with_demographic_category.csv", encoding = "ISO-8859-1", index_col=False)
    county_db = mall_demographic[["county", "usps"]].copy(deep=True) #reads all
    county_db["county"] =county_db["county"].str.lower()
    county_db["usps"] =county_db["usps"].str.lower()
    county_db =county_db.drop_duplicates(["county", "usps"])
    #county_db.to_csv("command_line_files/county_list.csv", header="true")
    #save columns into another file

    #read txt file
    file = open(filename)
    entire_file = file.read()
    user = re.split('\n+', entire_file) #first entry of user is their county followed by stores
Code example #16
File: main.py  Project: ichimunemasa/python
# -*- coding: utf-8 -*-
import similarity as sim

if __name__ == "__main__":

    # dictionary of ratings
    dictionary = {
        "A": {"apple": 2.5, "mikan": 2.5, "banana": 5.0, "melon": 2.0},
        "B": {"apple": 2.5, "banana": 1.5, "kiui": 3.0, "melon": 4.0}
    }

    # Euclidean distance
    value = sim.euclidean(dictionary, "A", "B")
    print str(value)

    # Pearson correlation
    value = sim.pearson(dictionary, "A", "B")
    print str(value)

    # cosine similarity
    value = sim.cosine(dictionary, "A", "B")
    print str(value)
Code example #17
        start_time = time.time()
        print('Creating the Query Vector...', end='', flush=True)
        query_vec = qp.query_vector(invf_indexes, len(invf_terms))
        print(f'done. ({time.time()-start_time}sec)')

        if should_do_rocchio:
            # Select Relevant Documents
            relevant_postings = []
            # Similarity
            if method == 'dot':
                sim = similarity.dot(VS, query_vec, invf_indexes,
                                     merged_postings)
            else:
                sim = similarity.cosine(VS,
                                        query_vec,
                                        invf_indexes,
                                        merged_postings,
                                        hybrid=True,
                                        power=2)
            # Ranking
            rank = []
            for i in range(len(sim)):
                rank.append((sim[i], merged_postings[i]))
            rank.sort(reverse=True)
            if num_relevant_docs > len(rank):
                num_relevant_docs = len(rank)
            for i in range(num_relevant_docs):
                relevant_postings.append(rank[i][1])

            start_time = time.time()
            print('Rocchio...', end='', flush=True)
            qp.rocchio(VS,
Code example #18
  katz, lhn, rss2, dice, inverse_log_weighted, rsa
import pickle
import pandas

### VERTEX SIMILARITY

graphs = pickle.load(open('data/graphs_networkx.pkl', 'rb'))
results = dict()

for graph_type, G in graphs.iteritems():
    print "Calculating for %s" % (graph_type)
    results[graph_type] = dict()
    print "ASCOS ---------------------------"
    results[graph_type]["ascos"] = ascos(G)
    print "COSINE ---------------------------"
    results[graph_type]["cosine"] = cosine(G)
    print "JACCARD --------------------------"
    results[graph_type]["jaccard"] = jaccard(G)
    print "KATZ -----------------------------"
    results[graph_type]["katz"] = katz(G)
    print "LHN ------------------------------"
    results[graph_type]["lhn"] = lhn(G)
    print "RSS2 -----------------------------"
    results[graph_type]["rss2"] = rss2(G)
    print "DICE -----------------------------"
    results[graph_type]["dice"] = dice(G)
    print "INVERSE LOG WEIGHTED --------------"
    results[graph_type]["inverse_log_weighted"] = inverse_log_weighted(G)

pickle.dump(results, open("data/sim_metrics.pkl", "wb"))
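The vertex-similarity script above passes a whole graph to each metric and stores a per-node result, which the test in example #5 indexes as a per-node mapping of scores. For reference, vertex cosine similarity is commonly defined as the number of common neighbours divided by the geometric mean of the two degrees; a small sketch for a networkx-style graph, illustrative rather than the library's implementation:

def cosine(G):
    # Salton / cosine similarity: |N(u) & N(v)| / sqrt(deg(u) * deg(v)).
    nodes = list(G.nodes())
    sim = {u: {} for u in nodes}
    for u in nodes:
        for v in nodes:
            du, dv = G.degree(u), G.degree(v)
            common = len(set(G[u]) & set(G[v]))
            sim[u][v] = common / ((du * dv) ** 0.5) if du and dv else 0.0
    return sim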
Code example #19
# Assumes `import networkx as nx`, `import pandas as pd`, `from copy import copy`,
# and that `simi` is the project's similarity module providing a cosine() helper.
def actorProj(actorProfile, topicMatrix, threshold=0):
    # ================================================================================
    # ----- Error Checking
    assert (actorProfile.index).equals(
        topicMatrix.index
    ), "actorProfile and topicMatrix do not have the same indices."
    # ================================================================================
    # ----- Construct Weighted Graph
    G = nx.Graph()
    G.clear()

    # ----- Add nodes
    # print('\nAdding nodes for graph...')
    for actor, row in actorProfile.iterrows():
        # Extract all text from the actor
        G.add_node(
            actor,
            gender=row['Gender'],
            party=row['Party'],
            metro=row['Metro'],
            elec=row['Elec'],
            wordCount=row['WordCount'],
        )
    # print('All nodes successfully added!')

    # ----- Add edges
    # print('\nAdding edges for graph...')
    for i, (a_i, row_i) in enumerate(actorProfile.iterrows()):
        for j, (a_j, row_j) in enumerate(actorProfile[(i + 1):].iterrows()):
            # Retrieve and parse topicVector profile of actor_i and actor_j
            tv_i = topicMatrix.loc[a_i].tolist()
            tv_j = topicMatrix.loc[a_j].tolist()
            similarity = simi.cosine(tv_i, tv_j)
            # Only add an edge if the two actors are different and the similarity exceeds the threshold
            if a_i != a_j and similarity > threshold:
                # Add edge with appropriate attributes
                G.add_edge(
                    a_i,
                    a_j,
                    weight=similarity,
                    closeness=abs(1 - similarity),
                )
                # Log progress...
                # logPath = "jaccardSim"
                # kf.log(f"{i:{5}},{j:{5}} of {len(actorProfile):{5}}", logPath)
                # kf.log(f"{a_i:{30}}{a_j}", logPath)
                # kf.log(f"{row_i['Topics']:{30}}{row_i['Topics']}", logPath)
                # kf.log(f"Jaccard Similarity: {similarity}\n", logPath)
    # print('All edges successfully added!')

    # ================================================================================
    # ----- Compute degree of centrality and add as node attribute
    # G.degree(..., weight='weight') returns the sum of the weights of the edges adjacent to a node
    cent = pd.DataFrame(
        [G.degree(actor, weight='weight') for actor in actorProfile.index],
        columns=['edgeWeightSum'],
        index=actorProfile.index)
    # Normalise by the maximum number of other nodes a node can be connected to
    cent['Degree'] = cent['edgeWeightSum'] / (len(actorProfile) - 1)
    cent['Betweenness'] = pd.Series(nx.betweenness_centrality(G))
    # cent['Betweenness'] = pd.Series(nx.betweenness_centrality(G, weight='weight'))
    # cent['Betweenness'] = pd.Series(nx.betweenness_centrality(G, weight='closeness'))
    cent['Closeness'] = pd.Series(nx.closeness_centrality(G))
    # cent['Closeness'] = pd.Series(nx.closeness_centrality(G, distance='weight'))
    # cent['Closeness'] = pd.Series(nx.closeness_centrality(G, distance='closeness'))

    # Add centrality information to node attribute
    nx.set_node_attributes(G, cent['Degree'], 'DegreeCent')
    nx.set_node_attributes(G, cent['Betweenness'], 'BtwnCent')
    nx.set_node_attributes(G, cent['Closeness'], 'ClosenessCent')

    # Find the list of connected components
    connected_components = list(nx.connected_components(G))

    # Concatenate the centrality measures onto actorProfile
    actorProfile_out = copy(actorProfile)
    actorProfile_out['DegreeCentrality'] = cent['Degree']
    actorProfile_out['BtwnCentrality'] = cent['Betweenness']
    actorProfile_out['ClosenessCentrality'] = cent['Closeness']

    # =====================================================================================
    # ----- FOR DEBUGGING
    # # Save results
    # nx.write_gpickle(G, f"{PATH}{TIME_FRAME}/ssm_weightedGraph_{TIME_FRAME}.gpickle")
    # cent.to_csv(f"{PATH}{TIME_FRAME}/ssm_centrality_{TIME_FRAME}.csv")
    # with open(f"{PATH}{TIME_FRAME}/ssm_cliques_{TIME_FRAME}.pickle", "wb") as file:
    #     pickle.dump(cliques, file)
    # ================================================================================

    return G, actorProfile_out, connected_components