def make_full_person_graph(root_user):
    """Print the users related to ``root_user`` through its clustered interests.

    The root user's interests are clustered with ``cluster_user_interests``,
    every cluster representative is given a weight of 1.0, and
    ``find_user_results`` is asked for related users.  Each chosen user is
    then printed with ``print_user_relations``.

    Args:
        root_user: object exposing ``id`` and an iterable ``interests``; every
            interest is read for its ``text`` and ``sim_list`` attributes.

    Returns:
        None.  Results are written to stdout and to the debug log.
    """
    # Only interests whose sim_list has more than one entry are logged.
    LOGGER.debug(
        "user %s interests are %s",
        root_user.id,
        ", ".join(i.text for i in root_user.interests if len(i.sim_list) > 1),
    )

    clusters = cluster_user_interests(set(root_user.interests))

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("person clusters are:")
    for rep, members in clusters.items():
        print("\t%s:%s" % (rep.text, [m.text for m in members]))

    # Every cluster representative starts with the same weight.
    weights = dict.fromkeys(clusters, 1.0)

    # Materialise the keys so the callee receives a real list on both
    # Python 2 and Python 3.
    roots = list(clusters.keys())
    users, relations = find_user_results(roots, weights, root_user, clusters)
    for user in users:
        print_user_relations(
            user, relations[user], "chose user with interests", "\t")
def make_full_person_graph(root_user):
    """Print the users related to ``root_user`` through its clustered interests.

    The root user's interests are clustered with ``cluster_user_interests``,
    every cluster representative is given a weight of 1.0, and
    ``find_user_results`` is asked for related users.  Each chosen user is
    then printed with ``print_user_relations``.

    Args:
        root_user: object exposing ``id`` and an iterable ``interests``; every
            interest is read for its ``text`` and ``sim_list`` attributes.

    Returns:
        None.  Results are written to stdout and to the debug log.
    """
    # Only interests whose sim_list has more than one entry are logged.
    LOGGER.debug(
        "user %s interests are %s",
        root_user.id,
        ", ".join(i.text for i in root_user.interests if len(i.sim_list) > 1),
    )

    clusters = cluster_user_interests(set(root_user.interests))

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("person clusters are:")
    for rep, members in clusters.items():
        print("\t%s:%s" % (rep.text, [m.text for m in members]))

    # Every cluster representative starts with the same weight.
    weights = dict.fromkeys(clusters, 1.0)

    # Materialise the keys so the callee receives a real list on both
    # Python 2 and Python 3.
    roots = list(clusters.keys())
    users, relations = find_user_results(roots, weights, root_user, clusters)
    for user in users:
        print_user_relations(
            user, relations[user], "chose user with interests", "\t")
def find_user_results(roots, weights, root_user=None, clusters=None):
    """Choose up to 20 users related to ``roots``, trading relevance for variety.

    Pipeline:
      1. Candidate interests are the first 500 entries of ``get_similar()``
         for every member of every cluster.
      2. Each candidate is attached to the root it is most similar to
         (``get_similarity2``); candidates whose best similarity is below
         0.01 are dropped and ties are broken with ``random.choice``.
      3. A candidate's relevance is its highest similarity to any member of
         the cluster of its root.
      4. Users holding a candidate (``utils.get_users_with_interest``) collect
         it under that root, most relevant first.
      5. A user's relevance is the sum, over roots, of its candidates'
         relevances discounted by ``0.5 ** rank``.
      6. Users are picked greedily by relevance minus redundancy, where
         redundancy is the sum of the two largest profile dot products with
         the users already chosen.

    Args:
        roots: iterable of root interests (read for ``text`` and passed to
            ``get_similarity2``).
        weights: accepted for interface compatibility; this function does not
            read it.
        root_user: optional user that is excluded from the result.
        clusters: mapping of root -> iterable of member interests.  When
            ``None`` every root is treated as a cluster containing only
            itself.

    Returns:
        A tuple ``(chosen, relations)``: ``chosen`` is a set of users and
        ``relations`` maps each chosen user to a ``defaultdict(list)`` of
        root -> related interests sorted by decreasing relevance.
    """
    max_similar = 500      # similar interests taken per cluster member
    min_similarity = 0.01  # weakest candidate/root similarity that is kept
    max_users = 20         # upper bound on the number of users returned

    # Materialise once: roots is iterated several times below.
    roots = list(roots)
    LOGGER.debug("roots are %s", ", ".join([i.text for i in roots]))
    if not roots:
        # Without roots no candidate can be attached to anything.
        return set(), {}

    if clusters is None:
        clusters = dict((root, [root]) for root in roots)

    candidate_interests = set()
    for members in clusters.values():
        for interest in members:
            candidate_interests.update(interest.get_similar()[:max_similar])
    LOGGER.debug("num candidates is %s", len(candidate_interests))

    # Attach every sufficiently similar candidate to its closest root.
    closest_root = {}
    for candidate in candidate_interests:
        sims = [(candidate.get_similarity2(root), root) for root in roots]
        max_sim = max(s for (s, _) in sims)
        if max_sim >= min_similarity:
            best_roots = [root for (s, root) in sims if s == max_sim]
            closest_root[candidate] = random.choice(best_roots)

    # Relevance of a candidate: best similarity to any member of the cluster
    # of the root it was attached to.
    interest_rels = {}
    for candidate, root in closest_root.items():
        interest_rels[candidate] = max(
            [candidate.get_similarity2(member) for member in clusters[root]])

    # user -> root -> [related interests]
    user_relations = {}
    for related, root in closest_root.items():
        for user in utils.get_users_with_interest(related):
            if user not in user_relations:
                user_relations[user] = collections.defaultdict(list)
            user_relations[user][root].append(related)

    # Most relevant interest first.  Sorting ascending and then reversing is
    # kept on purpose: sort(reverse=True) would order ties differently.
    for relations in user_relations.values():
        for related in relations.values():
            related.sort(key=lambda i: interest_rels[i])
            related.reverse()

    # Score users and build a unit-length profile vector over the roots.
    user_relevances = {}
    user_profiles = {}
    for user, relations in user_relations.items():
        relevance = 0.0
        profile = []
        for root in roots:
            if root in relations:
                sims = [interest_rels[i] for i in relations[root]]
                # Each further interest under the same root counts half as
                # much as the one before it.
                score = sum(
                    [s * (0.5 ** rank) for (rank, s) in enumerate(sims)])
                # NOTE(review): this records the number of roots the user
                # matched, not len(relations[root]) -- confirm the intent.
                profile.append(len(relations))
                relevance += score
            else:
                profile.append(0)
        norm = sum([x * x for x in profile]) ** 0.5
        user_relevances[user] = relevance
        user_profiles[user] = [x / norm for x in profile]

    candidates = set(user_relevances.keys())
    if root_user:
        candidates.discard(root_user)

    # Greedy selection: take the candidate with the best relevance after
    # subtracting its overlap with the users already chosen.
    chosen = set()
    while candidates and len(chosen) < max_users:
        best_user = None
        best_score = None
        for u1 in candidates:
            overlaps = sorted(
                sum([x * y for (x, y)
                     in zip(user_profiles[u1], user_profiles[u2])])
                for u2 in chosen)
            redundancy = sum(overlaps[-2:])  # two largest similarities
            score = user_relevances[u1] - redundancy
            # Explicit None test: ordering a number against None only works
            # on Python 2.
            if best_score is None or score > best_score:
                best_score = score
                best_user = u1
        candidates.remove(best_user)
        chosen.add(best_user)

    return chosen, dict((u, user_relations[u]) for u in chosen)
def find_user_results(roots, weights, root_user=None, clusters=None):
    """Choose up to 20 users related to ``roots``, trading relevance for variety.

    Pipeline:
      1. Candidate interests are the first 500 entries of ``get_similar()``
         for every member of every cluster.
      2. Each candidate is attached to the root it is most similar to
         (``get_similarity2``); candidates whose best similarity is below
         0.01 are dropped and ties are broken with ``random.choice``.
      3. A candidate's relevance is its highest similarity to any member of
         the cluster of its root.
      4. Users holding a candidate (``utils.get_users_with_interest``) collect
         it under that root, most relevant first.
      5. A user's relevance is the sum, over roots, of its candidates'
         relevances discounted by ``0.5 ** rank``.
      6. Users are picked greedily by relevance minus redundancy, where
         redundancy is the sum of the two largest profile dot products with
         the users already chosen.

    Args:
        roots: iterable of root interests (read for ``text`` and passed to
            ``get_similarity2``).
        weights: accepted for interface compatibility; this function does not
            read it.
        root_user: optional user that is excluded from the result.
        clusters: mapping of root -> iterable of member interests.  When
            ``None`` every root is treated as a cluster containing only
            itself.

    Returns:
        A tuple ``(chosen, relations)``: ``chosen`` is a set of users and
        ``relations`` maps each chosen user to a ``defaultdict(list)`` of
        root -> related interests sorted by decreasing relevance.
    """
    max_similar = 500      # similar interests taken per cluster member
    min_similarity = 0.01  # weakest candidate/root similarity that is kept
    max_users = 20         # upper bound on the number of users returned

    # Materialise once: roots is iterated several times below.
    roots = list(roots)
    LOGGER.debug('roots are %s', ', '.join([i.text for i in roots]))
    if not roots:
        # Without roots no candidate can be attached to anything.
        return set(), {}

    if clusters is None:
        clusters = dict((root, [root]) for root in roots)

    candidate_interests = set()
    for members in clusters.values():
        for interest in members:
            candidate_interests.update(interest.get_similar()[:max_similar])
    LOGGER.debug('num candidates is %s', len(candidate_interests))

    # Attach every sufficiently similar candidate to its closest root.
    closest_root = {}
    for candidate in candidate_interests:
        sims = [(candidate.get_similarity2(root), root) for root in roots]
        max_sim = max(s for (s, _) in sims)
        if max_sim >= min_similarity:
            best_roots = [root for (s, root) in sims if s == max_sim]
            closest_root[candidate] = random.choice(best_roots)

    # Relevance of a candidate: best similarity to any member of the cluster
    # of the root it was attached to.
    interest_rels = {}
    for candidate, root in closest_root.items():
        interest_rels[candidate] = max(
            [candidate.get_similarity2(member) for member in clusters[root]])

    # user -> root -> [related interests]
    user_relations = {}
    for related, root in closest_root.items():
        for user in utils.get_users_with_interest(related):
            if user not in user_relations:
                user_relations[user] = collections.defaultdict(list)
            user_relations[user][root].append(related)

    # Most relevant interest first.  Sorting ascending and then reversing is
    # kept on purpose: sort(reverse=True) would order ties differently.
    for relations in user_relations.values():
        for related in relations.values():
            related.sort(key=lambda i: interest_rels[i])
            related.reverse()

    # Score users and build a unit-length profile vector over the roots.
    user_relevances = {}
    user_profiles = {}
    for user, relations in user_relations.items():
        relevance = 0.0
        profile = []
        for root in roots:
            if root in relations:
                sims = [interest_rels[i] for i in relations[root]]
                # Each further interest under the same root counts half as
                # much as the one before it.
                score = sum(
                    [s * (0.5 ** rank) for (rank, s) in enumerate(sims)])
                # NOTE(review): this records the number of roots the user
                # matched, not len(relations[root]) -- confirm the intent.
                profile.append(len(relations))
                relevance += score
            else:
                profile.append(0)
        norm = sum([x * x for x in profile]) ** 0.5
        user_relevances[user] = relevance
        user_profiles[user] = [x / norm for x in profile]

    candidates = set(user_relevances.keys())
    if root_user:
        candidates.discard(root_user)

    # Greedy selection: take the candidate with the best relevance after
    # subtracting its overlap with the users already chosen.
    chosen = set()
    while candidates and len(chosen) < max_users:
        best_user = None
        best_score = None
        for u1 in candidates:
            overlaps = sorted(
                sum([x * y for (x, y)
                     in zip(user_profiles[u1], user_profiles[u2])])
                for u2 in chosen)
            redundancy = sum(overlaps[-2:])  # two largest similarities
            score = user_relevances[u1] - redundancy
            # Explicit None test: ordering a number against None only works
            # on Python 2.
            if best_score is None or score > best_score:
                best_score = score
                best_user = u1
        candidates.remove(best_user)
        chosen.add(best_user)

    return chosen, dict((u, user_relations[u]) for u in chosen)