def make_full_interest_graph(root_interest):
    """Pick and display up to ``NUM_USERS`` users related to ``root_interest``.

    Steps:
      1. Build the cluster map with ``clusters.make_interest_graph`` and the
         closest clusters with ``find_closest_clusters``.
      2. Collect every user holding an interest from ``cluster_map['map']`` or
         from one of those interests' ``get_similar()`` results, calling
         ``set_cluster_counts`` once per user.
      3. Score each candidate with ``get_relevance`` (weight 2.0 for the root
         interest, 0.5 for every other interest in the map).
      4. Repeatedly pick a user with ``choose_candidate``; after each pick the
         weight of every primary cluster of that user is multiplied by a
         factor below 1 before the next pick.
      5. Pass each picked user to ``show_candidate``.

    Returns nothing; results are only reported through ``show_candidate``.
    """
    cluster_map = clusters.make_interest_graph(root_interest)
    interest_map = cluster_map['map']
    closest_clusters = find_closest_clusters(interest_map.keys())

    # Gather candidate users; set_cluster_counts runs once per distinct user.
    candidates = set()
    for root in interest_map:
        for interest in [root] + root.get_similar():
            for user in utils.get_users_with_interest(interest):
                if user not in candidates:
                    user.set_cluster_counts(closest_clusters)
                    candidates.add(user)
    # Lazy %-args: the message is only formatted when DEBUG is enabled.
    LOGGER.debug('found %d candidates for interest %s',
                 len(candidates), root_interest)

    # NOTE(review): closest_counts is never read afterwards. It is kept only
    # because get_primary_clusters() is opaque from here; drop this loop once
    # that call is confirmed to be side-effect free.
    closest_counts = collections.defaultdict(int)
    for user in candidates:
        primary = user.get_primary_clusters()
        if len(primary) == 1:
            closest_counts[primary[0]] += 1

    LOGGER.debug('scoring candidates...')
    relevance_weights = {
        interest: (2.0 if interest == root_interest else 0.5)
        for interest in interest_map
    }
    scores = {
        user: get_relevance(user, closest_clusters, relevance_weights)
        for user in candidates
    }
    LOGGER.debug('finished scoring candidates...')

    DECAY = 0.7
    # Per-cluster selection weights; every cluster starts at 2.0.
    cluster_weights = collections.defaultdict(lambda: 2.0)
    results = []
    while candidates and len(results) < NUM_USERS:
        user = choose_candidate(candidates, scores, cluster_weights)
        candidates.remove(user)
        results.append(user)
        primaries = user.get_primary_clusters()
        # The (1 - DECAY) reduction is split evenly across the user's
        # primary clusters; an empty list leaves the weights untouched.
        for cluster in primaries:
            cluster_weights[cluster] *= 1.0 - (1.0 - DECAY) / len(primaries)

    for user in results:
        show_candidate(user, closest_clusters, cluster_weights, scores[user])
def make_full_interest_graph(root_interest):
    """Pick and display up to ``NUM_USERS`` users related to ``root_interest``.

    Steps:
      1. Build the cluster map with ``clusters.make_interest_graph`` and the
         closest clusters with ``find_closest_clusters``.
      2. Collect every user holding an interest from ``cluster_map["map"]`` or
         from one of those interests' ``get_similar()`` results, calling
         ``set_cluster_counts`` once per user.
      3. Score each candidate with ``get_relevance`` (weight 2.0 for the root
         interest, 0.5 for every other interest in the map).
      4. Repeatedly pick a user with ``choose_candidate``; after each pick the
         weight of every primary cluster of that user is multiplied by a
         factor below 1 before the next pick.
      5. Pass each picked user to ``show_candidate``.

    Returns nothing; results are only reported through ``show_candidate``.
    """
    cluster_map = clusters.make_interest_graph(root_interest)
    interest_map = cluster_map["map"]
    closest_clusters = find_closest_clusters(interest_map.keys())

    # Gather candidate users; set_cluster_counts runs once per distinct user.
    candidates = set()
    for root in interest_map:
        for interest in [root] + root.get_similar():
            for user in utils.get_users_with_interest(interest):
                if user not in candidates:
                    user.set_cluster_counts(closest_clusters)
                    candidates.add(user)
    # Lazy %-args: the message is only formatted when DEBUG is enabled.
    LOGGER.debug(
        "found %d candidates for interest %s", len(candidates), root_interest
    )

    # NOTE(review): closest_counts is never read afterwards. It is kept only
    # because get_primary_clusters() is opaque from here; drop this loop once
    # that call is confirmed to be side-effect free.
    closest_counts = collections.defaultdict(int)
    for user in candidates:
        primary = user.get_primary_clusters()
        if len(primary) == 1:
            closest_counts[primary[0]] += 1

    LOGGER.debug("scoring candidates...")
    relevance_weights = {
        interest: (2.0 if interest == root_interest else 0.5)
        for interest in interest_map
    }
    scores = {
        user: get_relevance(user, closest_clusters, relevance_weights)
        for user in candidates
    }
    LOGGER.debug("finished scoring candidates...")

    DECAY = 0.7
    # Per-cluster selection weights; every cluster starts at 2.0.
    cluster_weights = collections.defaultdict(lambda: 2.0)
    results = []
    while candidates and len(results) < NUM_USERS:
        user = choose_candidate(candidates, scores, cluster_weights)
        candidates.remove(user)
        results.append(user)
        primaries = user.get_primary_clusters()
        # The (1 - DECAY) reduction is split evenly across the user's
        # primary clusters; an empty list leaves the weights untouched.
        for cluster in primaries:
            cluster_weights[cluster] *= 1.0 - (1.0 - DECAY) / len(primaries)

    for user in results:
        show_candidate(user, closest_clusters, cluster_weights, scores[user])
def find_user_results(roots, weights, root_user=None, clusters=None):
    """Choose up to 20 users whose interests relate to ``roots``.

    Candidate interests are the first 500 ``get_similar()`` results of every
    interest in ``clusters``.  Each candidate whose best ``get_similarity2``
    to a root is at least 0.01 is assigned to that root (ties are broken with
    ``random.choice``).  Users holding assigned interests get a relevance
    score and a normalised per-root profile; users are then picked greedily
    by relevance minus a redundancy penalty against users already picked.

    Args:
        roots: sequence of interest objects; ``text`` and ``get_similarity2``
            are used on them, and each assigned root is looked up in
            ``clusters``.
        weights: not used by this function.
        root_user: optional user to leave out of the results.
        clusters: mapping of root -> iterable of interests in its cluster.
            It defaults to ``None`` but is required in practice, because its
            ``values()`` are iterated unconditionally.

    Returns:
        A ``(chosen, relations)`` tuple: the set of chosen users, and a dict
        mapping each chosen user to a ``defaultdict`` of root -> list of
        related interests, most similar first.
    """
    LOGGER.debug("roots are %s", ", ".join([i.text for i in roots]))

    candidate_interests = set()
    for cluster in clusters.values():
        for interest in cluster:
            candidate_interests.update(interest.get_similar()[:500])
    LOGGER.debug("num candidates is %s", len(candidate_interests))

    if not roots:
        # Nothing to relate candidates to; max() below would raise on an
        # empty sequence.
        return set(), {}

    # Assign each candidate interest to its most similar root.
    closest_root = {}
    for interest in candidate_interests:
        sims = [(interest.get_similarity2(root), root) for root in roots]
        max_sim = max(s for (s, _) in sims)
        if max_sim >= 0.01:
            best_roots = [root for (s, root) in sims if s == max_sim]
            closest_root[interest] = random.choice(best_roots)

    # Best similarity of each assigned interest to any member of the cluster
    # of its root.
    interest_rels = {}
    for interest, root in closest_root.items():
        interest_rels[interest] = max(
            interest.get_similarity2(member) for member in clusters[root]
        )

    # user -> root -> interests of that user assigned to the root.
    user_relations = {}
    for related, root in closest_root.items():
        for user in utils.get_users_with_interest(related):
            if user not in user_relations:
                user_relations[user] = collections.defaultdict(list)
            user_relations[user][root].append(related)

    # Order each user's interests per root, most similar first.
    for relations in user_relations.values():
        for related in relations.values():
            related.sort(key=lambda i: interest_rels[i])
            related.reverse()

    # Score users.
    user_relevances = {}
    user_profiles = {}
    for user, relations in user_relations.items():
        rel = 0.0
        profile = []
        for root in roots:
            if root in relations:
                # Each further interest under the same root counts half as
                # much as the one before it.
                rel += sum(
                    interest_rels[i] * (0.5 ** rank)
                    for rank, i in enumerate(relations[root])
                )
                # NOTE(review): this appends the number of roots the user
                # relates to, not len(relations[root]); confirm that is the
                # intended profile value.
                profile.append(len(relations))
            else:
                profile.append(0)
        # Every user in user_relations has at least one root taken from
        # `roots`, so the norm is non-zero.
        norm = sum(x * x for x in profile) ** 0.5
        user_relevances[user] = rel
        user_profiles[user] = [x / norm for x in profile]

    # Choose users greedily.
    candidates = set(user_relevances)
    if root_user and root_user in candidates:
        candidates.remove(root_user)

    chosen = set()
    while candidates and len(chosen) < 20:
        best_user = None
        best_score = None
        for u1 in candidates:
            sims = sorted(
                sum(x * y for (x, y) in zip(user_profiles[u1], user_profiles[u2]))
                for u2 in chosen
            )
            # Sum of the two largest profile similarities to users already
            # chosen.
            # NOTE(review): an earlier comment here said "three largest";
            # confirm whether sims[-3:] was intended.
            redundancy = sum(sims[-2:])
            score = user_relevances[u1] - redundancy
            # Explicit None check: ordering a number against None raises
            # TypeError on Python 3.
            if best_score is None or score > best_score:
                best_score = score
                best_user = u1
        candidates.remove(best_user)
        chosen.add(best_user)

    return chosen, {u: user_relations[u] for u in chosen}
def find_user_results(roots, weights, root_user=None, clusters=None):
    """Choose up to 20 users whose interests relate to ``roots``.

    Candidate interests are the first 500 ``get_similar()`` results of every
    interest in ``clusters``.  Each candidate whose best ``get_similarity2``
    to a root is at least 0.01 is assigned to that root (ties are broken with
    ``random.choice``).  Users holding assigned interests get a relevance
    score and a normalised per-root profile; users are then picked greedily
    by relevance minus a redundancy penalty against users already picked.

    Args:
        roots: sequence of interest objects; ``text`` and ``get_similarity2``
            are used on them, and each assigned root is looked up in
            ``clusters``.
        weights: not used by this function.
        root_user: optional user to leave out of the results.
        clusters: mapping of root -> iterable of interests in its cluster.
            It defaults to ``None`` but is required in practice, because its
            ``values()`` are iterated unconditionally.

    Returns:
        A ``(chosen, relations)`` tuple: the set of chosen users, and a dict
        mapping each chosen user to a ``defaultdict`` of root -> list of
        related interests, most similar first.
    """
    LOGGER.debug('roots are %s', ', '.join([i.text for i in roots]))

    candidate_interests = set()
    for cluster in clusters.values():
        for interest in cluster:
            candidate_interests.update(interest.get_similar()[:500])
    LOGGER.debug('num candidates is %s', len(candidate_interests))

    if not roots:
        # Nothing to relate candidates to; max() below would raise on an
        # empty sequence.
        return set(), {}

    # Assign each candidate interest to its most similar root.
    closest_root = {}
    for interest in candidate_interests:
        sims = [(interest.get_similarity2(root), root) for root in roots]
        max_sim = max(s for (s, _) in sims)
        if max_sim >= 0.01:
            best_roots = [root for (s, root) in sims if s == max_sim]
            closest_root[interest] = random.choice(best_roots)

    # Best similarity of each assigned interest to any member of the cluster
    # of its root.
    interest_rels = {}
    for interest, root in closest_root.items():
        interest_rels[interest] = max(
            interest.get_similarity2(member) for member in clusters[root])

    # user -> root -> interests of that user assigned to the root.
    user_relations = {}
    for related, root in closest_root.items():
        for user in utils.get_users_with_interest(related):
            if user not in user_relations:
                user_relations[user] = collections.defaultdict(list)
            user_relations[user][root].append(related)

    # Order each user's interests per root, most similar first.
    for relations in user_relations.values():
        for related in relations.values():
            related.sort(key=lambda i: interest_rels[i])
            related.reverse()

    # Score users.
    user_relevances = {}
    user_profiles = {}
    for user, relations in user_relations.items():
        rel = 0.0
        profile = []
        for root in roots:
            if root in relations:
                # Each further interest under the same root counts half as
                # much as the one before it.
                rel += sum(
                    interest_rels[i] * (0.5**rank)
                    for rank, i in enumerate(relations[root]))
                # NOTE(review): this appends the number of roots the user
                # relates to, not len(relations[root]); confirm that is the
                # intended profile value.
                profile.append(len(relations))
            else:
                profile.append(0)
        # Every user in user_relations has at least one root taken from
        # `roots`, so the norm is non-zero.
        norm = sum(x * x for x in profile)**0.5
        user_relevances[user] = rel
        user_profiles[user] = [x / norm for x in profile]

    # Choose users greedily.
    candidates = set(user_relevances)
    if root_user and root_user in candidates:
        candidates.remove(root_user)

    chosen = set()
    while candidates and len(chosen) < 20:
        best_user = None
        best_score = None
        for u1 in candidates:
            sims = sorted(
                sum(x * y
                    for (x, y) in zip(user_profiles[u1], user_profiles[u2]))
                for u2 in chosen)
            # Sum of the two largest profile similarities to users already
            # chosen.
            # NOTE(review): an earlier comment here said "three largest";
            # confirm whether sims[-3:] was intended.
            redundancy = sum(sims[-2:])
            score = user_relevances[u1] - redundancy
            # Explicit None check: ordering a number against None raises
            # TypeError on Python 3.
            if best_score is None or score > best_score:
                best_score = score
                best_user = u1
        candidates.remove(best_user)
        chosen.add(best_user)

    return chosen, {u: user_relations[u] for u in chosen}