Ejemplo n.º 1
0
def make_full_person_graph(root_user):
    """Cluster a user's interests, find related users, and print the outcome.

    Logs the text of every interest of ``root_user`` whose ``sim_list`` has
    more than one entry, clusters all of the user's interests with
    ``cluster_user_interests``, prints each cluster, and finally prints the
    relations of every user returned by ``find_user_results``.

    Args:
        root_user: object exposing ``id`` and an iterable ``interests`` whose
            items expose ``text`` and ``sim_list``.

    Returns:
        None. Results are written to stdout and the debug log.
    """
    LOGGER.debug(
        "user %s interests are %s",
        root_user.id,
        ", ".join(i.text for i in root_user.interests if len(i.sim_list) > 1),
    )
    clusters = cluster_user_interests(set(root_user.interests))

    # A parenthesised single argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("person clusters are:")
    for rep, members in clusters.items():
        print("\t%s:%s" % (rep.text, [i.text for i in members]))

    # Materialise the cluster representatives once so the callee receives a
    # real list on Python 3 as well (keys() is a view there).
    roots = list(clusters.keys())
    weights = dict((r, 1.0) for r in roots)  # every root weighted equally
    users, relations = find_user_results(roots, weights, root_user, clusters)
    for u in users:
        print_user_relations(u, relations[u], "chose user with interests", "\t")
Ejemplo n.º 2
0
def make_full_person_graph(root_user):
    """Cluster a user's interests, find related users, and print the outcome.

    Logs the text of every interest of ``root_user`` whose ``sim_list`` has
    more than one entry, clusters all of the user's interests with
    ``cluster_user_interests``, prints each cluster, and finally prints the
    relations of every user returned by ``find_user_results``.

    Args:
        root_user: object exposing ``id`` and an iterable ``interests`` whose
            items expose ``text`` and ``sim_list``.

    Returns:
        None. Results are written to stdout and the debug log.
    """
    LOGGER.debug(
        'user %s interests are %s', root_user.id,
        ', '.join(i.text for i in root_user.interests
                  if len(i.sim_list) > 1))
    clusters = cluster_user_interests(set(root_user.interests))

    # A parenthesised single argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('person clusters are:')
    for rep, members in clusters.items():
        print('\t%s:%s' % (rep.text, [i.text for i in members]))

    # Materialise the cluster representatives once so the callee receives a
    # real list on Python 3 as well (keys() is a view there).
    roots = list(clusters.keys())
    weights = dict((r, 1.0) for r in roots)  # every root weighted equally
    users, relations = find_user_results(roots, weights, root_user, clusters)
    for u in users:
        print_user_relations(u, relations[u], 'chose user with interests',
                             '\t')
Ejemplo n.º 3
0
def find_user_results(roots, weights, root_user=None, clusters=None):
    """Select up to 20 users related to ``roots``, penalising redundancy.

    The selection proceeds in stages:

    1. Candidate interests are the first 500 entries of ``get_similar()`` for
       every interest in every cluster.
    2. Each candidate is assigned to the root with the highest
       ``get_similarity2`` value (ties broken by ``random.choice``);
       candidates whose best similarity is below 0.01 are dropped.
    3. A candidate's strength is its highest similarity to any member of the
       cluster of its assigned root.
    4. For every user returned by ``utils.get_users_with_interest`` the
       candidates are grouped per root and ordered strongest first.
    5. A user's relevance is, summed over roots, the strengths of its
       interests discounted by ``0.5 ** rank``.
    6. Users are picked greedily by relevance minus the sum of the two
       largest profile dot products with the users already picked.

    Args:
        roots: re-iterable collection of interest objects (need ``text`` and
            are passed to ``get_similarity2``); also used as cluster keys.
        weights: accepted for interface compatibility; not used here.
        root_user: if truthy, this user is never returned.
        clusters: mapping of root -> iterable of interests. Despite the
            ``None`` default it must be supplied, since it is iterated
            unconditionally.

    Returns:
        A tuple ``(chosen, relations)`` where ``chosen`` is a set of users
        and ``relations`` maps each chosen user to a
        ``defaultdict(list)`` of root -> related interests.
    """
    LOGGER.debug("roots are %s", ", ".join(i.text for i in roots))
    candidate_interests = set()
    for members in clusters.values():
        for interest in members:
            candidate_interests.update(interest.get_similar()[:500])
    LOGGER.debug("num candidates is %s", len(candidate_interests))

    # generate possible relations: candidate -> most similar root
    closest_root = {}
    for cand in candidate_interests:
        sims = [(cand.get_similarity2(r), r) for r in roots]
        max_sim = max(s for (s, _) in sims)
        if max_sim >= 0.01:
            best_roots = [r for (s, r) in sims if s == max_sim]
            closest_root[cand] = random.choice(best_roots)

    # similarity of each candidate to the closest element of its root's cluster
    interest_rels = {}
    for (cand, root) in closest_root.items():
        interest_rels[cand] = max(cand.get_similarity2(j) for j in clusters[root])

    # build up relations: user -> root -> [related interests]
    user_relations = {}
    for (related, root) in closest_root.items():
        for user in utils.get_users_with_interest(related):
            if user not in user_relations:
                user_relations[user] = collections.defaultdict(list)
            user_relations[user][root].append(related)

    # sort interests within relations by similarity, strongest first
    # (sort + reverse kept so that tie ordering is unchanged)
    for relations in user_relations.values():
        for related in relations.values():
            related.sort(key=lambda i: interest_rels[i])
            related.reverse()

    # score users
    user_relevances = {}
    user_profiles = {}
    for (user, relations) in user_relations.items():
        rel = 0.0
        profile = []
        for root in roots:
            if root in relations:
                sims = [interest_rels[i] for i in relations[root]]
                # each further interest under the same root counts half as much
                rel += sum(s * (0.5 ** penalty) for (penalty, s) in enumerate(sims))
                # NOTE(review): this records the number of roots the user is
                # related to, not len(relations[root]); confirm that is intended.
                profile.append(len(relations))
            else:
                profile.append(0)
        # normalise to unit length so the dot products below are cosines
        norm = sum(x * x for x in profile) ** 0.5
        user_relevances[user] = rel
        user_profiles[user] = [x / norm for x in profile]

    # choose users
    candidates = set(user_relevances.keys())
    if root_user and root_user in candidates:
        candidates.remove(root_user)

    chosen = set()
    while candidates and len(chosen) < 20:
        best_user = None
        best_score = None
        for u1 in candidates:
            sims = sorted(
                sum(x * y for (x, y) in zip(user_profiles[u1], user_profiles[u2]))
                for u2 in chosen
            )
            redundancy = sum(sims[-2:])  # two largest similarities
            score = user_relevances[u1] - redundancy
            # Explicit None check: ordering a number against None only works
            # on Python 2 and raises TypeError on Python 3.
            if best_score is None or score > best_score:
                best_score = score
                best_user = u1
        candidates.remove(best_user)
        chosen.add(best_user)

    return chosen, dict((u, user_relations[u]) for u in chosen)
Ejemplo n.º 4
0
def find_user_results(roots, weights, root_user=None, clusters=None):
    """Select up to 20 users related to ``roots``, penalising redundancy.

    The selection proceeds in stages:

    1. Candidate interests are the first 500 entries of ``get_similar()`` for
       every interest in every cluster.
    2. Each candidate is assigned to the root with the highest
       ``get_similarity2`` value (ties broken by ``random.choice``);
       candidates whose best similarity is below 0.01 are dropped.
    3. A candidate's strength is its highest similarity to any member of the
       cluster of its assigned root.
    4. For every user returned by ``utils.get_users_with_interest`` the
       candidates are grouped per root and ordered strongest first.
    5. A user's relevance is, summed over roots, the strengths of its
       interests discounted by ``0.5 ** rank``.
    6. Users are picked greedily by relevance minus the sum of the two
       largest profile dot products with the users already picked.

    Args:
        roots: re-iterable collection of interest objects (need ``text`` and
            are passed to ``get_similarity2``); also used as cluster keys.
        weights: accepted for interface compatibility; not used here.
        root_user: if truthy, this user is never returned.
        clusters: mapping of root -> iterable of interests. Despite the
            ``None`` default it must be supplied, since it is iterated
            unconditionally.

    Returns:
        A tuple ``(chosen, relations)`` where ``chosen`` is a set of users
        and ``relations`` maps each chosen user to a
        ``defaultdict(list)`` of root -> related interests.
    """
    LOGGER.debug('roots are %s', ', '.join(i.text for i in roots))
    candidate_interests = set()
    for members in clusters.values():
        for interest in members:
            candidate_interests.update(interest.get_similar()[:500])
    LOGGER.debug('num candidates is %s', len(candidate_interests))

    # generate possible relations: candidate -> most similar root
    closest_root = {}
    for cand in candidate_interests:
        sims = [(cand.get_similarity2(r), r) for r in roots]
        max_sim = max(s for (s, _) in sims)
        if max_sim >= 0.01:
            best_roots = [r for (s, r) in sims if s == max_sim]
            closest_root[cand] = random.choice(best_roots)

    # similarity of each candidate to the closest element of its root's cluster
    interest_rels = {}
    for (cand, root) in closest_root.items():
        interest_rels[cand] = max(
            cand.get_similarity2(j) for j in clusters[root])

    # build up relations: user -> root -> [related interests]
    user_relations = {}
    for (related, root) in closest_root.items():
        for user in utils.get_users_with_interest(related):
            if user not in user_relations:
                user_relations[user] = collections.defaultdict(list)
            user_relations[user][root].append(related)

    # sort interests within relations by similarity, strongest first
    # (sort + reverse kept so that tie ordering is unchanged)
    for relations in user_relations.values():
        for related in relations.values():
            related.sort(key=lambda i: interest_rels[i])
            related.reverse()

    # score users
    user_relevances = {}
    user_profiles = {}
    for (user, relations) in user_relations.items():
        rel = 0.0
        profile = []
        for root in roots:
            if root in relations:
                sims = [interest_rels[i] for i in relations[root]]
                # each further interest under the same root counts half as much
                rel += sum(
                    s * (0.5**penalty) for (penalty, s) in enumerate(sims))
                # NOTE(review): this records the number of roots the user is
                # related to, not len(relations[root]); confirm that is intended.
                profile.append(len(relations))
            else:
                profile.append(0)
        # normalise to unit length so the dot products below are cosines
        norm = sum(x * x for x in profile)**0.5
        user_relevances[user] = rel
        user_profiles[user] = [x / norm for x in profile]

    # choose users
    candidates = set(user_relevances.keys())
    if root_user and root_user in candidates:
        candidates.remove(root_user)

    chosen = set()
    while candidates and len(chosen) < 20:
        best_user = None
        best_score = None
        for u1 in candidates:
            sims = sorted(
                sum(x * y
                    for (x, y) in zip(user_profiles[u1], user_profiles[u2]))
                for u2 in chosen)
            redundancy = sum(sims[-2:])  # two largest similarities
            score = user_relevances[u1] - redundancy
            # Explicit None check: ordering a number against None only works
            # on Python 2 and raises TypeError on Python 3.
            if best_score is None or score > best_score:
                best_score = score
                best_user = u1
        candidates.remove(best_user)
        chosen.add(best_user)

    return chosen, dict((u, user_relations[u]) for u in chosen)