Esempio n. 1
0
def make_full_interest_graph(root_interest):
    """Select and display users related to ``root_interest``.

    Builds the interest graph for ``root_interest`` with
    ``clusters.make_interest_graph``, gathers every user holding one of the
    graph's interests (or an interest returned by its ``get_similar()``),
    scores each user with ``get_relevance`` and then repeatedly asks
    ``choose_candidate`` for the next user until ``NUM_USERS`` users are
    selected or no candidates remain.  Each selected user is passed to
    ``show_candidate``; the function itself returns nothing.

    Args:
        root_interest: The interest the graph is built around.  It receives
            a relevance weight of 2.0; every other interest in the graph
            receives 0.5.
    """
    cluster_map = clusters.make_interest_graph(root_interest)
    graph_interests = cluster_map['map']
    closest_clusters = find_closest_clusters(graph_interests.keys())

    candidates = set()
    for root in graph_interests:
        for interest in [root] + root.get_similar():
            for user in utils.get_users_with_interest(interest):
                if user not in candidates:
                    # Cluster counts are set once, the first time a user
                    # is encountered.
                    user.set_cluster_counts(closest_clusters)
                    candidates.add(user)

    # Pass arguments lazily so the message is only built when DEBUG is on.
    LOGGER.debug('found %d candidates for interest %s',
                 len(candidates), root_interest)

    LOGGER.debug('scoring candidates...')
    relevance_weights = {
        interest: (2.0 if interest == root_interest else 0.5)
        for interest in graph_interests
    }
    scores = {
        user: get_relevance(user, closest_clusters, relevance_weights)
        for user in candidates
    }
    LOGGER.debug('finished scoring candidates...')

    DECAY = 0.7
    # Per-cluster selection weights; every cluster starts at 2.0.  These are
    # distinct from the per-interest relevance weights used for scoring.
    cluster_weights = collections.defaultdict(lambda: 2.0)
    results = []
    while candidates and len(results) < NUM_USERS:
        user = choose_candidate(candidates, scores, cluster_weights)
        candidates.remove(user)
        results.append(user)
        primaries = user.get_primary_clusters()
        # Shrink the weight of each of the picked user's primary clusters;
        # the total decay is split evenly across them.
        for cluster in primaries:
            cluster_weights[cluster] *= (1.0 - (1.0 - DECAY) / len(primaries))

    # NOTE(review): show_candidate receives the final, decayed cluster
    # weights (as in the original code), not the relevance weights.
    for user in results:
        show_candidate(user, closest_clusters, cluster_weights, scores[user])
Esempio n. 2
0
def make_full_interest_graph(root_interest):
    """Select and display users related to ``root_interest``.

    Builds the interest graph for ``root_interest`` with
    ``clusters.make_interest_graph``, gathers every user holding one of the
    graph's interests (or an interest returned by its ``get_similar()``),
    scores each user with ``get_relevance`` and then repeatedly asks
    ``choose_candidate`` for the next user until ``NUM_USERS`` users are
    selected or no candidates remain.  Each selected user is passed to
    ``show_candidate``; the function itself returns nothing.

    Args:
        root_interest: The interest the graph is built around.  It receives
            a relevance weight of 2.0; every other interest in the graph
            receives 0.5.
    """
    cluster_map = clusters.make_interest_graph(root_interest)
    graph_interests = cluster_map["map"]
    closest_clusters = find_closest_clusters(graph_interests.keys())

    candidates = set()
    for root in graph_interests:
        for interest in [root] + root.get_similar():
            for user in utils.get_users_with_interest(interest):
                if user not in candidates:
                    # Cluster counts are set once, the first time a user
                    # is encountered.
                    user.set_cluster_counts(closest_clusters)
                    candidates.add(user)

    # Pass arguments lazily so the message is only built when DEBUG is on.
    LOGGER.debug("found %d candidates for interest %s", len(candidates), root_interest)

    LOGGER.debug("scoring candidates...")
    relevance_weights = {
        interest: (2.0 if interest == root_interest else 0.5) for interest in graph_interests
    }
    scores = {
        user: get_relevance(user, closest_clusters, relevance_weights) for user in candidates
    }
    LOGGER.debug("finished scoring candidates...")

    DECAY = 0.7
    # Per-cluster selection weights; every cluster starts at 2.0.  These are
    # distinct from the per-interest relevance weights used for scoring.
    cluster_weights = collections.defaultdict(lambda: 2.0)
    results = []
    while candidates and len(results) < NUM_USERS:
        user = choose_candidate(candidates, scores, cluster_weights)
        candidates.remove(user)
        results.append(user)
        primaries = user.get_primary_clusters()
        # Shrink the weight of each of the picked user's primary clusters;
        # the total decay is split evenly across them.
        for cluster in primaries:
            cluster_weights[cluster] *= 1.0 - (1.0 - DECAY) / len(primaries)

    # NOTE(review): show_candidate receives the final, decayed cluster
    # weights (as in the original code), not the relevance weights.
    for user in results:
        show_candidate(user, closest_clusters, cluster_weights, scores[user])
Esempio n. 3
0
def find_user_results(roots, weights, root_user=None, clusters=None):
    """Choose up to 20 users whose interests relate to ``roots``.

    Candidate interests are the first 500 entries of ``get_similar()`` for
    every interest in ``clusters``.  Each candidate is assigned to the root
    with the highest ``get_similarity2`` value (ties broken at random);
    candidates whose best similarity is below 0.01 are dropped.  Users
    holding an assigned interest are scored, and users are then picked
    greedily by relevance minus a redundancy penalty computed against the
    users already chosen.

    Args:
        roots: Sequence of root interests; iterated several times.
        weights: Not used by this function; kept so existing callers
            continue to work.
        root_user: Optional user that is never returned.
        clusters: Mapping from a root to an iterable of the interests in its
            cluster.  It is indexed by every root that ends up with an
            assigned candidate.  ``None`` is treated as an empty mapping.

    Returns:
        A ``(chosen, relations)`` tuple.  ``chosen`` is the set of selected
        users; ``relations`` maps each chosen user to a
        ``{root: [interest, ...]}`` mapping whose lists are ordered from
        most to least similar.
    """
    LOGGER.debug("roots are %s", ", ".join(i.text for i in roots))

    # The signature defaults ``clusters`` to None; without this guard the
    # default would crash on ``.items()``.
    if clusters is None:
        clusters = {}

    candidate_interests = set()
    for members in clusters.values():
        for interest in members:
            candidate_interests.update(interest.get_similar()[:500])
    LOGGER.debug("num candidates is %s", len(candidate_interests))

    if not roots:
        # Nothing to relate candidates to; max() below would raise on an
        # empty list.
        return set(), {}

    # generate possible relations: map each candidate to its closest root
    closest_root = {}
    for candidate in candidate_interests:
        sims = [(candidate.get_similarity2(root), root) for root in roots]
        max_sim = max(s for (s, _) in sims)
        if max_sim >= 0.01:
            best_roots = [root for (s, root) in sims if s == max_sim]
            closest_root[candidate] = random.choice(best_roots)

    # similarity of each candidate interest to the closest element of its
    # root's cluster
    interest_rels = {
        candidate: max([candidate.get_similarity2(j) for j in clusters[root]])
        for (candidate, root) in closest_root.items()
    }

    # build up relations: user -> root -> [related interests]
    user_relations = {}
    for (related, root) in closest_root.items():
        for user in utils.get_users_with_interest(related):
            if user not in user_relations:
                user_relations[user] = collections.defaultdict(list)
            user_relations[user][root].append(related)

    # sort interests within relations by descending similarity
    for relations in user_relations.values():
        for related in relations.values():
            # Ascending sort followed by reverse (rather than reverse=True)
            # keeps the original ordering of equally similar interests.
            related.sort(key=lambda i: interest_rels[i])
            related.reverse()

    # score users
    user_relevances = {}
    user_profiles = {}
    for (user, relations) in user_relations.items():
        rel = 0.0
        profile = []
        for root in roots:
            if root in relations:
                sims = [interest_rels[i] for i in relations[root]]
                # Each further interest under the same root counts half as
                # much as the previous one.
                rel += sum(s * (0.5 ** penalty) for (penalty, s) in enumerate(sims))
                # NOTE(review): this records the number of roots the user is
                # related to, not len(relations[root]); kept as in the
                # original -- confirm which was intended.
                profile.append(len(relations))
            else:
                profile.append(0)
        # Normalise to unit length so the dot products below behave like
        # cosine similarities.
        norm = sum(x * x for x in profile) ** 0.5
        user_relevances[user] = rel
        user_profiles[user] = [x / norm for x in profile]

    # choose users
    candidates = set(user_relevances)
    if root_user and root_user in candidates:
        candidates.remove(root_user)

    chosen = set()
    while candidates and len(chosen) < 20:
        best_user = None
        best_score = None
        for u1 in candidates:
            sims = sorted(
                sum(x * y for (x, y) in zip(user_profiles[u1], user_profiles[u2]))
                for u2 in chosen
            )
            redundancy = sum(sims[-2:])  # two largest similarities
            score = user_relevances[u1] - redundancy
            # Explicit None check: comparing a number with None raises
            # TypeError on Python 3.
            if best_score is None or score > best_score:
                best_score = score
                best_user = u1
        candidates.remove(best_user)
        chosen.add(best_user)

    return chosen, {u: user_relations[u] for u in chosen}
Esempio n. 4
0
def find_user_results(roots, weights, root_user=None, clusters=None):
    """Choose up to 20 users whose interests relate to ``roots``.

    Candidate interests are the first 500 entries of ``get_similar()`` for
    every interest in ``clusters``.  Each candidate is assigned to the root
    with the highest ``get_similarity2`` value (ties broken at random);
    candidates whose best similarity is below 0.01 are dropped.  Users
    holding an assigned interest are scored, and users are then picked
    greedily by relevance minus a redundancy penalty computed against the
    users already chosen.

    Args:
        roots: Sequence of root interests; iterated several times.
        weights: Not used by this function; kept so existing callers
            continue to work.
        root_user: Optional user that is never returned.
        clusters: Mapping from a root to an iterable of the interests in its
            cluster.  It is indexed by every root that ends up with an
            assigned candidate.  ``None`` is treated as an empty mapping.

    Returns:
        A ``(chosen, relations)`` tuple.  ``chosen`` is the set of selected
        users; ``relations`` maps each chosen user to a
        ``{root: [interest, ...]}`` mapping whose lists are ordered from
        most to least similar.
    """
    LOGGER.debug('roots are %s', ', '.join(i.text for i in roots))

    # The signature defaults ``clusters`` to None; without this guard the
    # default would crash on ``.items()``.
    if clusters is None:
        clusters = {}

    candidate_interests = set()
    for members in clusters.values():
        for interest in members:
            candidate_interests.update(interest.get_similar()[:500])
    LOGGER.debug('num candidates is %s', len(candidate_interests))

    if not roots:
        # Nothing to relate candidates to; max() below would raise on an
        # empty list.
        return set(), {}

    # generate possible relations: map each candidate to its closest root
    closest_root = {}
    for candidate in candidate_interests:
        sims = [(candidate.get_similarity2(root), root) for root in roots]
        max_sim = max(s for (s, _) in sims)
        if max_sim >= 0.01:
            best_roots = [root for (s, root) in sims if s == max_sim]
            closest_root[candidate] = random.choice(best_roots)

    # similarity of each candidate interest to the closest element of its
    # root's cluster
    interest_rels = {
        candidate: max([candidate.get_similarity2(j) for j in clusters[root]])
        for (candidate, root) in closest_root.items()
    }

    # build up relations: user -> root -> [related interests]
    user_relations = {}
    for (related, root) in closest_root.items():
        for user in utils.get_users_with_interest(related):
            if user not in user_relations:
                user_relations[user] = collections.defaultdict(list)
            user_relations[user][root].append(related)

    # sort interests within relations by descending similarity
    for relations in user_relations.values():
        for related in relations.values():
            # Ascending sort followed by reverse (rather than reverse=True)
            # keeps the original ordering of equally similar interests.
            related.sort(key=lambda i: interest_rels[i])
            related.reverse()

    # score users
    user_relevances = {}
    user_profiles = {}
    for (user, relations) in user_relations.items():
        rel = 0.0
        profile = []
        for root in roots:
            if root in relations:
                sims = [interest_rels[i] for i in relations[root]]
                # Each further interest under the same root counts half as
                # much as the previous one.
                rel += sum(
                    s * (0.5**penalty) for (penalty, s) in enumerate(sims))
                # NOTE(review): this records the number of roots the user is
                # related to, not len(relations[root]); kept as in the
                # original -- confirm which was intended.
                profile.append(len(relations))
            else:
                profile.append(0)
        # Normalise to unit length so the dot products below behave like
        # cosine similarities.
        norm = sum(x * x for x in profile)**0.5
        user_relevances[user] = rel
        user_profiles[user] = [x / norm for x in profile]

    # choose users
    candidates = set(user_relevances)
    if root_user and root_user in candidates:
        candidates.remove(root_user)

    chosen = set()
    while candidates and len(chosen) < 20:
        best_user = None
        best_score = None
        for u1 in candidates:
            sims = sorted(
                sum(x * y
                    for (x, y) in zip(user_profiles[u1], user_profiles[u2]))
                for u2 in chosen)
            redundancy = sum(sims[-2:])  # two largest similarities
            score = user_relevances[u1] - redundancy
            # Explicit None check: comparing a number with None raises
            # TypeError on Python 3.
            if best_score is None or score > best_score:
                best_score = score
                best_user = u1
        candidates.remove(best_user)
        chosen.add(best_user)

    return chosen, {u: user_relations[u] for u in chosen}