Example #1
0
def find_similar_repos_jaccard_with_time_weight(from_repo_name, std, num_best = 100):
    """Return repos ranked by a weighted Jaccard similarity to ``from_repo_name``.

    Results are memoized in ``cache_find_similar_repos_jaccard_with_time_weight``
    under the key ``(from_repo_name, std, num_best)``.

    NOTE(review): one line inside the per-user loop is garbled in this copy of
    the source (see the marker below), so the way ``count_weight`` is derived
    is not visible here. Presumably it comes from ``my_norm`` and the gap
    between starring times -- confirm against the original file.

    Args:
        from_repo_name: repo name; appended to ``'repo:'`` to build the key
            passed to ``r.smembers`` and also looked up in ``stars``.
        std: second argument to ``scipy.stats.norm`` (the scale, i.e. the
            standard deviation of the zero-mean normal distribution).
        num_best: in the visible code this only participates in the cache key;
            the number of results is governed by ``NUM_TOP_REPOS``.

    Returns:
        A list of ``(repo, jaccard_similarity)`` tuples, highest similarity
        first, with at most ``NUM_TOP_REPOS`` entries.
    """
    # check cache
    # NOTE(review): has_key() is a Python 2 dict method; assumes the cache is a
    # plain dict -- confirm before running under Python 3.
    if cache_find_similar_repos_jaccard_with_time_weight.has_key((from_repo_name, std, num_best)):
        return cache_find_similar_repos_jaccard_with_time_weight[(from_repo_name, std, num_best)]

    # Maps candidate repo -> {'count_common_stars': ..., 'jaccard_similarity': ...}.
    repo_dict = {}
    # num_star_of_from_repo = r.scard('repo:' + from_repo_name)
    # Repos missing from `stars` count as having 0 stars.
    num_star_of_from_repo = stars.get(from_repo_name, 0)
    # presumably the set of users who starred the repo; verify the key schema of `r`
    users = r.smembers('repo:' + from_repo_name)
    # Zero-mean normal distribution whose scale is `std`.
    my_norm = scipy.stats.norm(0, std)
    for user in users:
        # get the time the user performed the starring event, type: float, millisec since epoch
        # NOTE(review): the next line is garbled in this copy of the source -- the
        # text between 'user:' and 'count_common_stars' has been masked out, which
        # swallowed the rest of the loop body (including the `if` that pairs with
        # the `else` below). Restore it from the original before running.
        user_star_from_repo_time = r.zscore('user:'******'user:'******'count_common_stars'] = count_weight
            else:
                # Repo already seen: accumulate this user's weight.
                repo_dict[user_starred_repo]['count_common_stars'] += count_weight

    # Flat repo -> similarity mapping, used as the ranking key below.
    repo_jaccard_dict = {}
    for to_repo in repo_dict:
        count_common_stars = repo_dict[to_repo]['count_common_stars']
        # num_star_of_to_repo = r.scard('repo:' + to_repo)
        num_star_of_to_repo = stars.get(to_repo, 0)
        # Jaccard form: common / (|A| + |B| - common), with the weighted count as "common".
        # NOTE(review): because stars.get() defaults to 0, the denominator can be
        # zero (ZeroDivisionError) or negative for repos missing from `stars`.
        jaccard_similarity = 1.0 * count_common_stars / (num_star_of_to_repo + num_star_of_from_repo - count_common_stars)
        repo_dict[to_repo]['jaccard_similarity'] = jaccard_similarity
        repo_jaccard_dict[to_repo] = jaccard_similarity

    # heapq.nlargest returns the keys ordered from largest to smallest similarity.
    # NOTE(review): this uses NUM_TOP_REPOS rather than num_best, even though
    # num_best is part of the cache key -- confirm which is intended.
    top_ranked_repos = heapq.nlargest(NUM_TOP_REPOS, repo_jaccard_dict, key=repo_jaccard_dict.get)
    
    result_list = []
    for to_repo in top_ranked_repos:
        result_list.append((to_repo, repo_dict[to_repo]['jaccard_similarity']))

    # store result_list to cache
    cache_find_similar_repos_jaccard_with_time_weight[(from_repo_name, std, num_best)] = result_list

    return result_list
def find_similar_repos(repo_name, type="lda", num_best=100):
    """Return up to ``num_best`` repos similar to ``repo_name``.

    Candidates come from ``model.query(repo_name, type)``, which must yield
    ``(name, score)`` pairs. Candidates whose entry in ``stars`` is below 30
    (or missing) are dropped, and the order produced by the model is kept.

    The result is memoized in ``cache`` under ``(repo_name, type, num_best)``.
    The truncated list is what gets cached, so a cache hit returns exactly the
    same value as the call that populated it.

    Args:
        repo_name: name of the repo to find neighbours for.
        type: similarity type forwarded to ``model.query`` (the name shadows
            the builtin but is kept because callers may pass it by keyword).
        num_best: maximum number of results to return.

    Returns:
        A list of ``(name, score)`` tuples with at most ``num_best`` entries.
    """
    min_stars = 30  # popularity threshold below which a candidate is ignored
    key = (repo_name, type, num_best)

    # `in` works on both Python 2 and 3; dict.has_key() is Python 2 only.
    if key in cache:
        return cache[key]

    sims = [
        (name, score)
        for (name, score) in model.query(repo_name, type)
        if stars.get(name, 0) >= min_stars
    ][:num_best]

    # Cache the truncated list so that hits and misses agree.
    cache[key] = sims
    return sims
Example #3
0
def find_similar_repos_jaccard_in_time_range(from_repo_name, time_range_in_day, num_best=100):
    """Return the ``num_best`` repos with the highest Jaccard similarity to ``from_repo_name``.

    Results are memoized in ``cache_find_similar_repos_jaccard_in_time_range``
    under the key ``(from_repo_name, time_range_in_day, num_best)``.

    NOTE(review): one line inside the per-user loop is garbled in this copy of
    the source (see the marker below). In the visible code ``time_range_in_day``
    only appears in the cache key; presumably the lost text used it to decide
    which co-starred repos are counted -- confirm against the original file.

    Args:
        from_repo_name: repo name; appended to ``'repo:'`` to build the key
            passed to ``r.smembers`` and also looked up in ``stars``.
        time_range_in_day: see the note above; its effect is not visible here.
        num_best: maximum number of results to return.

    Returns:
        A list of ``(repo, jaccard_similarity)`` tuples, highest similarity
        first, with at most ``num_best`` entries.
    """
    # check cache
    # NOTE(review): has_key() is a Python 2 dict method; assumes the cache is a
    # plain dict -- confirm before running under Python 3.
    if cache_find_similar_repos_jaccard_in_time_range.has_key((from_repo_name, time_range_in_day, num_best)):
        return cache_find_similar_repos_jaccard_in_time_range[(from_repo_name, time_range_in_day, num_best)]

    # Maps candidate repo -> {'count_common_stars': ..., 'jaccard_similarity': ...}.
    repo_dict = {}
    # num_star_of_from_repo = r.scard('repo:' + from_repo_name)
    # Repos missing from `stars` count as having 0 stars.
    num_star_of_from_repo = stars.get(from_repo_name, 0)
    # presumably the set of users who starred the repo; verify the key schema of `r`
    users = r.smembers('repo:' + from_repo_name)
    for user in users:
        # get the time the user performed the starring event, type: float, millisec since epoch
        # NOTE(review): the next line is garbled in this copy of the source -- the
        # text between 'user:' and 'count_common_stars' has been masked out, which
        # swallowed the rest of the loop body (including the `if` that pairs with
        # the `else` below). Restore it from the original before running.
        user_star_from_repo_time = r.zscore('user:'******'user:'******'count_common_stars'] = 1
            else:
                # Repo already seen: one more shared star.
                repo_dict[user_starred_repo]['count_common_stars'] += 1

    # Flat repo -> similarity mapping, used as the ranking key below.
    repo_jaccard_dict = {}
    for to_repo in repo_dict:
        count_common_stars = repo_dict[to_repo]['count_common_stars']
        # num_star_of_to_repo = r.scard('repo:' + to_repo)
        num_star_of_to_repo = stars.get(to_repo, 0)
        # Jaccard: common / (|A| + |B| - common).
        # NOTE(review): because stars.get() defaults to 0, the denominator can be
        # zero (ZeroDivisionError) or negative for repos missing from `stars`.
        jaccard_similarity = 1.0 * count_common_stars / (num_star_of_to_repo + num_star_of_from_repo - count_common_stars)
        repo_dict[to_repo]['jaccard_similarity'] = jaccard_similarity
        repo_jaccard_dict[to_repo] = jaccard_similarity

    # heapq.nlargest returns the keys ordered from largest to smallest similarity.
    top_ranked_repos = heapq.nlargest(num_best, repo_jaccard_dict, key=repo_jaccard_dict.get)
    
    result_list = []
    for to_repo in top_ranked_repos:
        result_list.append((to_repo, repo_dict[to_repo]['jaccard_similarity']))

    # store result_list to cache
    cache_find_similar_repos_jaccard_in_time_range[(from_repo_name, time_range_in_day, num_best)] = result_list

    return result_list
Example #4
0
def find_similar_repos_jaccard(from_repo_name, num_best = 100):
    """Return the ``num_best`` repos with the highest Jaccard similarity to ``from_repo_name``.

    Results are memoized in ``cache_find_similar_repos_jaccard`` under the key
    ``(from_repo_name, num_best)``.

    NOTE(review): one line inside the per-user loop is garbled in this copy of
    the source (see the marker below), so the iteration over each user's
    starred repos is not visible here -- confirm against the original file.

    Args:
        from_repo_name: repo name; appended to ``'repo:'`` to build the key
            passed to ``r.smembers`` and also looked up in ``stars``.
        num_best: maximum number of results to return.

    Returns:
        A list of ``(repo, jaccard_similarity)`` tuples, highest similarity
        first, with at most ``num_best`` entries.
    """
    # check cache
    # NOTE(review): has_key() is a Python 2 dict method; assumes the cache is a
    # plain dict -- confirm before running under Python 3.
    if cache_find_similar_repos_jaccard.has_key((from_repo_name, num_best)):
        return cache_find_similar_repos_jaccard[(from_repo_name, num_best)]

    # Maps candidate repo -> {'count_common_stars': ..., 'jaccard_similarity': ...}.
    repo_dict = {}
    # num_star_of_from_repo = r.scard('repo:' + from_repo_name)
    # Repos missing from `stars` count as having 0 stars.
    num_star_of_from_repo = stars.get(from_repo_name, 0)
    # presumably the set of users who starred the repo; verify the key schema of `r`
    users = r.smembers('repo:' + from_repo_name)
    for user in users:
        # get all repos the user starred
        # NOTE(review): the next line is garbled in this copy of the source -- the
        # text between 'user:' and 'count_common_stars' has been masked out, which
        # swallowed the rest of the loop body (including the `if` that pairs with
        # the `else` below). Restore it from the original before running.
        user_starred_repos = r.zrange('user:'******'count_common_stars'] = 1
            else:
                # Repo already seen: one more shared star.
                repo_dict[user_starred_repo]['count_common_stars'] += 1

    # Flat repo -> similarity mapping, used as the ranking key below.
    repo_jaccard_dict = {}
    for to_repo in repo_dict:
        count_common_stars = repo_dict[to_repo]['count_common_stars']
        # num_star_of_to_repo = r.scard('repo:' + to_repo)
        num_star_of_to_repo = stars.get(to_repo, 0)
        # Jaccard: common / (|A| + |B| - common).
        # NOTE(review): because stars.get() defaults to 0, the denominator can be
        # zero (ZeroDivisionError) or negative for repos missing from `stars`.
        jaccard_similarity = 1.0 * count_common_stars / (num_star_of_to_repo + num_star_of_from_repo - count_common_stars)
        repo_dict[to_repo]['jaccard_similarity'] = jaccard_similarity
        repo_jaccard_dict[to_repo] = jaccard_similarity

    # heapq.nlargest returns the keys ordered from largest to smallest similarity.
    top_ranked_repos = heapq.nlargest(num_best, repo_jaccard_dict, key=repo_jaccard_dict.get)
    
    result_list = []
    for to_repo in top_ranked_repos:
        result_list.append((to_repo, repo_dict[to_repo]['jaccard_similarity']))

    # store result_list to cache
    cache_find_similar_repos_jaccard[(from_repo_name, num_best)] = result_list

    return result_list