# find_similar_repos_jaccard_with_time_weight(from_repo_name, std, num_best=100)
#
# Purpose (from the visible code): rank other repositories by a weighted
# Jaccard-style similarity to `from_repo_name` and return a list of
# (repo_name, jaccard_similarity) tuples, memoised in the module-level dict
# `cache_find_similar_repos_jaccard_with_time_weight` under the key
# (from_repo_name, std, num_best).
#
# Visible steps:
#   1. Return the cached list if the key is already present.
#   2. Read the star count of `from_repo_name` from `stars` (default 0) and
#      the members of the set 'repo:' + from_repo_name via `r.smembers`.
#   3. Build `my_norm = scipy.stats.norm(0, std)`.
#   4. For each member, look up a score with `r.zscore` (the inline comment
#      says it is the starring time, a float in milliseconds since epoch) and
#      accumulate `count_weight` into
#      repo_dict[user_starred_repo]['count_common_stars'].
#   5. For every collected repo compute
#      count_common_stars / (stars[to_repo] + stars[from_repo] - count_common_stars).
#   6. Keep the best entries with `heapq.nlargest`, cache and return them.
#
# NOTE(review): this definition is collapsed onto one physical line, so Python
#   treats everything after the first '#' as a comment; the original line
#   breaks and indentation must be restored before it can run.
# NOTE(review): part of the per-user loop is redacted ('******'). The code
#   that defines `count_weight`, `user_starred_repo` and the use of `my_norm`
#   is not visible here -- presumably the weight comes from `my_norm` applied
#   to a time difference; confirm against the original source.
# NOTE(review): the result size is taken from `NUM_TOP_REPOS`, while
#   `num_best` only appears in the cache key in the visible text; the sibling
#   functions pass `num_best` to `heapq.nlargest`. Confirm whether that is
#   intended.
# NOTE(review): `has_key` exists only on Python 2 dicts.
# NOTE(review): the denominator is built from `stars.get(..., 0)` values; if
#   it ever equals zero the division raises ZeroDivisionError -- verify that
#   `stars` always covers the repos seen here.
def find_similar_repos_jaccard_with_time_weight(from_repo_name, std, num_best = 100): # check cache if cache_find_similar_repos_jaccard_with_time_weight.has_key((from_repo_name, std, num_best)): return cache_find_similar_repos_jaccard_with_time_weight[(from_repo_name, std, num_best)] repo_dict = {} # num_star_of_from_repo = r.scard('repo:' + from_repo_name) num_star_of_from_repo = stars.get(from_repo_name, 0) users = r.smembers('repo:' + from_repo_name) my_norm = scipy.stats.norm(0, std) for user in users: # get the time the user performed the starring event, type: float, millisec since epoch user_star_from_repo_time = r.zscore('user:'******'user:'******'count_common_stars'] = count_weight else: repo_dict[user_starred_repo]['count_common_stars'] += count_weight repo_jaccard_dict = {} for to_repo in repo_dict: count_common_stars = repo_dict[to_repo]['count_common_stars'] # num_star_of_to_repo = r.scard('repo:' + to_repo) num_star_of_to_repo = stars.get(to_repo, 0) jaccard_similarity = 1.0 * count_common_stars / (num_star_of_to_repo + num_star_of_from_repo - count_common_stars) repo_dict[to_repo]['jaccard_similarity'] = jaccard_similarity repo_jaccard_dict[to_repo] = jaccard_similarity top_ranked_repos = heapq.nlargest(NUM_TOP_REPOS, repo_jaccard_dict, key=repo_jaccard_dict.get) result_list = [] for to_repo in top_ranked_repos: result_list.append((to_repo, repo_dict[to_repo]['jaccard_similarity'])) # store result_list to cache cache_find_similar_repos_jaccard_with_time_weight[(from_repo_name, std, num_best)] = result_list return result_list
def find_similar_repos(repo_name, type="lda", num_best=100):
    """Return up to ``num_best`` repositories similar to ``repo_name``.

    Candidates are produced by ``model.query(repo_name, type)`` and kept only
    when their entry in the module-level ``stars`` mapping is at least 30
    (repositories missing from ``stars`` count as 0 and are dropped). The
    result is memoised in the module-level ``cache`` under the key
    ``(repo_name, type, num_best)``.

    Args:
        repo_name: Name of the repository to find neighbours for.
        type: Model identifier forwarded unchanged to ``model.query``. The
            name shadows the builtin, but it is part of the public signature
            and callers may pass it by keyword, so it is kept.
        num_best: Maximum number of results to return.

    Returns:
        A list of ``(name, score)`` tuples in the order yielded by
        ``model.query``, holding at most ``num_best`` entries.
    """
    key = (repo_name, type, num_best)
    # Membership test instead of dict.has_key(), which was removed in
    # Python 3; `in` behaves the same on Python 2 dicts.
    if key in cache:
        return cache[key]

    sims = [
        (name, score)
        for (name, score) in model.query(repo_name, type)
        if stars.get(name, 0) >= 30
    ]
    # Truncate BEFORE caching: the key already contains num_best, and storing
    # the untruncated list made a cache hit return more items than the
    # first (uncached) call for the same arguments.
    result = sims[:num_best]
    cache[key] = result
    return result
# find_similar_repos_jaccard_in_time_range(from_repo_name, time_range_in_day, num_best=100)
#
# Purpose (from the visible code): rank other repositories by Jaccard
# similarity to `from_repo_name` and return the `num_best` highest as a list
# of (repo_name, jaccard_similarity) tuples, memoised in the module-level dict
# `cache_find_similar_repos_jaccard_in_time_range` under the key
# (from_repo_name, time_range_in_day, num_best).
#
# Visible steps:
#   1. Return the cached list if the key is already present.
#   2. Read the star count of `from_repo_name` from `stars` (default 0) and
#      the members of the set 'repo:' + from_repo_name via `r.smembers`.
#   3. For each member, look up a score with `r.zscore` (the inline comment
#      says it is the starring time, a float in milliseconds since epoch) and
#      add 1 to repo_dict[user_starred_repo]['count_common_stars'] for each
#      repo collected for that member.
#   4. For every collected repo compute
#      count_common_stars / (stars[to_repo] + stars[from_repo] - count_common_stars).
#   5. Keep the `num_best` largest with `heapq.nlargest`, cache and return.
#
# NOTE(review): this definition is collapsed onto one physical line, so Python
#   treats everything after the first '#' as a comment; the original line
#   breaks and indentation must be restored before it can run.
# NOTE(review): part of the per-user loop is redacted ('******'). How
#   `time_range_in_day` limits the repos considered is not visible here --
#   presumably a score-range query around the starring time; confirm against
#   the original source.
# NOTE(review): `has_key` exists only on Python 2 dicts.
# NOTE(review): the denominator is built from `stars.get(..., 0)` values; if
#   it ever equals zero the division raises ZeroDivisionError -- verify that
#   `stars` always covers the repos seen here.
def find_similar_repos_jaccard_in_time_range(from_repo_name, time_range_in_day, num_best=100): # check cache if cache_find_similar_repos_jaccard_in_time_range.has_key((from_repo_name, time_range_in_day, num_best)): return cache_find_similar_repos_jaccard_in_time_range[(from_repo_name, time_range_in_day, num_best)] repo_dict = {} # num_star_of_from_repo = r.scard('repo:' + from_repo_name) num_star_of_from_repo = stars.get(from_repo_name, 0) users = r.smembers('repo:' + from_repo_name) for user in users: # get the time the user performed the starring event, type: float, millisec since epoch user_star_from_repo_time = r.zscore('user:'******'user:'******'count_common_stars'] = 1 else: repo_dict[user_starred_repo]['count_common_stars'] += 1 repo_jaccard_dict = {} for to_repo in repo_dict: count_common_stars = repo_dict[to_repo]['count_common_stars'] # num_star_of_to_repo = r.scard('repo:' + to_repo) num_star_of_to_repo = stars.get(to_repo, 0) jaccard_similarity = 1.0 * count_common_stars / (num_star_of_to_repo + num_star_of_from_repo - count_common_stars) repo_dict[to_repo]['jaccard_similarity'] = jaccard_similarity repo_jaccard_dict[to_repo] = jaccard_similarity top_ranked_repos = heapq.nlargest(num_best, repo_jaccard_dict, key=repo_jaccard_dict.get) result_list = [] for to_repo in top_ranked_repos: result_list.append((to_repo, repo_dict[to_repo]['jaccard_similarity'])) # store result_list to cache cache_find_similar_repos_jaccard_in_time_range[(from_repo_name, time_range_in_day, num_best)] = result_list return result_list
# find_similar_repos_jaccard(from_repo_name, num_best=100)
#
# Purpose (from the visible code): rank other repositories by Jaccard
# similarity to `from_repo_name` and return the `num_best` highest as a list
# of (repo_name, jaccard_similarity) tuples, memoised in the module-level dict
# `cache_find_similar_repos_jaccard` under the key (from_repo_name, num_best).
#
# Visible steps:
#   1. Return the cached list if the key is already present.
#   2. Read the star count of `from_repo_name` from `stars` (default 0) and
#      the members of the set 'repo:' + from_repo_name via `r.smembers`.
#   3. For each member, fetch repos with `r.zrange` (the inline comment says
#      "all repos the user starred") and add 1 to
#      repo_dict[user_starred_repo]['count_common_stars'] per repo.
#   4. For every collected repo compute
#      count_common_stars / (stars[to_repo] + stars[from_repo] - count_common_stars).
#   5. Keep the `num_best` largest with `heapq.nlargest`, cache and return.
#
# NOTE(review): this definition is collapsed onto one physical line, so Python
#   treats everything after the first '#' as a comment; the original line
#   breaks and indentation must be restored before it can run.
# NOTE(review): the `r.zrange` arguments and the inner loop header are
#   redacted ('******'); restore them from the original source rather than
#   guessing.
# NOTE(review): `has_key` exists only on Python 2 dicts.
# NOTE(review): the denominator is built from `stars.get(..., 0)` values; if
#   it ever equals zero the division raises ZeroDivisionError -- verify that
#   `stars` always covers the repos seen here.
def find_similar_repos_jaccard(from_repo_name, num_best = 100): # check cache if cache_find_similar_repos_jaccard.has_key((from_repo_name, num_best)): return cache_find_similar_repos_jaccard[(from_repo_name, num_best)] repo_dict = {} # num_star_of_from_repo = r.scard('repo:' + from_repo_name) num_star_of_from_repo = stars.get(from_repo_name, 0) users = r.smembers('repo:' + from_repo_name) for user in users: # get all repos the user starred user_starred_repos = r.zrange('user:'******'count_common_stars'] = 1 else: repo_dict[user_starred_repo]['count_common_stars'] += 1 repo_jaccard_dict = {} for to_repo in repo_dict: count_common_stars = repo_dict[to_repo]['count_common_stars'] # num_star_of_to_repo = r.scard('repo:' + to_repo) num_star_of_to_repo = stars.get(to_repo, 0) jaccard_similarity = 1.0 * count_common_stars / (num_star_of_to_repo + num_star_of_from_repo - count_common_stars) repo_dict[to_repo]['jaccard_similarity'] = jaccard_similarity repo_jaccard_dict[to_repo] = jaccard_similarity top_ranked_repos = heapq.nlargest(num_best, repo_jaccard_dict, key=repo_jaccard_dict.get) result_list = [] for to_repo in top_ranked_repos: result_list.append((to_repo, repo_dict[to_repo]['jaccard_similarity'])) # store result_list to cache cache_find_similar_repos_jaccard[(from_repo_name, num_best)] = result_list return result_list