def main():
    """Build the restaurant cosine-similarity matrix, drop restaurants that
    would produce null chi-square values, and pickle the filtered dict.

    Side effects: reads every file under ../data/yelp_users/ via
    readUserReviews (which accumulates module-level state) and writes
    cosSimilarityNoNullRestaurants.pkl in the current directory.
    """
    print("Initializing Friend Graph...")
    friend_graph = init_friend_graph('../data/yelpFriends.txt')
    user_data_directory = os.getcwd() + "/../data/yelp_users/"
    print("Reading reviews from each user...")
    for user in os.listdir(user_data_directory):
        # Reuse the already-built directory path instead of recomputing
        # os.getcwd() + "/../data/yelp_users/" on every iteration.
        readUserReviews(user_data_directory + user)
    restaurant_dict = generate_cosine_similarity_matrix()
    print("initial length of restaurant dictionary: " + str(len(restaurant_dict)))
    # filter out restaurants that will lead to null chi square values
    print("Filtering out null chi square value restaurants...")
    filtered_restaurants = {k: v for (k, v) in restaurant_dict.items()
                            if not is_chi_square_null(v, friend_graph)}
    print("After filtering out null chi square restaurants: " + str(len(filtered_restaurants)))
    print("Generating pickle object...")
    # Context manager guarantees the file is closed even if dump() raises
    # (the original left the handle open on an exception).
    with open("cosSimilarityNoNullRestaurants.pkl", "wb") as pickle_object:
        pickle.dump(filtered_restaurants, pickle_object)
    print("Done. Quitting...")
def main():
    """Keep only restaurants whose rating subgraph is non-null and dump
    the surviving restaurant->ratings map as JSON.

    Writes ../data/equalRatingNoNullRestaurants.txt relative to the
    current working directory.
    """
    rest_user_ratings_map = read_restaurant_by_user_ratings()
    friend_graph = init_friend_graph('../data/yelpFriends.txt')
    # Build the filtered map with an explicit loop rather than a comprehension.
    filtered_restaurants = {}
    for rest_id, ratings in rest_user_ratings_map.items():
        if not isSubgraphNull(ratings, friend_graph):
            filtered_restaurants[rest_id] = ratings
    print(len(filtered_restaurants))
    restaurant_file = os.getcwd() + "/../data/equalRatingNoNullRestaurants.txt"
    with open(restaurant_file, 'w') as f:
        json.dump(filtered_restaurants, f)
def main():
    """Permutation test: compare the real friend graph's total chi-square
    statistic against a null distribution of sums from randomly permuted
    graphs, and print the real value's percentile in that distribution.

    Prints every random sum, the real sum, and finally the percentile
    (1, i.e. 100th percentile, when the real sum exceeds all random sums).
    """
    rest_user_reviews_map = read_restaurant_by_user_reviews()
    friend_graph = init_friend_graph('../data/yelpFriends.txt')
    # A list of lists, where each list corresponds to a restaurant subgraph.
    # Each list contains RAND_NUM chi sq values generated from permuting the graph.
    random_chisq_values = []
    # Running sum of the chi sq values for each restaurant of the real graph.
    real_chisq_sum = 0
    for user_reviews in rest_user_reviews_map.values():
        combinations = get_user_combos_from_ratings_map(user_reviews['users'])
        # Null-distribution samples for this restaurant's subgraph.
        random_chisq_values.append(
            gen_random_chisq_vals(user_reviews, friend_graph, combinations))
        # Chi-square contribution of the actual (unpermuted) graph.
        real_chisq_sum += calc_chi_sq(user_reviews['users'],
                                      user_reviews['cos_matrix'],
                                      friend_graph, combinations)
    # Sum across restaurants for each of the RAND_NUM random trials, sorted
    # ascending so the percentile scan below works.
    sum_rand_chisq = sorted(sum(trial) for trial in zip(*random_chisq_values))
    for num in sum_rand_chisq:
        print(str(num))
    print(real_chisq_sum)
    # Percentile of the real sum within the random distribution: fraction of
    # random sums strictly below it at the first exceeding index.
    percentile = 1
    for i, rand_sum in enumerate(sum_rand_chisq):
        if rand_sum > real_chisq_sum:
            percentile = i / len(sum_rand_chisq)
            break
    print("Percentile: " + str(percentile))
def main():
    """Generate two edge-list files: one for the friend graph alone and one
    combining friend edges with word-user edges.

    Each output file starts with a "<node count> <edge count>" header line
    followed by one "<src> <dst>" pair per line.
    """
    print("Initializing friend graph...")
    friend_graph = init_friend_graph('../data/yelpFriends.txt')
    friend_edges, user_to_index, index_to_user = friend_edge_list(friend_graph)
    # iterate through users and read in reviews
    user_data_directory = os.getcwd() + "/../data/yelp_users/"
    print("Reading reviews from each user...")
    for user in os.listdir(user_data_directory):
        readUserReviews(os.getcwd() + "/../data/yelp_users/" + user)
    print("Generating word matrix...")
    word_labels, user_word_matrix = generate_user_word_matrix(index_to_user)
    print(len(word_labels))
    print(word_labels)
    users_size = len(user_to_index)
    friend_edges_size = len(friend_edges)
    word_edges, word_to_index = word_user_edge_list(word_labels, user_word_matrix)
    print("Writing edge lists...")
    # Friend-only edge list.
    with open('../data/friend_edge_list.txt', 'w') as f:
        f.write("{} {}\n".format(users_size, friend_edges_size))
        for edge in friend_edges:
            f.write("{} {}\n".format(edge[0], edge[1]))
    # Combined friend + word-user edge list.
    all_edges = friend_edges + word_edges
    nodes_size = users_size + len(word_to_index)
    with open('../data/friend_word_edge_list.txt', 'w') as f:
        f.write("{} {}\n".format(nodes_size, len(all_edges)))
        for edge in all_edges:
            f.write("{} {}\n".format(edge[0], edge[1]))
def main():
    """Build a boolean friend adjacency matrix and the user-word matrix,
    then pickle both together with their label lists for the autoencoder.

    Side effects: reads every file under ../data/yelp_users/ via
    readUserReviews and writes ../data/autoencoder_sim_matrices.pkl.
    """
    print("Initializing Friend Graph...")
    friend_graph = init_friend_graph('../data/yelpFriends.txt')
    index_to_user = list(friend_graph.keys())
    user_to_index = {user: i for i, user in enumerate(index_to_user)}
    V = len(index_to_user)
    friend_matrix = np.full((V, V), False)
    for user1, friend_list in friend_graph.items():
        for user2 in friend_list:
            idx1 = user_to_index.get(user1)
            idx2 = user_to_index.get(user2)
            if idx1 is not None and idx2 is not None:
                # Friendship is mutual, so mark both directions.
                # BUG FIX: the original wrote friend_matrix[idx2, idx2]
                # (the diagonal) instead of the symmetric entry [idx2, idx1].
                friend_matrix[idx1, idx2] = True
                friend_matrix[idx2, idx1] = True
    user_data_directory = os.getcwd() + "/../data/yelp_users/"
    print("Reading reviews from each user...")
    for user in os.listdir(user_data_directory):
        # Reuse the directory path instead of rebuilding it each iteration.
        readUserReviews(user_data_directory + user)
    word_labels, user_word_matrix = generate_user_word_matrix(index_to_user)
    print(user_word_matrix.shape)
    result = {
        'user_labels': index_to_user,
        'word_labels': word_labels,
        'friend_matrix': friend_matrix,
        'user_word_matrix': user_word_matrix,
    }
    print("Generating pickle object...")
    # Context manager closes the handle even if pickle.dump raises.
    with open("../data/autoencoder_sim_matrices.pkl", "wb") as pickle_object:
        pickle.dump(result, pickle_object)
    print("Done. Quitting...")