def median_eval_setup(request):
    """
    Sets:
    * _data["user_ratings_test"]
    * _process_data["movie_medians_train"]
    """
    # data for worker server
    _data["user_ratings_test"] = movie_lens_data.get_input_obj(
        "user_ratings_test")

    # data for worker processors
    movie_medians_train = movie_lens_data.get_input_obj("movie_medians_train")
    _proc.send_same_data({"movie_medians_train": movie_medians_train})
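# A minimal sketch of the worker-processor side that the setup above feeds,
# assuming the median baseline predicts the training-set median for every
# test rating. The helper name "_median_eval_sketch", the chunk shape of
# (user_id, [(movie_id, rating), ...]) pairs, and "default_rating" are
# illustrative assumptions, not the actual worker protocol.
def _median_eval_sketch(user_ratings_test_chunk, default_rating=3.0):
    movie_medians = _process_data["movie_medians_train"]
    errors = []
    for _user_id, ratings in user_ratings_test_chunk:
        for movie_id, rating in ratings:
            # fall back to a default when the movie was not in the training set
            predicted = movie_medians.get(movie_id, default_rating)
            errors.append(abs(predicted - rating))
    return errors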
def als_eval_setup(request):
    """
    Sets:
    * _data["user_ratings_test"]
    * _process_data["movie_medians_train"]
    * _process_data["num_item_factors"]
    * _process_data["als_user_factors"]
    * _process_data["als_user_ids"]
    * _process_data["als_movie_factors"]
    * _process_data["als_movie_ids"]
    """
    factor = request["setup_param"]
    als_prefix = "als" + str(factor) + "_"

    # data for worker server
    _data["user_ratings_test"] = movie_lens_data.get_als_obj(
        als_prefix + "user_ratings_test")

    # data for worker processors
    movie_medians_train = movie_lens_data.get_input_obj("movie_medians_train")
    als_user_factors = movie_lens_data.get_als_obj(als_prefix + "user_factors")
    als_user_ids = movie_lens_data.get_als_obj(als_prefix + "user_ids")
    als_movie_factors = movie_lens_data.get_als_obj(als_prefix + "item_factors")
    als_movie_ids = movie_lens_data.get_als_obj(als_prefix + "movie_ids")

    _proc.send_same_data({
        "movie_medians_train": movie_medians_train,
        "num_item_factors": factor,
        "als_user_factors": als_user_factors,
        "als_user_ids": als_user_ids,
        "als_movie_factors": als_movie_factors,
        "als_movie_ids": als_movie_ids
    })
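# A minimal sketch of how the ALS factors shipped above could predict one
# rating: the estimate is the dot product of a user's factor row and a
# movie's factor row. "_als_predict_sketch" and its list-based id-to-row
# lookups are illustrative assumptions; only the _process_data keys come
# from the setup function above.
import numpy as np

def _als_predict_sketch(user_id, movie_id):
    user_row = _process_data["als_user_ids"].index(user_id)
    movie_row = _process_data["als_movie_ids"].index(movie_id)
    user_vec = _process_data["als_user_factors"][user_row]
    movie_vec = _process_data["als_movie_factors"][movie_row]
    # the rating estimate is the inner product of the two factor vectors
    return float(np.dot(user_vec, movie_vec))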
def tag_count_eval_setup(request):
    """
    Sets:
    * _data["user_ratings_train"]
    * _data["user_ratings_test"]
    * _process_data["movie_genres"]
    * _process_data["movie_tags"]
    * _process_data["tag_counts"]
    * _process_data["genre_counts"]
    * _process_data["movie_medians_train"]
    """
    # data for worker server
    _data["user_ratings_train"] = movie_lens_data.get_input_obj(
        "user_ratings_train")
    _data["user_ratings_test"] = movie_lens_data.get_input_obj(
        "user_ratings_test")

    # data for worker processors
    movie_genres = movie_lens_data.get_input_obj("movie_genres")
    movie_tags = movie_lens_data.get_input_obj("movie_tags")
    tag_counts = movie_lens_data.get_input_obj("tag_counts")
    genre_counts = movie_lens_data.get_input_obj("genre_counts")
    movie_medians_train = movie_lens_data.get_input_obj("movie_medians_train")

    _proc.send_same_data({
        "movie_genres": movie_genres,
        "movie_tags": movie_tags,
        "tag_counts": tag_counts,
        "genre_counts": genre_counts,
        "movie_medians_train": movie_medians_train
    })
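# A hedged sketch of how "tag_counts" and "genre_counts" could be derived
# from the per-movie inputs, assuming movie_tags maps movie_id to a list of
# tags and movie_genres maps movie_id to a list of genres. The helper name
# "_count_labels_sketch" is illustrative; the real pipeline builds these
# objects elsewhere.
from collections import Counter

def _count_labels_sketch(movie_labels):
    counts = Counter()
    for labels in movie_labels.values():
        counts.update(labels)
    return counts

# usage sketch:
#   tag_counts = _count_labels_sketch(movie_tags)
#   genre_counts = _count_labels_sketch(movie_genres)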
def build_similar_movies_db_setup(request):
    """
    Sets:
    * _process_data["buff_point"]
    * _process_data["buff_limit"]
    * _process_data["movie_ratings"]
    * _process_data["movie_genres"]
    """
    # collect the necessary data
    buff_point = request["setup_param"]["buff_point"]
    buff_limit = request["setup_param"]["buff_limit"]
    movie_ratings = movie_lens_data.get_input_obj("movie_ratings")
    movie_genres = movie_lens_data.get_input_obj("movie_genres")

    # send data to worker processors
    _proc.send_same_data({
        "buff_point": buff_point,
        "buff_limit": buff_limit,
        "movie_ratings": movie_ratings,
        "movie_genres": movie_genres
    })
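# A hedged sketch of the worker-side build step this setup enables: each
# worker reconstructs a SimilarMovieFinder from _process_data and computes
# similar movies for its assigned range of movie indices. The function name
# and the (index, movie_ids) result shape are assumptions; SimilarMovieFinder
# and find_similar_movie come from the tuning script elsewhere in this repo.
def _build_similar_movies_sketch(start_index, end_index):
    finder = SimilarMovieFinder(_process_data["movie_genres"],
                                _process_data["movie_ratings"])
    finder.buff_point = _process_data["buff_point"]
    finder.buff_limit = _process_data["buff_limit"]

    results = []
    for index in range(start_index, end_index):
        movie_ids, _ = finder.find_similar_movie(index)
        results.append((index, movie_ids))
    return results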
def main():
    # redirect output to file to avoid Unicode printing errors
    print('see output in "title_search_output.txt"')
    sys.stdout = open("title_search_output.txt", "w", encoding="utf-8")

    # process command line arguments
    overwrite = False
    for arg in sys.argv:
        if arg == "overwrite":
            overwrite = True

    # movie_titles is {movie_id: title}
    movie_lens_data.read_movies_csv(overwrite)
    movie_titles = movie_lens_data.get_input_obj("movie_titles")

    # get "index", either load it from disk, or rebuild it
    index_file_name = movie_lens_data.out_dir + "title_search_index.bin"
    config = IndexerConfig()

    if os.path.exists(index_file_name) and not overwrite:
        index = Index(config, index_file_name)
    else:
        # build an "Index" object using "movie_titles"
        index = Index(config, None)
        index.build(movie_titles, index_file_name)

    # print properties of the index
    index.print_frequent_tokens(100)
    index.print_frequent_bigrams(50)
    index.print_non_alpha_num_words()
    index.print_words_with_ending("s")
    index.print_words_with_ending("ing")
    index.print_words_with_ending("ion")

    # print some searches with the index
    search(index, "star war", movie_titles)     # should match "star wars"
    search(index, "star trek 2", movie_titles)  # should match "star trek ii"
    search(index, "battle of gods dragon ball", movie_titles)  # prioritize "dragon ball" movies
    search(index, "ad", movie_titles)           # should match "a.d"
    search(index, "shield", movie_titles)       # should match "S.H.I.E.L.D."
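# The "search" helper used above is defined elsewhere in this script; below
# is a minimal sketch of what such a title search could look like, assuming
# the Index exposes a "lookup(token)" method that returns matching movie ids.
# Both "lookup" and the score-by-token-overlap ranking are assumptions for
# illustration only, not the repo's actual search implementation.
def _search_sketch(index, query, movie_titles, max_results=10):
    scores = {}
    for token in query.lower().split():
        for movie_id in index.lookup(token):
            # one point per query token that matches the title
            scores[movie_id] = scores.get(movie_id, 0) + 1

    ranked = sorted(scores, key=scores.get, reverse=True)
    print('Search results for "' + query + '":')
    for movie_id in ranked[:max_results]:
        print("   ", movie_titles[movie_id])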
def main():
    length = movie_lens_data.get_input_obj("user_ratings_test_length")

    # Sends a "tag_ls_eval" command to cluster nodes.
    if cluster.cluster_info is None:
        print("Cannot connect to cluster.")
        return

    cluster.send_command({
        "op": "distribute",
        "worker_op": "tag_ls_eval",
        "length": length
    })
    cluster.wait_for_completion()
    cluster.print_status()
    print()

    file_name = "tag_ls_eval_results.bin"
    results = cluster.merge_list_results(file_name)

    user_ids, agreements = zip(*results)
    my_util.print_rank_agreement_results(agreements, "tag least squares")
    print('\a')
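# A hedged sketch of the per-user rank agreement that
# "my_util.print_rank_agreement_results" presumably summarizes: the fraction
# of movie pairs whose predicted ordering matches the user's actual rating
# ordering. The exact definition in my_util may differ; this version, which
# skips ties in the actual ratings, is illustrative only.
from itertools import combinations

def _rank_agreement_sketch(predicted, actual):
    """predicted, actual: equal-length lists of ratings for one user."""
    concordant = 0
    total = 0
    for i, j in combinations(range(len(actual)), 2):
        if actual[i] == actual[j]:
            continue  # a tie gives no ordering to agree with
        total += 1
        if (predicted[i] - predicted[j]) * (actual[i] - actual[j]) > 0:
            concordant += 1
    return concordant / total if total else 0.0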
def main():
    # process command line arguments
    overwrite = False
    cpu_count = None
    for arg in sys.argv:
        if arg == "overwrite":
            overwrite = True
        elif arg.startswith("cpu_count="):
            cpu_count = int(arg.split(sep='=')[1])

    similar_movies_file_name = config.out_dir + "similar_movies.bin"

    # exit if object already exists
    if not overwrite and os.path.exists(similar_movies_file_name):
        print(similar_movies_file_name, "already exists")
        return

    movie_lens_data.start_time = time.time()

    # The SimilarMovieFinder class is tuned using:
    #   movie id 1196 - Star Wars: Episode V - The Empire Strikes Back
    #   movie id 1210 - Star Wars: Episode VI - Return of the Jedi
    movie_id1 = 1196
    movie_id2 = 1210

    # check movie_id titles - to be sure that the movie_id is still valid
    movie_lens_data.read_movies_csv()
    movie_titles = movie_lens_data.get_input_obj("movie_titles")

    if movie_titles[movie_id1].lower().find("empire strikes back") < 0:
        print('Movie ID', movie_id1, 'no longer "empire strikes back",',
              "cannot continue")
        return

    if movie_titles[movie_id2].lower().find("return of the jedi") < 0:
        print('Movie ID', movie_id2, 'no longer "return of the jedi",',
              "cannot continue")
        return

    # create SimilarMovieFinder and tune
    movie_ratings = movie_lens_data.create_movie_ratings(overwrite)
    movie_genres = movie_lens_data.get_input_obj("movie_genres")

    movie_finder = SimilarMovieFinder(movie_genres, movie_ratings)
    movie_finder.tune(movie_id1, movie_id2, 2, 20)
    print("SimilarMovieFinder tuning results in buff_point =",
          movie_finder.buff_point, "buff_limit =", movie_finder.buff_limit)

    print()
    print("Movies similar to \"" + movie_titles[movie_id1] + "\":")
    movie_id1_index = movie_finder.find_movie_index(movie_id1)
    movie_ids, _ = movie_finder.find_similar_movie(movie_id1_index)
    for movie_id in movie_ids:
        print(movie_titles[movie_id])

    # build database of similar movies
    print()
    print("Starting to build database of similar movies")

    if cluster.cluster_info is None:
        build_locally(movie_genres, movie_ratings, movie_finder.buff_point,
                      movie_finder.buff_limit, similar_movies_file_name,
                      cpu_count)
    else:
        build_with_cluster(movie_finder.buff_point, movie_finder.buff_limit,
                           len(movie_ratings), "similar_movies.bin")
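# "build_locally" is defined elsewhere in this script; below is a minimal
# sketch of one way to implement it with multiprocessing.Pool, using a pool
# initializer so each worker builds its SimilarMovieFinder once. The sketch
# names, the per-index map, and the {index: movie_ids} pickle format are
# assumptions for illustration.
import multiprocessing
import pickle

_finder = None

def _init_worker_sketch(movie_genres, movie_ratings, buff_point, buff_limit):
    global _finder
    _finder = SimilarMovieFinder(movie_genres, movie_ratings)
    _finder.buff_point = buff_point
    _finder.buff_limit = buff_limit

def _find_similar_sketch(index):
    movie_ids, _ = _finder.find_similar_movie(index)
    return index, movie_ids

def _build_locally_sketch(movie_genres, movie_ratings, buff_point,
                          buff_limit, file_name, cpu_count):
    with multiprocessing.Pool(
            cpu_count, _init_worker_sketch,
            (movie_genres, movie_ratings, buff_point, buff_limit)) as pool:
        results = dict(pool.map(_find_similar_sketch,
                                range(len(movie_ratings))))
    with open(file_name, mode="wb") as file:
        pickle.dump(results, file)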
def __init__(self):
    # Load the precomputed similar-movies database and the movie titles
    # needed to display results.
    self.similar_movies = movie_lens_data.get_output_obj("similar_movies")

    movie_lens_data.read_movies_csv()
    self.movie_titles = movie_lens_data.get_input_obj("movie_titles")
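# A hedged sketch of a companion lookup method for this class, assuming
# "similar_movies" maps a movie_id to a list of similar movie ids. The
# method name "print_similar" is illustrative, not part of the repo.
def print_similar(self, movie_id):
    print('Movies similar to "' + self.movie_titles[movie_id] + '":')
    for similar_id in self.similar_movies[movie_id]:
        print("   ", self.movie_titles[similar_id])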
def main():
    # process command line arguments
    overwrite = False
    training_set_ratio = 0.8
    cpu_count = None
    als_thread_count = None
    algorithm = 1

    for arg in sys.argv:
        if arg == "overwrite":
            overwrite = True
        elif arg.startswith("training_set_ratio="):
            training_set_ratio = float(arg.split(sep='=')[1])
        elif arg.startswith("cpu_count="):
            cpu_count = int(arg.split(sep='=')[1])
        elif arg.startswith("als_thread_count="):
            als_thread_count = int(arg.split(sep='=')[1])
        elif arg.startswith("algorithm="):
            algorithm = int(arg.split(sep='=')[1])

    # Start extra processes and shrink the data set to meet ALS factor
    # requirements.
    _proc.start_processes(cpu_count)
    als_factors_list = [3, 5, 7, 9, 11]

    user_ratings = movie_lens_data.create_user_ratings(overwrite)

    if training_set_ratio >= 1:
        # Use the whole data set as training set.
        # "movie_lens_data.als_data_set_shrink_mp(...)" will assume
        # "user_ratings_train" and "user_ratings_test" to be in process memory.
        _proc.split_list_and_send(user_ratings, "user_ratings_train")
        _proc.send_same_data({"user_ratings_test": None})

        # compute movie medians, save to disk as "movie_medians_full.bin"
        median_file_name = movie_lens_data.in_dir + os.sep + "movie_medians_full.bin"

        if os.path.exists(median_file_name) and not overwrite:
            movie_medians = movie_lens_data.get_input_obj("movie_medians_full")
        else:
            movie_ratings = movie_lens_data.create_movie_ratings(overwrite)

            print(" Computing movie medians for the full data set")
            _proc.split_list_and_send(movie_ratings, "movie_ratings")
            _proc.run_function("_compute_medians2", {})
            movie_medians = _proc.update_var_into_dict("movie_medians")

            print(" Saving", median_file_name)
            with open(median_file_name, mode="wb") as file:
                pickle.dump(movie_medians, file)

        movie_lens_data.als_data_set_shrink_mp(movie_medians, als_factors_list,
                                               no_test_set=True)
    else:
        # training_set_ratio < 1
        movie_medians_train = movie_lens_data.refresh_training_sets_mp(
            user_ratings, training_set_ratio)
        movie_lens_data.als_data_set_shrink_mp(movie_medians_train,
                                               als_factors_list)

    _proc.end_processes()
    gc.collect()

    movie_lens_data.als_train(als_factors_list, als_thread_count, algorithm)

    # print run time
    run_time = int(time.time() - movie_lens_data.start_time)
    print("Total run time", datetime.timedelta(seconds=run_time))
    print('\a')
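# "_compute_medians2" runs inside the worker processes started above; below
# is a minimal sketch of what it plausibly computes, assuming each worker
# holds a "movie_ratings" chunk of (movie_id, [rating, ...]) pairs and
# publishes a {movie_id: median} dict back as "movie_medians". The data
# shapes and helper name are assumptions, not the repo's actual code.
import statistics

def _compute_medians_sketch(movie_ratings_chunk):
    movie_medians = {}
    for movie_id, ratings in movie_ratings_chunk:
        movie_medians[movie_id] = statistics.median(ratings)
    return movie_medians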