def make_list_of_games_to_scrape():
    '''
    Build the list of games that still need to be scraped.

    Starts from the full app list supplied by GameIndexer and removes every
    entry reported by get_completed_games(). The size of the list is printed
    before and after the removal. The list is modified in place and returned.
    '''
    remaining_games = GameIndexer().return_list_of_all_apps()
    finished_games = get_completed_games()

    print("")
    print("{} {}".format("STARTING SIZE OF PENDING GAME LIST:",
                         len(remaining_games)))
    print("")

    for finished in finished_games:
        # A completed game is occasionally missing from the full list, so
        # only remove it when it is really there (first occurrence only).
        if finished in remaining_games:
            remaining_games.remove(finished)

    print("")
    print("{} {}".format("SCRUBBED SIZE OF PENDING GAME LIST:",
                         len(remaining_games)))
    print("")

    return remaining_games
def scrape_to_db(collection, app_id_list, count):
    '''
    Attempt to scrape <count> reviews from each game in <app_id_list> and
    then try to add the resulting dictionary to the provided <collection>.

    A failure on one app does not stop the run: the error is printed,
    appended to ERROR_selenium_game_review_scrape.txt, and the loop moves
    on to the next app.

    collection  -- target collection handed straight to insert()
    app_id_list -- iterable of app ids to scrape
    count       -- number of reviews requested from get_game_reviews()
    '''
    # used to resolve each app id to its title so the stored record
    # carries more data
    _gameindexer = GameIndexer()

    for app_id in app_id_list:
        # one broken page must not abort the whole scrape, so every app is
        # handled inside its own try block
        try:
            title = _gameindexer.return_game_title(app_id)

            # honour the caller's <count> (this used to be hard-coded to 1150)
            game_results = get_game_reviews(app_id, count, title)

            insert(collection, game_results, "app_id")
        except Exception as e:
            error = ("############################ Exception {} occurred! \n "
                     "############################ Scrape of {} failed"
                     ).format(e, app_id)
            print(error)

            # append so that earlier failures are kept; mode "w" would leave
            # only the most recent error in the file
            with open("ERROR_selenium_game_review_scrape.txt", "a") as _file:
                _file.write(error + "\n")
def print_filtered_predictions(self, train, test, num_items=10):
    '''
    Print the ranked predictions, leaving out anything from the train set.

    Remove the matches that are also in the train set
    Add a HIT marker to predictions that match the test set
    Limit printing to num_items lines

    train, test -- objects whose "appind" entry is an iterable of app
                   indices (presumably pandas DataFrames; verify against
                   the caller). They are only read, never modified.
    num_items   -- stop after this many prediction lines have been printed
    '''
    # used for resolving appind to title names
    lookup = GameIndexer()

    # Read "appind" with [] instead of pop(): pop() removed the column from
    # the caller's object, so a second call with the same inputs raised
    # KeyError. Sets keep the membership tests below cheap.
    train_apps = {int(x) for x in train["appind"]}
    test_apps = {int(x) for x in test["appind"]}

    # keep track of how many lines have been printed
    printed = 0

    for idx, result in enumerate(self.sorted_predictions):
        appind = int(result[1])

        if idx == 0:
            print("Rank: Prediction: Appind: Title:")

        # there's no point in printing things from the train set!
        if appind in train_apps:
            continue

        # only resolve the title for rows that are actually printed
        title = lookup.game_index_to_title(appind, 40)

        # make a match on the test set more obvious
        hit = "<--- HIT *****" if appind in test_apps else ""

        printed += 1
        print("{:2d} {:2.2f} {:5d} {:<40} {}".format(
            idx + 1, result[0], appind, title, hit))

        # make sure that we don't print more than we want
        if printed == num_items:
            break
def print_sorted_predictions(self):
    """
    Dump every entry of self.sorted_predictions for quick inspection.

    Each line shows the 1-based rank, the prediction value, the app index
    and the title resolved through GameIndexer. The column header is
    printed just before the first row, so nothing is printed when there
    are no predictions.
    """
    titles = GameIndexer()
    row_template = "{:2d} {:2.2f} {:5d} {}"

    for rank, prediction in enumerate(self.sorted_predictions, 1):
        appind = int(prediction[1])
        title = titles.game_index_to_title(appind, 40)

        if rank == 1:
            print("Rank: Prediction: Appind: Title:")

        print(row_template.format(rank, prediction[0], appind, title))
def update_completed_games(app_id):
    '''
    Record <app_id> as finished so it is not scraped a second time.

    The id is appended as its own line to games_we_have.txt, then the
    number of games still outstanding (all apps minus completed games) is
    printed.
    '''
    with open("games_we_have.txt", "a") as log_file:
        log_file.write(app_id + "\n")

    indexer = GameIndexer()
    total_games = len(indexer.return_list_of_all_apps())
    finished_games = len(get_completed_games())
    remaining = total_games - finished_games

    print("added {} to games_we_have.txt with {} remaining.".format(
        app_id, remaining))
def load_game_reviews_into_table(collection): ''' Spark seems to ingest data via a big list and goes from there so make a dataframe that looks like user | app_id | rating (positive) ''' start_time = time.time() game_avgs = load_pandas_df("app_means_v3.csv") user_avgs = load_pandas_df("user_avgs.csv") ############################################################## ## Build dictionary to try to speed up lookups of weights #### ############################################################## # make dictionaries for different weights w_s1_dict = load_weights_to_dict("weights_s1", game_avgs) w_s2_dict = load_weights_to_dict("weights_s2", game_avgs) w_s3_dict = load_weights_to_dict("weights_s3", game_avgs) game_avg_dict = load_weights_to_dict("avg_log_min", game_avgs) # user_id : avg_playtime_log_m user_avg_dict = load_user_avgs_to_dict("avg_playtime_log_m", user_avgs) user_lookup_table = {} user_reverse_lookup_table = {} # get a GameIndexer ready for lookups indexer = GameIndexer() num_users = collection.find().count() # list to hold dictionaries before conversion to df data = [] for idx, user in enumerate(collection.find()): # keep track of users with reviews because the rest of # the users we have to go back and give 0's to #temp_user_list = [] # if idx > 10: # break _user = idx user_lookup_table[idx] = user["user"] user_reverse_lookup_table[user["user"]] = idx # try to keep track of time some _t = time.time() - start_time _ts = "{:2.2f}".format(_t)[:6] # completed in 46s with mod to reduce printing # even without the mod check it was 46s so no savings #if idx % 100 == 0: print "{}s ### {}th user of {} ###### \r".format(_ts, idx, num_users), for idy, playtime in enumerate(user["data"]): # if idy > 1000: # break _appid = indexer.app_id_to_game_index(int(playtime["appid"])) #get weighting of app from game_avgs dataframe. 
# get weighting of a certain app # pull the weight from the game_avgs dataframe #result = game_avgs[game_avgs["app_id"] == _appid]["weights_s1"] try: weight_s1 = w_s1_dict[_appid] weight_s2 = w_s2_dict[_appid] weight_s3 = w_s3_dict[_appid] #import pdb; pdb.set_trace() except Exception, e: #print "Item not in dictionary {} {} {} ".format(e, repr(_appid), type(_appid)) weight_s1 = 0.0 weight_s1 = 0.0 weight_s1 = 0.0 # if len(result) > 0: # weight = result.values[0] # # if weight >= 0: # # if weight < 1: # # print "weight seems good", weight # # elif weight < 0: # # print "############## Error, seems like it didn't match {} correctly".format(repr(_appid)) # # else: # # print "##############{} Error, seems like it didn't match {} correctly".format(weight, repr(_appid)) # else: # weight = 0.0 # if the weight is below zero then the game probably doesn't have any plays # (ie no data) # potentially modify this to log time because then the # distribution is normal # Goodnight sweet prince, going to log10 time now # _playtime_m = int(review["playtime_forever"]) _log_playtime_m = int(playtime["playtime_forever"]) if _log_playtime_m > 1: _log_playtime_m = np.log10(_log_playtime_m + 0.0001) else: _log_playtime_m = 0 _lpm_b0s1 = _log_playtime_m * weight_s1 _lpm_b0s2 = _log_playtime_m * weight_s2 _lpm_b0s3 = _log_playtime_m * weight_s3 # modify _log_playtime_m by the weighting of the app to # compensate for different app biases (ie low user count/high playtime) # or very high user counts _log_playtime_m data.append({ "appind": _appid, "user": _user, "lpm_b0_s0": _log_playtime_m, "lpm_b0_s1": _lpm_b0s1, "lpm_b0_s2": _lpm_b0s2, "lpm_b0_s3": _lpm_b0s3 })