def run_scrape_function_with_retries(scrape_function, date, max_attempts=3):
    """Call ``scrape_function(date)`` until it returns a final status or attempts run out.

    The statuses 200, 404 and 'leaderboard updated' are treated as final and
    end the loop immediately. Any other status triggers another attempt, up
    to ``max_attempts`` calls in total.

    Args:
        scrape_function: Callable taking ``date`` and returning a status.
        date: Value passed unchanged to ``scrape_function`` on every attempt.
        max_attempts: Maximum number of calls to make; must be at least 1.
            Defaults to 3, the limit that used to be hard-coded.

    Returns:
        The status returned by the last call to ``scrape_function``.

    Raises:
        ValueError: If ``max_attempts`` is less than 1.
    """
    if max_attempts < 1:
        # At least one call is needed to have a status to return.
        raise ValueError('max_attempts must be at least 1')

    status = None
    for attempt in range(1, max_attempts + 1):
        status = scrape_function(date)
        if status == 200:
            log.info('successful')
            break
        if status == 404:
            log.info('file not found')
            break
        if status == 'leaderboard updated':
            log.warning('the leaderboard was updated after this script was started, so re-run this script')
            break
        # Any other status is considered retryable.
        if attempt < max_attempts:
            log.info('Status was %s, retrying', status)
        else:
            log.error('reached %d attempts, aborting', max_attempts)
    return status
def _add_leaderboard_entry(nickname_to_entry, nickname, entry, date):
    """Store ``entry`` under ``nickname`` unless that nickname already has one.

    The first entry seen for a nickname wins; a later duplicate is only logged.
    """
    if nickname in nickname_to_entry:
        log.info('normed nickname %s already exists for %s', nickname, date)
    else:
        nickname_to_entry[nickname] = entry


def main():
    """Load new leaderboard snapshots into the ``leaderboard_history`` collection.

    Scans ``static/leaderboard/`` for ``YYYY-MM-DD.html.bz2`` files in sorted
    order, skipping dates reported by ``utils.get_bad_leaderboard_dates()``
    and dates not later than the ``last_date`` stored in the ``scanner``
    collection. Each remaining file is decompressed, parsed with both the
    iso and the goko row patterns, and every matched player gets an entry
    ``[date, skill_mean, skill_error, rank, eligible_games_played]`` pushed
    onto the ``history`` list of the document keyed by nickname.

    Processing stops at the first file whose parse looks wrong (no matches,
    or the match count differs from the last matched rank). The last
    successfully processed date is then written back to the ``scanner``
    collection.
    """
    filename_pattern = re.compile(r'^(?P<date>\d\d\d\d-\d\d-\d\d)\.html\.bz2$')
    iso_leaderboard_pattern = re.compile(
        r'<td>(?P<skill_mean>-?\d+\.\d+) ± '
        r'(?P<skill_error>-?\d+\.\d+)</td><td class=c2>'
        r'(?P<rank>\d+)</td><td class=c>'
        r'(?P<eligible_games_played>\d+)</td><td>'
        r'(?P<nickname>[^<]*) <')
    goko_leaderboard_pattern = re.compile(
        r'\s+<td class="leaders-table-item table-item-rank">(?P<rank>\d+)</td>\s*\n'
        r'\s*<td class="leaders-table-item table-item-name"><img [^>]*>(?P<nickname>.*)</td>\s*\n'
        r'\s*<td class="leaders-table-item table-item-points">(?P<skill_mean>\d+)</td>')

    database = utils.get_mongo_database()
    history_collection = database.leaderboard_history
    scanner_collection = database.scanner

    db_val = scanner_collection.find_one({'_id': 'leaderboard_history'})
    # With no stored checkpoint, start below every real date string.
    last_date = db_val['last_date'] if db_val else '0000-00-00'

    directory = 'static/leaderboard/'
    filenames = sorted(os.listdir(directory))

    bad_leaderboard_dates = utils.get_bad_leaderboard_dates()

    for filename in filenames:
        match = filename_pattern.search(filename)
        if not match:
            continue

        date = match.group('date')

        if date in bad_leaderboard_dates:
            # don't load data from when the leaderboard was messed up
            log.warning("Skipping %s because the leaderboard was messed up", date)
            continue

        if date <= last_date:
            log.warning("Date %s is less than last date %s", date, last_date)
            continue

        log.info('Processing %s', date)

        # Close the handle even if reading or decoding fails.
        file_obj = bz2.BZ2File(directory + filename)
        try:
            content = file_obj.read().decode('utf-8')
        finally:
            file_obj.close()

        nickname_to_entry = {}
        num_matches = 0
        last_rank = -1

        for match in iso_leaderboard_pattern.finditer(content):
            num_matches += 1
            rank = int(match.group('rank'))
            entry = [date,
                     float(match.group('skill_mean')),
                     float(match.group('skill_error')),
                     rank,
                     int(match.group('eligible_games_played'))]
            normed_nickname = name_merger.norm_name(match.group('nickname'))
            _add_leaderboard_entry(nickname_to_entry, normed_nickname, entry, date)
            last_rank = rank

        for match in goko_leaderboard_pattern.finditer(content):
            num_matches += 1
            rank = int(match.group('rank'))
            # The goko pattern captures no error or games-played figures, so
            # both are stored as 0; the nickname is used exactly as captured.
            entry = [date, float(match.group('skill_mean')), 0, rank, 0]
            _add_leaderboard_entry(nickname_to_entry, match.group('nickname'), entry, date)
            last_rank = rank

        log.info('%d entries matched', num_matches)

        if num_matches == 0:
            log.error('No entries found, so the regex is probably not doing its job anymore.')
            break

        # The last matched rank should equal the number of rows matched.
        if num_matches != last_rank:
            log.error('ERROR: # entries does not match last rank, so the regex is probably not doing its job anymore.')
            break

        # .items() works on both Python 2 and Python 3 dicts.
        for nickname, data in nickname_to_entry.items():
            history_collection.update({'_id': nickname}, {'$push': {'history': data}}, upsert=True)

        log.info('%d player histories updated', len(nickname_to_entry))

        last_date = date

    scanner_collection.update({'_id': 'leaderboard_history'}, {'$set': {'last_date': last_date}}, upsert=True)
def main(args):
    """Run the goal checkers over scanned games and save results to ``db.goals``.

    When ``args.goals`` is non-empty, only those goals are rechecked: every
    name must be a key of ``goal_check_funcs`` (otherwise the process exits
    with -1), a freshly reset 'subgoals' scanner is used, and the run stops
    after the game whose id equals the 'goals' scanner's max game id.
    Otherwise all goals are checked with the 'goals' scanner.

    Unless ``args.incremental`` is set, the scanner is reset and the output
    collection is emptied first. A non-negative ``args.max_games`` caps the
    number of games processed.
    """
    db = utils.get_mongo_database()
    games = db.games
    goal_store = db.goals
    tallies = collections.defaultdict(int)

    if args.goals:
        unknown = [name for name in args.goals if name not in goal_check_funcs]
        for name in unknown:
            log.error("Unrecognized goal name '%s'", name)
        if unknown:
            exit(-1)

        selected = args.goals

        scanner = incremental_scanner.IncrementalScanner('subgoals', db)
        scanner.reset()
        stop_after = incremental_scanner.IncrementalScanner('goals', db).get_max_game_id()
    else:
        selected = None
        scanner = incremental_scanner.IncrementalScanner('goals', db)
        stop_after = None

    if not args.incremental:
        scanner.reset()
        goal_store.remove()
    goal_store.ensure_index('goals.player')

    log.info("Starting run: %s", scanner.status_msg())

    total_checked = 0
    scanned = utils.progress_meter(scanner.scan(games, {}))
    for total_checked, raw_game in enumerate(scanned, 1):
        parsed = game.Game(raw_game)
        game_id = parsed.get_id()

        # Reuse the stored record for this game, or start an empty one.
        record = goal_store.find_one({'_id': game_id})
        if record is None:
            record = collections.defaultdict(dict)
            record['_id'] = game_id
            record['goals'] = []

        # On a recheck, drop stale results for the selected goals in place.
        if selected is not None:
            stored = record['goals']
            stored[:] = [old for old in stored if old['goal_name'] not in selected]

        # Append the freshly computed goals and count them per goal name.
        for found in check_goals(parsed, selected):
            record['goals'].append(found)
            tallies[found['goal_name']] += 1

        goal_store.save(dict(record))

        reached_last = stop_after and game_id == stop_after
        if reached_last or (args.max_games >= 0 and total_checked >= args.max_games):
            break

    log.info("Ending run: %s", scanner.status_msg())
    scanner.save()
    print_totals(tallies, total_checked)
def main():
    """Load new leaderboard snapshots into the ``leaderboard_history`` collection.

    Scans ``static/leaderboard/`` for ``YYYY-MM-DD.html.bz2`` files in sorted
    order, skipping dates reported by ``utils.get_bad_leaderboard_dates()``
    and dates not later than the ``last_date`` stored in the ``scanner``
    collection. Each remaining file is decompressed and parsed, and every
    matched player gets an entry
    ``[date, skill_mean, skill_error, rank, eligible_games_played]`` pushed
    onto the ``history`` list of the document keyed by normalized nickname.

    Processing stops at the first file whose parse looks wrong (no matches,
    or the match count differs from the last matched rank). The last
    successfully processed date is then written back to the ``scanner``
    collection.
    """
    filename_pattern = re.compile(r'^(?P<date>\d\d\d\d-\d\d-\d\d)\.html\.bz2$')
    leaderboard_pattern = re.compile(
        r'<td>(?P<skill_mean>-?\d+\.\d+) ± '
        r'(?P<skill_error>-?\d+\.\d+)</td><td class=c2>'
        r'(?P<rank>\d+)</td><td class=c>'
        r'(?P<eligible_games_played>\d+)</td><td>'
        r'(?P<nickname>[^<]*) <')

    database = utils.get_mongo_database()
    history_collection = database.leaderboard_history
    scanner_collection = database.scanner

    db_val = scanner_collection.find_one({'_id': 'leaderboard_history'})
    # With no stored checkpoint, start below every real date string.
    last_date = db_val['last_date'] if db_val else '0000-00-00'

    directory = 'static/leaderboard/'
    filenames = sorted(os.listdir(directory))

    bad_leaderboard_dates = utils.get_bad_leaderboard_dates()

    for filename in filenames:
        match = filename_pattern.search(filename)
        if not match:
            continue

        date = match.group('date')

        if date in bad_leaderboard_dates:
            # don't load data from when the leaderboard was messed up
            log.warning("Skipping %s because the leaderboard was messed up", date)
            continue

        if date <= last_date:
            log.warning("Date %s is less than last date %s", date, last_date)
            continue

        log.info('Processing %s', date)

        # Close the handle even if reading or decoding fails.
        file_obj = bz2.BZ2File(directory + filename)
        try:
            content = file_obj.read().decode('utf-8')
        finally:
            file_obj.close()

        nickname_to_entry = {}
        num_matches = 0
        last_rank = -1

        for match in leaderboard_pattern.finditer(content):
            num_matches += 1
            skill_mean = float(match.group('skill_mean'))
            skill_error = float(match.group('skill_error'))
            rank = int(match.group('rank'))
            eligible_games_played = int(match.group('eligible_games_played'))
            normed_nickname = name_merger.norm_name(match.group('nickname'))

            # The first row seen for a normalized nickname wins.
            if normed_nickname in nickname_to_entry:
                log.info('normed nickname %s already exists for %s', normed_nickname, date)
            else:
                nickname_to_entry[normed_nickname] = [
                    date, skill_mean, skill_error, rank, eligible_games_played
                ]

            last_rank = rank

        log.info('%d entries matched', num_matches)

        if num_matches == 0:
            log.error(
                'No entries found, so the regex is probably not doing its job anymore.'
            )
            break

        # The last matched rank should equal the number of rows matched.
        if num_matches != last_rank:
            log.error(
                'ERROR: # entries does not match last rank, so the regex is probably not doing its job anymore.'
            )
            break

        # .items() works on both Python 2 and Python 3 dicts.
        for nickname, data in nickname_to_entry.items():
            history_collection.update({'_id': nickname}, {'$push': {
                'history': data
            }},
                                      upsert=True)

        log.info('%d player histories updated', len(nickname_to_entry))

        last_date = date

    scanner_collection.update({'_id': 'leaderboard_history'}, {'$set': {
        'last_date': last_date
    }},
                              upsert=True)
def main(args):
    """Check goals for scanned games and persist them in the ``goals`` collection.

    A non-empty ``args.goals`` selects a partial recheck: all names are
    validated against ``goal_check_funcs`` (exiting with -1 on any unknown
    name), the 'subgoals' scanner is reset and used, and scanning ends once
    the game id equal to the 'goals' scanner's max game id has been handled.
    With no goals given, the 'goals' scanner is used and every goal is run.

    If ``args.incremental`` is false the scanner is reset and the output
    collection is cleared before scanning. Scanning also ends once
    ``args.max_games`` games were checked, when that value is non-negative.
    """
    database = utils.get_mongo_database()
    source_games = database.games
    results = database.goals
    games_seen = 0
    per_goal_counts = collections.defaultdict(int)

    wanted_goals = None
    final_game_id = None
    if not args.goals:
        scanner = incremental_scanner.IncrementalScanner('goals', database)
    else:
        bad_names = 0
        for candidate in args.goals:
            if candidate in goal_check_funcs:
                continue
            bad_names += 1
            log.error("Unrecognized goal name '%s'", candidate)
        if bad_names:
            exit(-1)

        wanted_goals = args.goals
        scanner = incremental_scanner.IncrementalScanner('subgoals', database)
        scanner.reset()
        reference_scanner = incremental_scanner.IncrementalScanner('goals', database)
        final_game_id = reference_scanner.get_max_game_id()

    if not args.incremental:
        scanner.reset()
        results.remove()
    results.ensure_index('goals.player')

    log.info("Starting run: %s", scanner.status_msg())

    for raw in utils.progress_meter(scanner.scan(source_games, {})):
        games_seen += 1
        current = game.Game(raw)
        current_id = current.get_id()

        # Fetch what is already stored for this game, if anything.
        doc = results.find_one({'_id': current_id})
        if doc is None:
            doc = collections.defaultdict(dict)
            doc['_id'] = current_id
            doc['goals'] = []

        if wanted_goals is not None:
            # Rechecking: keep only stored goals that are not being recomputed.
            doc['goals'] = [
                kept for kept in doc['goals']
                if kept['goal_name'] not in wanted_goals
            ]

        # Compute the new goals, then record and count each one.
        fresh_goals = check_goals(current, wanted_goals)
        for item in fresh_goals:
            doc['goals'].append(item)
            per_goal_counts[item['goal_name']] += 1

        plain_doc = dict(doc)
        results.save(plain_doc)

        if final_game_id and current_id == final_game_id:
            break
        limit = args.max_games
        if limit >= 0 and games_seen >= limit:
            break

    log.info("Ending run: %s", scanner.status_msg())
    scanner.save()
    print_totals(per_goal_counts, games_seen)