def run_scrape_function_with_retries(scrape_function, date):
    num_attempts = 0

    while True:
        num_attempts += 1

        status = scrape_function(date)

        if status == 200:
            log.info('successful')
            break
        elif status == 404:
            log.info('file not found')
            break
        elif status == 'leaderboard updated':
            log.warning('the leaderboard was updated after this script was started, so re-run this script')
            break
        else:
            if num_attempts < 3:
                log.info('Status was %s, retrying', status)
            else:
                log.error('reached 3 attempts, aborting')
                break

    return status
def run_scrape_function_with_retries(scrape_function, date):
    num_attempts = 0

    while True:
        num_attempts += 1

        status = scrape_function(date)

        if status == 200:
            log.info('successful')
            break
        elif status == 404:
            log.info('file not found')
            break
        elif status == 'leaderboard updated':
            log.warning('the leaderboard was updated after this script was started, so re-run this script')
            break
        else:
            if num_attempts < 3:
                log.info('Status was %s, retrying', status)
            else:
                log.error('reached 3 attempts, aborting')
                break

    return status
Example #3
0
def main():
    filename_pattern = re.compile(r'^(?P<date>\d\d\d\d-\d\d-\d\d)\.html\.bz2$')
    iso_leaderboard_pattern = re.compile(r'<td>(?P<skill_mean>-?\d+\.\d+) &plusmn; ' + \
                                     r'(?P<skill_error>-?\d+\.\d+)</td><td class=c2>' + \
                                     r'(?P<rank>\d+)</td><td class=c>' + \
                                     r'(?P<eligible_games_played>\d+)</td><td>' + \
                                     r'(?P<nickname>[^<]*) <')
    goko_leaderboard_pattern = re.compile(r'\s+<td class="leaders-table-item table-item-rank">(?P<rank>\d+)</td>\s*\n' + r'\s*<td class="leaders-table-item table-item-name"><img [^>]*>(?P<nickname>.*)</td>\s*\n' + r'\s*<td class="leaders-table-item table-item-points">(?P<skill_mean>\d+)</td>')

    database = utils.get_mongo_database()
    history_collection = database.leaderboard_history
    scanner_collection = database.scanner

    db_val = scanner_collection.find_one({'_id': 'leaderboard_history'})
    last_date = db_val['last_date'] if db_val else '0000-00-00'

    directory = 'static/leaderboard/'

    filenames = os.listdir(directory)
    filenames.sort()

    bad_leaderboard_dates = utils.get_bad_leaderboard_dates()

    for filename in filenames:
        match = filename_pattern.search(filename)
        if not match:
            continue

        date = match.group('date')

        if date in bad_leaderboard_dates:
            # don't load data from when the leaderboard was messed up
            log.warning("Skipping %s because the leaderboard was messed up", date)
            continue

        if date <= last_date:
            log.warning("Date %s is less than last date %s", date, last_date)
            continue

        log.info('Processing %s', date)

        file_obj = bz2.BZ2File(directory + filename)
        content = file_obj.read().decode('utf-8')
        file_obj.close()

        nickname_to_entry = {}
        num_matches = 0
        last_rank = -1

        pos = 0
        while True:
            match = iso_leaderboard_pattern.search(content, pos)
            if not match:
                break

            num_matches += 1
            skill_mean = float(match.group('skill_mean'))
            skill_error = float(match.group('skill_error'))
            rank = int(match.group('rank'))
            eligible_games_played = int(match.group('eligible_games_played'))
            nickname = match.group('nickname')

            normed_nickname = name_merger.norm_name(nickname)

            if normed_nickname not in nickname_to_entry:
                nickname_to_entry[normed_nickname] = [date, skill_mean, skill_error, rank, eligible_games_played]
            else:
                log.info('normed nickname %s already exists for %s', normed_nickname, date)

            last_rank = rank
            pos = match.end()

        pos = 0
        while True:
            match = goko_leaderboard_pattern.search(content, pos)
            if not match:
                break

            num_matches += 1
            skill_mean = float(match.group('skill_mean'))
            skill_error = 0
            rank = int(match.group('rank'))
            eligible_games_played = 0
            nickname = match.group('nickname')

            normed_nickname = nickname

            if normed_nickname not in nickname_to_entry:
                nickname_to_entry[normed_nickname] = [date, skill_mean, skill_error, rank, eligible_games_played]
            else:
                log.info('normed nickname %s already exists for %s', normed_nickname, date)

            last_rank = rank
            pos = match.end()
        log.info('%d entries matched', num_matches)

        if num_matches == 0:
            log.error('No entries found, so the regex is probably not doing its job anymore.')
            break

        if num_matches != last_rank:
            log.error('ERROR: # entries does not match last rank, so the regex is probably not doing its job anymore.')
            break

        for nickname, data in nickname_to_entry.iteritems():
            history_collection.update({'_id': nickname}, {'$push': {'history': data}}, upsert=True)

        log.info('%d player histories updated', len(nickname_to_entry))

        last_date = date

    scanner_collection.update({'_id': 'leaderboard_history'}, {'$set': {'last_date': last_date}}, upsert=True)
Example #4
0
def main(args):
    db = utils.get_mongo_database()
    games_collection = db.games
    output_collection = db.goals
    total_checked = 0

    checker_output = collections.defaultdict(int)

    if args.goals:
        valid_goals = True
        for goal_name in args.goals:
            if goal_name not in goal_check_funcs:
                valid_goals = False
                log.error("Unrecognized goal name '%s'", goal_name)
        if not valid_goals:
            exit(-1)
        goals_to_check = args.goals

        scanner = incremental_scanner.IncrementalScanner('subgoals', db)
        scanner.reset()
        main_scanner = incremental_scanner.IncrementalScanner('goals', db)
        last = main_scanner.get_max_game_id()
    else:
        goals_to_check = None
        scanner = incremental_scanner.IncrementalScanner('goals', db)
        last = None

    if not args.incremental:
        scanner.reset()
        output_collection.remove()
    output_collection.ensure_index('goals.player')

    log.info("Starting run: %s", scanner.status_msg())

    for g in utils.progress_meter(scanner.scan(games_collection, {})):
        total_checked += 1
        game_val = game.Game(g)

        # Get existing goal set (if exists)
        game_id = game_val.get_id()
        mongo_val = output_collection.find_one({'_id': game_id})

        if mongo_val is None:
            mongo_val = collections.defaultdict( dict )
            mongo_val['_id'] = game_id
            mongo_val['goals'] = []

        # If rechecking, delete old values
        if goals_to_check is not None:
            goals = mongo_val['goals']
            for ind in range(len(goals) - 1, -1, -1):
                goal = goals[ind]
                if goal['goal_name'] in goals_to_check:
                    del goals[ind]

        # Get new values
        goals = check_goals(game_val, goals_to_check)

        # Write new values
        for goal in goals:
            goal_name = goal['goal_name']
            mongo_val['goals'].append(goal)
            checker_output[goal_name] += 1

        mongo_val = dict(mongo_val)
        output_collection.save(mongo_val)

        if last and game_id == last:
            break
        if args.max_games >= 0 and total_checked >= args.max_games:
            break

    log.info("Ending run: %s", scanner.status_msg())
    scanner.save()
    print_totals(checker_output, total_checked)
Example #5
0
def main():
    filename_pattern = re.compile(r'^(?P<date>\d\d\d\d-\d\d-\d\d)\.html\.bz2$')
    leaderboard_pattern = re.compile(r'<td>(?P<skill_mean>-?\d+\.\d+) &plusmn; ' + \
                                     r'(?P<skill_error>-?\d+\.\d+)</td><td class=c2>' + \
                                     r'(?P<rank>\d+)</td><td class=c>' + \
                                     r'(?P<eligible_games_played>\d+)</td><td>' + \
                                     r'(?P<nickname>[^<]*) <')

    database = utils.get_mongo_database()
    history_collection = database.leaderboard_history
    scanner_collection = database.scanner

    db_val = scanner_collection.find_one({'_id': 'leaderboard_history'})
    last_date = db_val['last_date'] if db_val else '0000-00-00'

    directory = 'static/leaderboard/'

    filenames = os.listdir(directory)
    filenames.sort()

    bad_leaderboard_dates = utils.get_bad_leaderboard_dates()

    for filename in filenames:
        match = filename_pattern.search(filename)
        if not match:
            continue

        date = match.group('date')

        if date in bad_leaderboard_dates:
            # don't load data from when the leaderboard was messed up
            log.warning("Skipping %s because the leaderboard was messed up",
                        date)
            continue

        if date <= last_date:
            log.warning("Date %s is less than last date %s", date, last_date)
            continue

        log.info('Processing %s', date)

        file_obj = bz2.BZ2File(directory + filename)
        content = file_obj.read().decode('utf-8')
        file_obj.close()

        nickname_to_entry = {}
        num_matches = 0
        last_rank = -1

        pos = 0
        while True:
            match = leaderboard_pattern.search(content, pos)
            if not match:
                break

            num_matches += 1
            skill_mean = float(match.group('skill_mean'))
            skill_error = float(match.group('skill_error'))
            rank = int(match.group('rank'))
            eligible_games_played = int(match.group('eligible_games_played'))
            nickname = match.group('nickname')

            normed_nickname = name_merger.norm_name(nickname)

            if normed_nickname not in nickname_to_entry:
                nickname_to_entry[normed_nickname] = [
                    date, skill_mean, skill_error, rank, eligible_games_played
                ]
            else:
                log.info('normed nickname %s already exists for %s',
                         normed_nickname, date)

            last_rank = rank
            pos = match.end()

        log.info('%d entries matched', num_matches)

        if num_matches == 0:
            log.error(
                'No entries found, so the regex is probably not doing its job anymore.'
            )
            break

        if num_matches != last_rank:
            log.error(
                'ERROR: # entries does not match last rank, so the regex is probably not doing its job anymore.'
            )
            break

        for nickname, data in nickname_to_entry.iteritems():
            history_collection.update({'_id': nickname},
                                      {'$push': {
                                          'history': data
                                      }},
                                      upsert=True)

        log.info('%d player histories updated', len(nickname_to_entry))

        last_date = date

    scanner_collection.update({'_id': 'leaderboard_history'},
                              {'$set': {
                                  'last_date': last_date
                              }},
                              upsert=True)
Example #6
0
def main(args):
    db = utils.get_mongo_database()
    games_collection = db.games
    output_collection = db.goals
    total_checked = 0

    checker_output = collections.defaultdict(int)

    if args.goals:
        valid_goals = True
        for goal_name in args.goals:
            if goal_name not in goal_check_funcs:
                valid_goals = False
                log.error("Unrecognized goal name '%s'", goal_name)
        if not valid_goals:
            exit(-1)
        goals_to_check = args.goals

        scanner = incremental_scanner.IncrementalScanner('subgoals', db)
        scanner.reset()
        main_scanner = incremental_scanner.IncrementalScanner('goals', db)
        last = main_scanner.get_max_game_id()
    else:
        goals_to_check = None
        scanner = incremental_scanner.IncrementalScanner('goals', db)
        last = None

    if not args.incremental:
        scanner.reset()
        output_collection.remove()
    output_collection.ensure_index('goals.player')

    log.info("Starting run: %s", scanner.status_msg())

    for g in utils.progress_meter(scanner.scan(games_collection, {})):
        total_checked += 1
        game_val = game.Game(g)

        # Get existing goal set (if exists)
        game_id = game_val.get_id()
        mongo_val = output_collection.find_one({'_id': game_id})

        if mongo_val is None:
            mongo_val = collections.defaultdict(dict)
            mongo_val['_id'] = game_id
            mongo_val['goals'] = []

        # If rechecking, delete old values
        if goals_to_check is not None:
            goals = mongo_val['goals']
            for ind in range(len(goals) - 1, -1, -1):
                goal = goals[ind]
                if goal['goal_name'] in goals_to_check:
                    del goals[ind]

        # Get new values
        goals = check_goals(game_val, goals_to_check)

        # Write new values
        for goal in goals:
            goal_name = goal['goal_name']
            mongo_val['goals'].append(goal)
            checker_output[goal_name] += 1

        mongo_val = dict(mongo_val)
        output_collection.save(mongo_val)

        if last and game_id == last:
            break
        if args.max_games >= 0 and total_checked >= args.max_games:
            break

    log.info("Ending run: %s", scanner.status_msg())
    scanner.save()
    print_totals(checker_output, total_checked)