Ejemplo n.º 1
0
def parse_days(days):
    """Queue parsing tasks for the raw games recorded on the given days.

    For each day, the ``_id`` of every document in the ``raw_games``
    collection whose ``game_date`` matches the day is collected, split
    into chunks of ``PARSE_GAMES_CHUNK_SIZE`` and handed to
    ``parse_games.delay`` together with the day.

    days: iterable of one or more days, each either a "YYYYMMDD" string
        or a datetime.date (datetime.datetime is accepted as well); date
        objects are converted to "YYYYMMDD" before querying.

    Skips days where there are no rawgames available.

    Skips days where the parsed game collection has more than 65% of
    the quantity of rawgames, as this suggests the date has already
    been parsed.

    Returns the number of individual games referred for parsing.
    """
    game_count = 0
    db = utils.get_mongo_database()
    raw_games_col = db.raw_games
    games_col = db.games
    raw_games_col.ensure_index('game_date')

    for day in days:
        # isinstance (rather than an exact type comparison) so that
        # subclasses of date, such as datetime.datetime, are formatted too.
        if isinstance(day, datetime.date):
            day = day.strftime('%Y%m%d')
        games_to_parse = raw_games_col.find({'game_date': day}, {'_id': 1})

        # Count once per day and reuse the value below instead of issuing
        # the same count query again for the total and for the log line.
        raw_games_qty = games_to_parse.count()
        if raw_games_qty < 1:
            log.info('no games to parse in %s', day)
            continue

        parsed_games_qty = games_col.find({'game_date': day}).count()
        if float(parsed_games_qty) / float(raw_games_qty) > 0.65:
            log.info(
                'Looks like raw games for %s have already been parsed. Found %5.2f%% in games collection.',
                day, 100.0 * parsed_games_qty / raw_games_qty)
            continue

        game_count += raw_games_qty
        log.info('%s games to parse in %s', raw_games_qty, day)
        game_ids = [x['_id'] for x in games_to_parse]
        for chunk in utils.segments(game_ids, PARSE_GAMES_CHUNK_SIZE):
            parse_games.delay(chunk, day)

    return game_count
Ejemplo n.º 2
0
def parse_days(days):
    """Parses rawgames into games records and stores them in the DB.

    Takes an iterable of one or more days in the format "YYYYMMDD" or
    datetime.date (datetime.datetime is accepted as well), and generates
    tasks to parse the games that occurred on those days. The raw game
    ids of a day are split into chunks of PARSE_GAMES_CHUNK_SIZE and
    each chunk is passed to parse_games.delay together with the day.

    Skips days where there are no rawgames available.

    Skips days where the parsed game collection has more than 85% of
    the quantity of rawgames, as this suggests the date has already
    been parsed.

    Returns the number of individual games referred for parsing.
    """
    game_count = 0
    db = utils.get_mongo_database()
    raw_games_col = db.raw_games
    games_col = db.games
    raw_games_col.ensure_index('game_date')

    for day in days:
        # isinstance also covers date subclasses such as datetime.datetime,
        # which an exact type comparison would leave unformatted.
        if isinstance(day, datetime.date):
            day = day.strftime('%Y%m%d')
        games_to_parse = raw_games_col.find({'game_date': day}, {'_id': 1})

        # The raw game count is fetched once and reused for the running
        # total and the log message rather than re-running the count query.
        raw_games_qty = games_to_parse.count()
        if raw_games_qty < 1:
            log.info('no games to parse in %s', day)
            continue

        parsed_games_qty = games_col.find({'game_date': day}).count()
        if float(parsed_games_qty) / float(raw_games_qty) > 0.85:
            log.info('Looks like raw games for %s have already been parsed. Found %5.2f%% in games collection.',
                     day, 100.0 * parsed_games_qty / raw_games_qty)
            continue

        game_count += raw_games_qty
        log.info('%s games to parse in %s', raw_games_qty, day)
        raw_game_ids = [x['_id'] for x in games_to_parse]
        for chunk in utils.segments(raw_game_ids, PARSE_GAMES_CHUNK_SIZE):
            parse_games.delay(chunk, day)

    return game_count
Ejemplo n.º 3
0
def convert_to_json(log, raw_games, year_month_day, game_list=None):
    """Parse the raw games of one day and dump them in segments.

    Every document returned by ``raw_games.find`` for the given
    ``game_date`` is run through ``parse_game_from_dict``; falsy results
    are discarded, the survivors are reported to ``track_brokenness`` and
    then split into segments of at most 4000 games, each of which is
    passed to ``dump_segment`` as an ``(index, year_month_day, games)``
    tuple. The 4000-game cap is intended to keep each output file below
    16 MB for easy import into mongodb (the size itself is not checked
    here).

    log: logger-like object providing info/debug/warning.
    raw_games: collection whose find() returns a cursor supporting
        count() and iteration.
    year_month_day: string in yyyymmdd format encoding date
    game_list: meant to restrict parsing to a subset of games; not yet
        supported, so a warning is logged and the full day is parsed.

    Returns None.
    """
    if game_list is not None:
        # TODO: Enhance this to accept a list of games
        log.warning(
            "covert_to_json not able to parse subset of games, parsing the full day"
        )
    games_to_parse = raw_games.find({'game_date': year_month_day})

    # Count once; the same number drives both the emptiness check and the log.
    raw_count = games_to_parse.count()
    if raw_count < 1:
        log.info('no games to parse in %s', year_month_day)
        return
    log.info('%s games to parse in %s', raw_count, year_month_day)

    # TODO: Temporarily commented out the Pool-based implementation
    #pool = multiprocessing.Pool()
    #parsed_games = pool.map(outer_parse_game, games_to_parse, chunksize=50)
    # A real list (not map()) so len() below works on Python 3 as well.
    parsed_games = [parse_game_from_dict(log, raw_game)
                    for raw_game in games_to_parse]
    log.debug('%s before filtering %s', year_month_day, len(parsed_games))
    parsed_games = [game for game in parsed_games if game]

    track_brokenness(log, parsed_games)

    log.debug('%s after filtering %s', year_month_day, len(parsed_games))

    game_segments = list(segments(parsed_games, 4000))
    labelled_segments = [(i, year_month_day, c)
                         for i, c in enumerate(game_segments)]
    #pool.map(dump_segment, labelled_segments)
    # Explicit loop: a bare map() is lazy on Python 3 and would never
    # actually call dump_segment.
    for labelled_segment in labelled_segments:
        dump_segment(labelled_segment)
Ejemplo n.º 4
0
def convert_to_json(log, raw_games, year_month_day, game_list=None):
    """ Parse the games for the given year_month_day and hand them to
    dump_segment in segments of 4000 games or less. The cap is meant to
    keep each local file smaller than 16 MB, for easy import into mongodb
    (the size itself is not verified in this function).

    Raw games are looked up with raw_games.find on 'game_date', parsed
    with parse_game_from_dict, stripped of falsy results and reported to
    track_brokenness before being segmented.

    log: logger-like object (info, debug and warning are used).
    raw_games: collection whose find() yields a countable, iterable cursor.
    year_month_day: string in yyyymmdd format encoding date
    game_list: if given, a warning is logged and the full day is parsed
        anyway; parsing a subset of games is not implemented yet.

    Returns None.
    """
    if game_list is not None:
        # TODO: Enhance this to accept a list of games
        log.warning("covert_to_json not able to parse subset of games, parsing the full day")
    games_to_parse = raw_games.find({'game_date': year_month_day})

    # One count query serves both the guard and the log message.
    available = games_to_parse.count()
    if available < 1:
        log.info('no games to parse in %s', year_month_day)
        return
    log.info('%s games to parse in %s', available, year_month_day)

    # TODO: Temporarily commented out the Pool-based implementation
    #pool = multiprocessing.Pool()
    #parsed_games = pool.map(outer_parse_game, games_to_parse, chunksize=50)
    # Build a list explicitly; on Python 3 map() returns an iterator that
    # has no len() and could only be consumed once.
    parsed_games = [parse_game_from_dict(log, x) for x in games_to_parse]
    log.debug('%s before filtering %s', year_month_day, len(parsed_games))
    parsed_games = [x for x in parsed_games if x]

    track_brokenness(log, parsed_games)

    log.debug('%s after filtering %s', year_month_day, len(parsed_games))

    game_segments = list(segments(parsed_games, 4000))
    #pool.map(dump_segment, labelled_segments)
    # Loop instead of map(): map() is lazy on Python 3, so the segments
    # would otherwise never be dumped.
    for i, c in enumerate(game_segments):
        dump_segment((i, year_month_day, c))