Code Example #1
def load_league_team_stats(league_game_stats_file):
    '''Load LeagueTeamStats data into a Solr instance.

    :param league_game_stats_file: XDATA NBA league game/team stats file to
        parse and load into a Solr instance.
    '''
    logger.info('Starting LeagueTeamStats ingestion: ' + league_game_stats_file)

    results = LeagueTeamStats.parse_game_team_data_file(league_game_stats_file)

    solr_url = SOLR_URL + LEAGUE_TEAM_STATS_CORE + 'update?commit=true'
    data = etl.prepareDocsForSolr(results, unmarshall=False, encoding='latin-1')
    etl.postJsonDocToSolr(solr_url, data)

    logger.info('Finished LeagueTeamStats ingestion: ' + league_game_stats_file)
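
The etl.prepareDocsForSolr and etl.postJsonDocToSolr calls above come from this project's own etl module, whose implementation isn't shown here. As a rough sketch of what posting a batch of JSON documents to a Solr update handler involves, here is a minimal stand-in built on the requests library; the helper name, URL, and document fields are illustrative only, and the project's real helpers may behave differently:

import json

import requests


def post_docs_to_solr(docs, solr_update_url):
    # Illustrative stand-in for etl.postJsonDocToSolr: serialize the documents
    # and POST them to Solr's JSON update handler.
    headers = {'Content-Type': 'application/json'}
    response = requests.post(solr_update_url,
                             data=json.dumps(docs),
                             headers=headers)
    response.raise_for_status()


# Hypothetical usage with a placeholder core name; commit=true makes the
# documents searchable as soon as the request completes.
post_docs_to_solr([{'id': '0021400001', 'team': 'example'}],
                  'http://localhost:8983/solr/example_core/update?commit=true')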
Code Example #2
def load_game_play_by_plays(game_play_by_play_dir):
    '''Load GamePlayByPlay data into a Solr instance.

    :param game_play_by_play_dir: Directory containing XDATA NBA
        GamePlayByPlay JSON files to load into a Solr instance.
    '''
    logger.info('Starting GamePlayByPlay ingestion: ' + game_play_by_play_dir)

    # Get a list of all the files we need to load
    data_files = [
        os.path.join(game_play_by_play_dir, f)
        for f in os.listdir(game_play_by_play_dir)
        if os.path.isfile(os.path.join(game_play_by_play_dir, f))
    ]

    # Determine the number of threads it will take to process that many files
    # (floor division keeps the result an integer for range() and Pool()).
    total_threads = len(data_files) // GAME_PLAY_BY_PLAY_FILES_PER_THREAD
    # If the number of files isn't evenly divisible by the number of files
    # per thread that we want to use, we need to compensate for the remainder.
    total_threads += 1 if len(
        data_files) % GAME_PLAY_BY_PLAY_FILES_PER_THREAD else 0

    # Split the data files into chunks to pass to each thread.
    fpt = GAME_PLAY_BY_PLAY_FILES_PER_THREAD
    split_data_files = [
        data_files[(fpt * index):(fpt * index) + fpt]
        for index in range(total_threads)
    ]

    # Process all the files!
    thread_pool = multiprocessing.Pool(total_threads)
    results = thread_pool.map(load_game_play_by_play_files, split_data_files)
    thread_pool.close()
    thread_pool.join()

    # Join result set here
    results = list(itertools.chain.from_iterable(results))

    solr_url = SOLR_URL + GAME_PLAY_BY_PLAY_CORE + 'update?commit=true'
    # Post the documents in smaller batches rather than one huge request.
    num_splits = 10
    files_per_split = len(results) // num_splits
    for i in range(num_splits + 1):
        logger.info("Sending PlayByPlay split #" + str(i))
        if i == num_splits:
            # Final batch: whatever is left over after the even-sized splits.
            data = json.dumps(results[num_splits * files_per_split:])
        else:
            data = json.dumps(results[i * files_per_split:(i + 1) *
                                      files_per_split])

        etl.postJsonDocToSolr(solr_url, data)

    logger.info('GamePlayByPlay ingestion complete')
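
The file fan-out in the example above (and in the ones that follow) always uses the same arithmetic: floor-divide the file count by the files-per-worker setting, add one extra worker if there is a remainder, then slice the list into consecutive fixed-size chunks. A small self-contained sketch of that chunking step, using illustrative names that are not part of the original module:

def chunk_files(data_files, files_per_worker):
    # One worker per full chunk, plus one more if there is a remainder.
    total_workers = (len(data_files) + files_per_worker - 1) // files_per_worker
    # Slice the file list into consecutive chunks of at most
    # files_per_worker entries each.
    return [
        data_files[i * files_per_worker:(i + 1) * files_per_worker]
        for i in range(total_workers)
    ]


# 7 files at 3 per worker -> [['f0', 'f1', 'f2'], ['f3', 'f4', 'f5'], ['f6']]
print(chunk_files(['f{}'.format(n) for n in range(7)], 3))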
Code Example #3
def load_game_comments(game_comments_dir):
    '''Load GameComments data into a Solr instance.

    :param game_comments_dir: Directory containing XDATA NBA GameComments JSON
        files to load into a Solr instance.
    '''
    logger.info('Starting GameComments ingestion: ' + game_comments_dir)

    # Train the sentiment analyser that we'll use when processing
    # all the game comments.
    logger.info('Training sentiment analyser for comment ingestion')
    SentimentAnalyser.train()
    logger.info('Sentiment analyser training complete')

    # Get a list of all the files we need to load
    data_files = [
        os.path.join(game_comments_dir, f)
        for f in os.listdir(game_comments_dir)
        if os.path.isfile(os.path.join(game_comments_dir, f))
    ]

    # Determine the number of threads it will take to process that many files
    total_threads = len(data_files) // GAME_COMMENTS_FILES_PER_THREAD
    # If the number of files isn't evenly divisible by the number of files
    # per thread that we want to use, we need to compensate for the remainder.
    total_threads += 1 if len(
        data_files) % GAME_COMMENTS_FILES_PER_THREAD else 0

    # Split the data files into chunks to pass to each thread.
    fpt = GAME_COMMENTS_FILES_PER_THREAD
    split_data_files = [
        data_files[(fpt * index):(fpt * index) + fpt]
        for index in range(total_threads)
    ]

    # Process all the files!
    thread_pool = multiprocessing.Pool(total_threads)
    results = thread_pool.map(load_game_comments_files, split_data_files)
    thread_pool.close()
    thread_pool.join()

    # Join result set here
    results = list(itertools.chain.from_iterable(results))

    # Send single hit to Solr here
    solr_url = SOLR_URL + GAME_COMMENTS_CORE + 'update?commit=true'
    data = etl.prepareDocsForSolr(results,
                                  unmarshall=False,
                                  encoding='latin-1')
    etl.postJsonDocToSolr(solr_url, data)

    logger.info('GameComments ingestion complete')
Code Example #4
def load_game_play_by_plays(game_play_by_play_dir):
    '''Load GamePlayByPlay data into a Solr instance.

    :param game_play_by_play_dir: Directory containing XDATA NBA
        GamePlayByPlay JSON files to load into a Solr instance.
    '''
    logger.info('Starting GamePlayByPlay ingestion: ' + game_play_by_play_dir)

    # Get a list of all the files we need to load
    data_files = [os.path.join(game_play_by_play_dir, f)
                  for f in os.listdir(game_play_by_play_dir)
                  if os.path.isfile(os.path.join(game_play_by_play_dir, f))]

    # Determine the number of threads it will take to process that many files
    total_threads = len(data_files) // GAME_PLAY_BY_PLAY_FILES_PER_THREAD
    # If the number of files isn't evenly divisible by the number of files
    # per thread that we want to use, we need to compensate for the remainder.
    total_threads += 1 if len(data_files) % GAME_PLAY_BY_PLAY_FILES_PER_THREAD else 0

    # Split the data files into chunks to pass to each thread.
    fpt = GAME_PLAY_BY_PLAY_FILES_PER_THREAD
    split_data_files = [data_files[(fpt * index):(fpt * index) + fpt]
                        for index in range(total_threads)]

    # Process all the files!
    thread_pool = multiprocessing.Pool(total_threads)
    results = thread_pool.map(load_game_play_by_play_files, split_data_files)
    thread_pool.close()
    thread_pool.join()

    # Join result set here
    results = list(itertools.chain.from_iterable(results))

    solr_url = SOLR_URL + GAME_PLAY_BY_PLAY_CORE + 'update?commit=true'
    # Post the documents in smaller batches rather than one huge request.
    num_splits = 10
    files_per_split = len(results) // num_splits
    for i in range(num_splits + 1):
        logger.info("Sending PlayByPlay split #" + str(i))
        if i == num_splits:
            # Final batch: whatever is left over after the even-sized splits.
            data = json.dumps(results[num_splits * files_per_split:])
        else:
            data = json.dumps(results[i*files_per_split:(i+1)*files_per_split])

        etl.postJsonDocToSolr(solr_url, data)

    logger.info('GamePlayByPlay ingestion complete')
Code Example #5
def load_game_comments(game_comments_dir):
    '''Load GameComments data into a Solr instance.

    :param game_comments_dir: Directory containing XDATA NBA GameComments JSON
        files to load into a Solr instance.
    '''
    logger.info('Starting GameComments ingestion: ' + game_comments_dir)

    # Train the sentiment analyser that we'll use when processing
    # all the game comments.
    logger.info('Training sentiment analyser for comment ingestion')
    SentimentAnalyser.train()
    logger.info('Sentiment analyser training complete')

    # Get a list of all the files we need to load
    data_files = [os.path.join(game_comments_dir, f)
                  for f in os.listdir(game_comments_dir)
                  if os.path.isfile(os.path.join(game_comments_dir, f))]

    # Determine the number of threads it will take to process that many files
    total_threads = len(data_files) // GAME_COMMENTS_FILES_PER_THREAD
    # If the number of files isn't evenly divisible by the number of files
    # per thread that we want to use, we need to compensate for the remainder.
    total_threads += 1 if len(data_files) % GAME_COMMENTS_FILES_PER_THREAD else 0

    # Split the data files into chunks to pass to each thread.
    fpt = GAME_COMMENTS_FILES_PER_THREAD
    split_data_files = [data_files[(fpt * index):(fpt * index) + fpt]
                        for index in range(total_threads)]

    # Process all the files!
    thread_pool = multiprocessing.Pool(total_threads)
    results = thread_pool.map(load_game_comments_files, split_data_files)
    thread_pool.close()
    thread_pool.join()

    # Join result set here
    results = list(itertools.chain.from_iterable(results))

    # Send single hit to Solr here
    solr_url = SOLR_URL + GAME_COMMENTS_CORE + 'update?commit=true'
    data = etl.prepareDocsForSolr(results, unmarshall=False, encoding='latin-1')
    etl.postJsonDocToSolr(solr_url, data)

    logger.info('GameComments ingestion complete')
Code Example #6
def load_game_stats(game_stats_dir):
    '''Load GameStats data into a Solr instance.

    :param game_stats_dir: Directory containing XDATA NBA
        GameStats JSON files to load into a Solr instance.
    '''
    logger.info('Starting GameStats ingestion: ' + game_stats_dir)

    # Get a list of all the files we need to load
    data_files = [os.path.join(game_stats_dir, f)
                  for f in os.listdir(game_stats_dir)
                  if os.path.isfile(os.path.join(game_stats_dir, f))]

    # Determine the number of threads it will take to process that many files
    total_threads = len(data_files) // GAME_STATS_FILES_PER_THREAD
    # If the number of files isn't evenly divisible by the number of files
    # per thread that we want to use, we need to compensate for the remainder.
    total_threads += 1 if len(data_files) % GAME_STATS_FILES_PER_THREAD else 0

    # Split the data files into chunks to pass to each thread.
    fpt = GAME_STATS_FILES_PER_THREAD
    split_data_files = [data_files[(fpt * index):(fpt * index) + fpt]
                        for index in range(total_threads)]

    # Process all the files!
    thread_pool = multiprocessing.Pool(total_threads)
    results = thread_pool.map(load_game_stats_files, split_data_files)
    thread_pool.close()
    thread_pool.join()

    # Join result set here
    results = list(itertools.chain.from_iterable(results))

    solr_url = SOLR_URL + GAME_STATS_CORE + 'update?commit=true'
    data = etl.prepareDocsForSolr(results, unmarshall=False, encoding='latin-1')
    etl.postJsonDocToSolr(solr_url, data)

    logger.info('GameStats ingestion complete')
    logger.info('Processing GameStats results for GameResults core update')
    sorted_results = sorted(results, key=lambda x: x['game_id'])

    game_results = []
    for team1, team2 in zip(sorted_results[::2], sorted_results[1::2]):
        if team1['game_id'] != team2['game_id']:
            err = (
                'Invalid records. Game_ids don\'t match: '
                '{} and {}'
            ).format(team1['game_id'], team2['game_id'])
            logger.critical(err)
            continue

        if team1['pts'] > team2['pts']:
            winner, loser = team1, team2
        elif team2['pts'] > team1['pts']:
            winner, loser = team2, team1
        else:
            err = (
                'Game resulted in a tie: game_id_1 {} game_id_2 {} '
                'team_1_id {} team_2_id {}'
            ).format(team1['game_id'], team2['game_id'],
                     team1['team_id'], team2['team_id'])
            logger.critical(err)
            continue

        game_results.append({
            'id': winner['game_id'],
            'game_id': winner['game_id'],
            'winner_id': winner['team_id'],
            'loser_id': loser['team_id'],
        })

    solr_url = SOLR_URL + GAME_RESULTS_CORE + 'update?commit=true'
    data = etl.prepareDocsForSolr(game_results, unmarshall=False,
                                  encoding='latin-1')
    etl.postJsonDocToSolr(solr_url, data)

    logger.info('GameResults ingestion complete')
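
The winner/loser pairing in the example above relies on the sorted results containing exactly two team records per game_id, so zip(sorted_results[::2], sorted_results[1::2]) walks the list two records at a time. A toy illustration of that pairing with made-up data:

records = [
    {'game_id': 1, 'team_id': 'A', 'pts': 101},
    {'game_id': 1, 'team_id': 'B', 'pts': 99},
    {'game_id': 2, 'team_id': 'C', 'pts': 90},
    {'game_id': 2, 'team_id': 'D', 'pts': 95},
]

# Pair record 0 with record 1, record 2 with record 3, and so on.
for team1, team2 in zip(records[::2], records[1::2]):
    winner = team1 if team1['pts'] > team2['pts'] else team2
    print('{} {}'.format(winner['game_id'], winner['team_id']))
# Prints "1 A" followed by "2 D".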
Code Example #7
def load_commentary(commentary_dirs):
    '''Load commentary data into a Solr instance.

    :param commentary_dirs: A dict containing commentary directory
        information for loading into Solr. Assuming the standard
        NBA data directory, commentary_dirs should contain keys
        listing the matching folder names for 'notebook', 'preview',
        and 'recap'.
    '''
    logger.info('Starting Commentary ingestion: ' + str(commentary_dirs))

    preview_dir = commentary_dirs['preview']
    recap_dir = commentary_dirs['recap']
    notebook_dir = commentary_dirs['notebook']

    # Get lists of all the files we need to load
    preview_files = [os.path.join(preview_dir, f)
                     for f in os.listdir(preview_dir)
                     if os.path.isfile(os.path.join(preview_dir, f))]
    preview_files.sort()

    recap_files = [os.path.join(recap_dir, f)
                   for f in os.listdir(recap_dir)
                   if os.path.isfile(os.path.join(recap_dir, f))]
    recap_files.sort()

    notebook_files = [os.path.join(notebook_dir, f)
                      for f in os.listdir(notebook_dir)
                      if os.path.isfile(os.path.join(notebook_dir, f))]
    notebook_files.sort()

    # Determine the number of threads it will take to process that many files
    total_threads = len(preview_files) // GAME_COMMENTARY_FILES_PER_THREAD
    # If the number of files isn't evenly divisible by the number of files
    # per thread that we want to use, we need to compensate for the remainder.
    total_threads += 1 if len(preview_files) % GAME_COMMENTARY_FILES_PER_THREAD else 0

    # Split the data files into chunks to pass to each thread.
    fpt = GAME_COMMENTARY_FILES_PER_THREAD
    split_preview_files = [preview_files[(fpt * index):(fpt * index) + fpt]
                           for index in range(total_threads)]
    split_recap_files = [recap_files[(fpt * index):(fpt * index) + fpt]
                         for index in range(total_threads)]
    split_notebook_files = [notebook_files[(fpt * index):(fpt * index) + fpt]
                            for index in range(total_threads)]

    split_data_files = [(split_preview_files[i], split_recap_files[i],
                         split_notebook_files[i])
                        for i in range(len(split_preview_files))]

    # Process all the files!
    thread_pool = multiprocessing.Pool(total_threads)
    results = thread_pool.map(load_commentary_files, split_data_files)
    thread_pool.close()
    thread_pool.join()

    # Join result set
    results = list(itertools.chain.from_iterable(results))

    solr_url = SOLR_URL + GAME_COMMENTARY_CORE + 'update?commit=true'
    data = etl.prepareDocsForSolr(results, unmarshall=False, encoding='latin-1')
    etl.postJsonDocToSolr(solr_url, data)

    logger.info('Commentary ingestion complete')