# Standard-library imports used by the loaders below. Application-level
# names (logger, etl, SOLR_URL, SentimentAnalyser, the *_CORE and
# *_FILES_PER_THREAD constants, and the per-chunk worker functions) are
# assumed to be defined elsewhere in this module.
import itertools
import json
import multiprocessing
import os


def load_league_team_stats(league_game_stats_file):
    '''Load LeagueTeamStats data into a Solr instance.

    :param league_game_stats_file: XDATA NBA league game stats file to
        load into a Solr instance.
    '''
    logger.info('Starting LeagueTeamStats ingestion: ' + league_game_stats_file)

    results = LeagueTeamStats.parse_game_team_data_file(league_game_stats_file)

    solr_url = SOLR_URL + LEAGUE_TEAM_STATS_CORE + 'update?commit=true'
    data = etl.prepareDocsForSolr(results, unmarshall=False, encoding='latin-1')
    etl.postJsonDocToSolr(solr_url, data)

    logger.info('Finished LeagueTeamStats ingestion: ' + league_game_stats_file)
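

# Illustrative sketch, not part of the ingestion path: how the Solr update
# URL that every loader builds is composed. The SOLR_URL and core values
# below are hypothetical placeholders, not this module's real configuration.
def _example_solr_update_url():
    '''Show the shape of the core update URL the loaders post to.'''
    solr_url = 'http://localhost:8983/solr/'   # hypothetical SOLR_URL
    core = 'league_team_stats/'                # hypothetical core name
    url = solr_url + core + 'update?commit=true'
    # commit=true asks Solr to make the posted documents searchable as
    # soon as the update request returns.
    assert url == ('http://localhost:8983/solr/'
                   'league_team_stats/update?commit=true')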


def load_game_play_by_plays(game_play_by_play_dir):
    '''Load GamePlayByPlay data into a Solr instance.

    :param game_play_by_play_dir: Directory containing XDATA NBA
        GamePlayByPlay JSON files to load into a Solr instance.
    '''
    logger.info('Starting GamePlayByPlay ingestion: ' + game_play_by_play_dir)

    # Get a list of all the files we need to load.
    data_files = [os.path.join(game_play_by_play_dir, f)
                  for f in os.listdir(game_play_by_play_dir)
                  if os.path.isfile(os.path.join(game_play_by_play_dir, f))]

    # Determine the number of worker processes it will take to handle that
    # many files. Floor division keeps the count an int; add one worker if
    # the file count isn't evenly divisible and leaves a remainder chunk.
    total_threads = len(data_files) // GAME_PLAY_BY_PLAY_FILES_PER_THREAD
    total_threads += 1 if len(data_files) % GAME_PLAY_BY_PLAY_FILES_PER_THREAD else 0

    # Split the data files into chunks to pass to each worker.
    fpt = GAME_PLAY_BY_PLAY_FILES_PER_THREAD
    split_data_files = [data_files[fpt * index:fpt * (index + 1)]
                        for index in range(total_threads)]

    # Process all the files!
    thread_pool = multiprocessing.Pool(total_threads)
    results = thread_pool.map(load_game_play_by_play_files, split_data_files)
    thread_pool.close()
    thread_pool.join()

    # Flatten the per-worker result lists into a single list.
    results = list(itertools.chain.from_iterable(results))

    # Play-by-play result sets are large, so post them in batches: ten even
    # splits plus a final pass for any remainder.
    solr_url = SOLR_URL + GAME_PLAY_BY_PLAY_CORE + 'update?commit=true'
    num_splits = 10
    files_per_split = len(results) // num_splits
    for i in range(num_splits + 1):
        if i == num_splits:
            # Final pass: everything left over after the ten even splits.
            batch = results[num_splits * files_per_split:]
        else:
            batch = results[i * files_per_split:(i + 1) * files_per_split]
        if not batch:
            continue
        logger.info('Sending PlayByPlay split #' + str(i))
        etl.postJsonDocToSolr(solr_url, json.dumps(batch))

    logger.info('GamePlayByPlay ingestion complete')
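

# Illustrative sketch, not part of the ingestion path: how the ten-way
# batch split above partitions a result list, including the remainder
# batch sent on the final pass. All values here are made up.
def _example_batch_splits():
    '''Verify that the even splits plus the remainder cover every record.'''
    results = list(range(23))
    num_splits = 10
    per_split = len(results) // num_splits       # == 2
    batches = [results[i * per_split:(i + 1) * per_split]
               for i in range(num_splits)]
    batches.append(results[num_splits * per_split:])  # remainder: [20, 21, 22]
    # Together the batches cover every record exactly once, in order.
    assert [r for b in batches for r in b] == results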


def load_game_comments(game_comments_dir):
    '''Load GameComments data into a Solr instance.

    :param game_comments_dir: Directory containing XDATA NBA GameComments
        JSON files to load into a Solr instance.
    '''
    logger.info('Starting GameComments ingestion: ' + game_comments_dir)

    # Train the sentiment analyser that we'll use when processing
    # all the game comments.
    logger.info('Training sentiment analyser for comment ingestion')
    SentimentAnalyser.train()
    logger.info('Sentiment analyser training complete')

    # Get a list of all the files we need to load.
    data_files = [os.path.join(game_comments_dir, f)
                  for f in os.listdir(game_comments_dir)
                  if os.path.isfile(os.path.join(game_comments_dir, f))]

    # Determine the number of worker processes it will take to handle that
    # many files, adding one for any remainder chunk.
    total_threads = len(data_files) // GAME_COMMENTS_FILES_PER_THREAD
    total_threads += 1 if len(data_files) % GAME_COMMENTS_FILES_PER_THREAD else 0

    # Split the data files into chunks to pass to each worker.
    fpt = GAME_COMMENTS_FILES_PER_THREAD
    split_data_files = [data_files[fpt * index:fpt * (index + 1)]
                        for index in range(total_threads)]

    # Process all the files!
    thread_pool = multiprocessing.Pool(total_threads)
    results = thread_pool.map(load_game_comments_files, split_data_files)
    thread_pool.close()
    thread_pool.join()

    # Flatten the per-worker result lists into a single list.
    results = list(itertools.chain.from_iterable(results))

    # Comment result sets are small enough to post to Solr in one request.
    solr_url = SOLR_URL + GAME_COMMENTS_CORE + 'update?commit=true'
    data = etl.prepareDocsForSolr(results, unmarshall=False, encoding='latin-1')
    etl.postJsonDocToSolr(solr_url, data)

    logger.info('GameComments ingestion complete')
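

# Illustrative sketch, not part of the ingestion path: the chunking
# arithmetic shared by all the loaders above, shown on toy file names.
def _chunk_files_example():
    '''Floor division plus a remainder check yields ceil-style chunking.'''
    files = ['a.json', 'b.json', 'c.json', 'd.json', 'e.json']
    per_thread = 2

    # ceil(5 / 2) == 3 workers: two full chunks plus one remainder chunk.
    total = len(files) // per_thread
    total += 1 if len(files) % per_thread else 0
    chunks = [files[per_thread * i:per_thread * (i + 1)] for i in range(total)]

    assert chunks == [['a.json', 'b.json'], ['c.json', 'd.json'], ['e.json']]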


def load_game_stats(game_stats_dir):
    '''Load GameStats data into a Solr instance.

    :param game_stats_dir: Directory containing XDATA NBA GameStats JSON
        files to load into a Solr instance.
    '''
    logger.info('Starting GameStats ingestion: ' + game_stats_dir)

    # Get a list of all the files we need to load.
    data_files = [os.path.join(game_stats_dir, f)
                  for f in os.listdir(game_stats_dir)
                  if os.path.isfile(os.path.join(game_stats_dir, f))]

    # Determine the number of worker processes it will take to handle that
    # many files, adding one for any remainder chunk.
    total_threads = len(data_files) // GAME_STATS_FILES_PER_THREAD
    total_threads += 1 if len(data_files) % GAME_STATS_FILES_PER_THREAD else 0

    # Split the data files into chunks to pass to each worker.
    fpt = GAME_STATS_FILES_PER_THREAD
    split_data_files = [data_files[fpt * index:fpt * (index + 1)]
                        for index in range(total_threads)]

    # Process all the files!
    thread_pool = multiprocessing.Pool(total_threads)
    results = thread_pool.map(load_game_stats_files, split_data_files)
    thread_pool.close()
    thread_pool.join()

    # Flatten the per-worker result lists into a single list.
    results = list(itertools.chain.from_iterable(results))

    solr_url = SOLR_URL + GAME_STATS_CORE + 'update?commit=true'
    data = etl.prepareDocsForSolr(results, unmarshall=False, encoding='latin-1')
    etl.postJsonDocToSolr(solr_url, data)
    logger.info('GameStats ingestion complete')

    logger.info('Processing GameStats results for GameResults core update')
    # Each game yields two stat records (one per team), so after sorting by
    # game_id consecutive pairs should belong to the same game.
    sorted_results = sorted(results, key=lambda x: x['game_id'])
    game_results = []
    for team1, team2 in zip(sorted_results[::2], sorted_results[1::2]):
        if team1['game_id'] != team2['game_id']:
            err = (
                'Invalid records. Game_ids don\'t match: {} and {}'
            ).format(team1['game_id'], team2['game_id'])
            logger.critical(err)
            continue

        if team1['pts'] > team2['pts']:
            winner, loser = team1, team2
        elif team2['pts'] > team1['pts']:
            winner, loser = team2, team1
        else:
            err = (
                'Game resulted in a tie: game_id_1 {} game_id_2 {} '
                'team_1_id {} team_2_id {}'
            ).format(team1['game_id'], team2['game_id'],
                     team1['team_id'], team2['team_id'])
            logger.critical(err)
            continue

        game_results.append({
            'id': winner['game_id'],
            'game_id': winner['game_id'],
            'winner_id': winner['team_id'],
            'loser_id': loser['team_id'],
        })

    solr_url = SOLR_URL + GAME_RESULTS_CORE + 'update?commit=true'
    # Post the derived game_results records (not the raw stats again).
    data = etl.prepareDocsForSolr(game_results, unmarshall=False,
                                  encoding='latin-1')
    etl.postJsonDocToSolr(solr_url, data)
    logger.info('GameResults ingestion complete')
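

# Illustrative sketch, not part of the ingestion path: how sorting by
# game_id and zipping even/odd slices pairs the two team rows for each
# game. All records below are made up.
def _example_game_results_pairing():
    '''Pair team rows per game and pick the winner by points.'''
    rows = [
        {'game_id': '002', 'team_id': 'BOS', 'pts': 101},
        {'game_id': '001', 'team_id': 'LAL', 'pts': 99},
        {'game_id': '001', 'team_id': 'NYK', 'pts': 95},
        {'game_id': '002', 'team_id': 'MIA', 'pts': 110},
    ]
    rows.sort(key=lambda x: x['game_id'])
    # rows[::2] takes the first record of each pair, rows[1::2] the second.
    pairs = list(zip(rows[::2], rows[1::2]))
    winners = [max(a, b, key=lambda r: r['pts'])['team_id'] for a, b in pairs]
    assert winners == ['LAL', 'MIA']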


def load_commentary(commentary_dirs):
    '''Load commentary data into a Solr instance.

    :param commentary_dirs: A dict containing commentary directory
        information for loading into Solr. Assuming the standard NBA data
        directory, commentary_dirs should contain the matching folder
        names under the keys 'notebook', 'preview', and 'recap'.
    '''
    logger.info('Starting Commentary ingestion: ' + str(commentary_dirs))

    preview_dir = commentary_dirs['preview']
    recap_dir = commentary_dirs['recap']
    notebook_dir = commentary_dirs['notebook']

    # Get sorted lists of all the files we need to load. Sorting keeps the
    # three lists aligned so index i refers to the same game in each.
    preview_files = sorted(os.path.join(preview_dir, f)
                           for f in os.listdir(preview_dir)
                           if os.path.isfile(os.path.join(preview_dir, f)))
    recap_files = sorted(os.path.join(recap_dir, f)
                         for f in os.listdir(recap_dir)
                         if os.path.isfile(os.path.join(recap_dir, f)))
    notebook_files = sorted(os.path.join(notebook_dir, f)
                            for f in os.listdir(notebook_dir)
                            if os.path.isfile(os.path.join(notebook_dir, f)))

    # Determine the number of worker processes it will take to handle that
    # many files, adding one for any remainder chunk.
    total_threads = len(preview_files) // GAME_COMMENTARY_FILES_PER_THREAD
    total_threads += 1 if len(preview_files) % GAME_COMMENTARY_FILES_PER_THREAD else 0

    # Split each file list into chunks, then bundle the chunks so every
    # worker receives a (preview, recap, notebook) tuple of aligned files.
    fpt = GAME_COMMENTARY_FILES_PER_THREAD
    split_preview_files = [preview_files[fpt * index:fpt * (index + 1)]
                           for index in range(total_threads)]
    split_recap_files = [recap_files[fpt * index:fpt * (index + 1)]
                         for index in range(total_threads)]
    split_notebook_files = [notebook_files[fpt * index:fpt * (index + 1)]
                            for index in range(total_threads)]
    split_data_files = list(zip(split_preview_files, split_recap_files,
                                split_notebook_files))

    # Process all the files!
    thread_pool = multiprocessing.Pool(total_threads)
    results = thread_pool.map(load_commentary_files, split_data_files)
    thread_pool.close()
    thread_pool.join()

    # Flatten the per-worker result lists into a single list.
    results = list(itertools.chain.from_iterable(results))

    solr_url = SOLR_URL + GAME_COMMENTARY_CORE + 'update?commit=true'
    data = etl.prepareDocsForSolr(results, unmarshall=False, encoding='latin-1')
    etl.postJsonDocToSolr(solr_url, data)

    logger.info('Commentary ingestion complete')
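

# Illustrative sketch, not part of the ingestion path: why the three sorted
# file lists stay aligned after chunking, so each worker tuple covers the
# same games. File names here are made up.
def _example_aligned_commentary_chunks():
    '''Chunking three aligned lists identically preserves their alignment.'''
    previews = ['p_g1', 'p_g2', 'p_g3']
    recaps = ['r_g1', 'r_g2', 'r_g3']
    notebooks = ['n_g1', 'n_g2', 'n_g3']
    fpt = 2
    total = len(previews) // fpt + (1 if len(previews) % fpt else 0)

    def split(xs):
        return [xs[fpt * i:fpt * (i + 1)] for i in range(total)]

    bundles = list(zip(split(previews), split(recaps), split(notebooks)))
    # Worker 0 covers games 1-2 for all three kinds; worker 1 covers game 3.
    assert bundles[1] == (['p_g3'], ['r_g3'], ['n_g3'])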