def load(date_string: str, dump_target: str):
    """
    Gets a case-sensitive and a case-insensitive loader for each Wikipedia dump;
    each loader has a writer for each table.
    :param date_string: date of the Wikipedia XML dump, used in branch names and commit messages
    :param dump_target: dump to fetch article data for
    :return: None; results are pushed to the DoltHub remote
    """
    loaders = []
    article_count = fetch_data(dump_target)

    # Get case-sensitive ngrams
    writers = get_writers(date_string, article_count)
    message = 'Update case-sensitive ngrams for dump date {}'.format(date_string)
    loaders.append(get_dolt_loader(writers, True, message))
    loaders.append(get_branch_creator('{}/case-sensitive'.format(date_string)))

    # Get case-insensitive ngrams
    l_message = 'Update case-insensitive ngrams for dump date {}'.format(date_string)
    l_writers = get_writers(date_string, article_count, lower='_lower')
    loaders.append(
        get_dolt_loader(l_writers, True, l_message,
                        '{}/case-insensitive'.format(date_string)))

    load_to_dolthub(loaders, clone=True, push=True,
                    remote_name='origin', remote_url=REPO_PATH)

def get_wikipedia_loaders(branch_date: str):
    loaders = []
    master_writer = get_df_table_writer('word_frequency',
                                        get_master_df_builder(),
                                        pk_cols=['word'],
                                        import_mode='replace')
    message = 'Update Wikipedia word frequencies for {} XML dump'.format(branch_date)
    loaders.append(get_dolt_loader([master_writer], True, message, 'master'))
    loaders.append(get_branch_creator(branch_date))

    for filter_name in FILTER_NAMES:
        filter_writer = get_df_table_writer('word_frequency',
                                            get_filter_df_builder(filter_name),
                                            pk_cols=['word'],
                                            import_mode='replace')
        branch_name = '{}/filter_{}'.format(branch_date, filter_name)
        filter_message = 'Update Wikipedia word frequencies with {} filter for {} XML dump'.format(
            filter_name, branch_date)
        loaders.append(
            get_dolt_loader([filter_writer], True, filter_message, branch_name))

    return loaders

def load(branch_date: str):
    loaders = []
    master_writer = get_df_table_writer('word_frequency',
                                        get_master_df_builder(),
                                        pk_cols=['word'],
                                        import_mode='replace')
    message = 'Update Wikipedia word frequencies for {} XML dump'.format(branch_date)
    loaders.append(get_dolt_loader([master_writer], True, message, 'master'))
    loaders.append(get_branch_creator(branch_date))

    for filter_name in FILTER_NAMES:
        filter_writer = get_df_table_writer('word_frequency',
                                            get_filter_df_builder(filter_name),
                                            pk_cols=['word'],
                                            import_mode='replace')
        branch_name = '{}/filter_{}'.format(branch_date, filter_name)
        filter_message = 'Update Wikipedia word frequencies with {} filter for {} XML dump'.format(
            filter_name, branch_date)
        loaders.append(
            get_dolt_loader([filter_writer], True, filter_message, branch_name))

    load_to_dolthub(loaders, clone=True, push=True,
                    remote_name='origin', remote_url=REPO_PATH)

def _populate_derived_data_helper(repo: Dolt, import_mode: str):
    table_transformers = [
        get_table_transfomer(get_raw_data, AVERAGE_MAJOR_COUNT, ['gender'],
                             averager, import_mode)
    ]
    get_dolt_loader(table_transformers, True,
                    'Updated {}'.format(AVERAGE_MAJOR_COUNT))(repo)
    return repo

def test_get_unique_key_update_writer(init_empty_test_repo):
    repo = init_empty_test_repo

    def generate_initial_data():
        return pd.DataFrame([{'name': 'Roger', 'id': 1},
                             {'name': 'Rafael', 'id': 2},
                             {'name': 'Rafael', 'id': 2},
                             {'name': 'Novak', 'id': 3}])

    test_table = 'test_data'
    get_dolt_loader([
        get_unique_key_table_writer(test_table,
                                    generate_initial_data,
                                    import_mode='create')
    ], True, 'Create test data')(repo)

    # Test that we have what we expect
    data = read_pandas(repo, test_table)
    assert all(
        data.loc[data['name'] == player, 'count'].astype(float).iloc[0] == 1
        for player in ['Roger', 'Novak'])
    assert data.loc[data['name'] == 'Rafael', 'count'].astype(float).iloc[0] == 2

    def generate_updated_data():
        return pd.DataFrame([{'name': 'Rafael', 'id': 2},
                             {'name': 'Novak', 'id': 3},
                             {'name': 'Andy', 'id': 4}])

    get_dolt_loader(
        [get_unique_key_table_writer(test_table, generate_updated_data)],
        True, 'Updating data')(repo)
    data = read_pandas(repo, test_table)
    assert all(
        data.loc[data['name'] == player, 'count'].astype(float).iloc[0] == 1
        for player in ['Rafael', 'Novak', 'Andy'])

def _populate_test_data_helper(repo: Dolt, mens: pd.DataFrame,
                               womens: pd.DataFrame, branch: str = 'master'):
    table_loaders = [
        get_df_table_writer(MENS_MAJOR_COUNT, lambda: mens, ['name']),
        get_df_table_writer(WOMENS_MAJOR_COUNT, lambda: womens, ['name'])
    ]
    get_dolt_loader(table_loaders,
                    True,
                    'Loaded {} and {}'.format(MENS_MAJOR_COUNT, WOMENS_MAJOR_COUNT),
                    branch=branch)(repo)
    return repo

def test_insert_unique_key(init_repo):
    repo = init_repo

    def generate_data():
        return pd.DataFrame({'id': [1, 1, 2], 'value': ['foo', 'foo', 'baz']})

    test_table = 'test_data'
    get_dolt_loader([
        get_df_table_writer(test_table, generate_data, ['hash_id'],
                            transformers=[insert_unique_key])
    ], True, 'Updating test data')(repo)
    result = repo.read_table(test_table)
    assert result.loc[result['id'] == 1, 'count'].iloc[0] == 2
    assert 'hash_id' in result.columns

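# The test above pins down the contract of insert_unique_key: it synthesizes
# a hash_id primary key from each row's contents and collapses duplicate rows
# into a count column. A minimal sketch of that behavior, inferred only from
# the assertions above, not from doltpy's actual implementation:
import hashlib

import pandas as pd


def insert_unique_key_sketch(df: pd.DataFrame) -> pd.DataFrame:
    # Hash each row's contents to synthesize a primary key
    with_hash = df.assign(hash_id=df.apply(
        lambda row: hashlib.md5(str(tuple(row)).encode()).hexdigest(), axis=1))
    # Count occurrences of each distinct row, then deduplicate
    counts = with_hash.groupby('hash_id').size().rename('count')
    return with_hash.drop_duplicates('hash_id').merge(
        counts, left_on='hash_id', right_index=True)


# On generate_data()'s frame this yields two rows, with count == 2 for the
# duplicated (1, 'foo') row, matching the assertions in the test.
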
def get_dolt_datasets():
    table_writers = []
    for ip_to_country_dataset in ip_to_country_datasets:
        writer = get_df_table_writer(ip_to_country_dataset.name,
                                     get_df_builder(ip_to_country_dataset),
                                     ip_to_country_dataset.pk_cols)
        table_writers.append(writer)

    return [
        get_dolt_loader(table_writers, True,
                        'Update IP to Country for date {}'.format(datetime.now()))
    ]

def get_loaders():
    writers = [
        get_df_table_writer(poll.name, poll.get_dataset_fetcher(), poll.primary_keys)
        for poll in DATASETS
    ]
    return [
        get_dolt_loader(writers, True,
                        'Updated poll data {}'.format(datetime.now()))
    ]

def get_transformed_table_loaders():
    transformed_table_loaders = [
        get_table_transfomer(get_raw_fx_rates, 'eur_fx_rate_averages',
                             ['currency'], get_average_rates)
    ]
    return [
        get_dolt_loader(transformed_table_loaders, True,
                        'Updated averages for date {}'.format(datetime.now()))
    ]

def get_game_table_loaders(date_from: datetime, date_to: datetime):
    games_loaders = [
        get_df_table_writer('games',
                            get_games_df_builder(date_from, date_to),
                            ['GAME_ID', 'TEAM_ID'])
    ]
    return [
        get_dolt_loader(games_loaders, True,
                        'Append games between {} and {}'.format(date_from, date_to))
    ]

def get_raw_table_loaders():
    raw_table_loaders = [
        get_df_table_writer('eur_fx_rates', get_data, ['currency', 'timestamp'])
    ]
    return [
        get_dolt_loader(raw_table_loaders, True,
                        'Updated raw FX rates for date {}'.format(datetime.now()))
    ]

def write_results_to_dolt(results_file: str, remote: str, branch: str):
    table_writer = get_df_table_writer(RESULTS_TABLE,
                                       lambda: pd.read_csv(results_file),
                                       RESULTS_TABLE_PKS,
                                       import_mode='update')
    loader = get_dolt_loader(table_writer, True, 'benchmark run', branch)
    load_to_dolthub(loader, clone=True, push=True,
                    remote_name='origin', remote_url=remote)

def get_loaders():
    writers = [
        get_df_table_writer(elo_dataset.name,
                            elo_dataset.get_dataset_fetcher(),
                            elo_dataset.primary_keys)
        for elo_dataset in ELO_DATASETS
    ]
    return [
        get_dolt_loader(writers, True,
                        'Updated NBA ELO data for {}'.format(datetime.now()))
    ]

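# Unlike the load(...) functions in this section, get_loaders()-style
# functions only build loader callables; a separate driver has to execute
# them. A sketch of such a driver, assuming doltpy's ETL helpers and a
# placeholder remote (the real jobs define constants like REPO_PATH):
from doltpy.etl import load_to_dolthub

REPO_PATH = 'some-org/nba-elo'  # placeholder DoltHub remote

if __name__ == '__main__':
    load_to_dolthub(get_loaders(), clone=True, push=True,
                    remote_name='origin', remote_url=REPO_PATH)
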
def get_loaders(start_year: int, end_year: int):
    writers = [
        get_df_table_writer('public_holidays',
                            _get_holidays_for_year(year, _get_codename_lookup()),
                            PK_COLS)
        # note: range() stops before end_year itself
        for year in range(start_year, end_year)
    ]
    return [
        get_dolt_loader(writers, True,
                        'Update public holidays for years {} to {}'.format(
                            start_year, end_year))
    ]

def get_play_by_play_table_loaders(game_date: datetime):
    play_by_play_loaders = [
        get_table_transformer(
            get_games_for_date_builder(game_date),
            'play_by_play',
            [],  # TODO: figure out primary key for play-by-play
            games_to_play_by_play)
    ]
    return [
        get_dolt_loader(play_by_play_loaders, True,
                        'Updated play_by_play for game date {}'.format(
                            game_date.strftime('%Y-%m-%d')))
    ]

def write_results_to_dolt(results_dir: str, remote: str, branch: str):
    dfs = [
        pd.read_csv(os.path.join(results_dir, filename))
        for filename in os.listdir(results_dir)
    ]
    table_writer = get_df_table_writer(RESULTS_TABLE,
                                       lambda: pd.concat(dfs),
                                       RESULTS_TABLE_PKS,
                                       import_mode='update')
    loader = get_dolt_loader(table_writer, True, 'benchmark run', branch)
    load_to_dolthub(loader, clone=True, push=True,
                    remote_name='origin', remote_url=remote)

def get_loaders():
    table_writers = []
    for dataset in DATASETS:
        # Datasets without natural primary keys get a synthetic hash_id key
        transformers = [] if dataset.pk_cols else [insert_unique_key]
        pk_cols = dataset.pk_cols or ['hash_id']
        writer = get_df_table_writer(dataset.table_name,
                                     get_mta_data_as_df(get_mta_url(dataset.dataset_id)),
                                     pk_cols,
                                     transformers=transformers)
        table_writers.append(writer)

    return [
        get_dolt_loader(table_writers, True,
                        'Update MTA data for date {}'.format(datetime.now()))
    ]

def load():
    table_writers = []
    for dataset in DATASETS:
        # Datasets without natural primary keys get a synthetic hash_id key
        transformers = [] if dataset.pk_cols else [insert_unique_key]
        pk_cols = dataset.pk_cols or ['hash_id']
        writer = get_df_table_writer(dataset.table_name,
                                     get_mta_data_as_df(get_mta_url(dataset.dataset_id)),
                                     pk_cols,
                                     transformers=transformers)
        table_writers.append(writer)

    loaders = [
        get_dolt_loader(table_writers, True,
                        'Update MTA data for date {}'.format(datetime.now()))
    ]
    load_to_dolthub(loaders, clone=True, push=True,
                    remote_name='origin', remote_url=REPO_PATH)

def load_raw_fx_rates():
    table_writer = get_df_table_writer('eur_fx_rates', get_raw_data,
                                       ['currency', 'timestamp'])
    message = 'Updated raw FX rates for date {}'.format(datetime.now())
    loader = get_dolt_loader(table_writer, commit=True, message=message)
    load_to_dolthub(loader, clone=True, push=True, remote_url=FX_RATES_REPO)

def load_dataset(repo_path: str, datasets: List[FiveThirtyEightDataset], message: str):
    table_writers = [
        get_df_table_writer(ds.name, ds.get_dataset_fetcher(), ds.primary_keys)
        for ds in datasets
    ]
    loaders = [get_dolt_loader(table_writers, True, message)]
    load_to_dolthub(loaders, clone=True, push=True,
                    remote_name='origin', remote_url=repo_path)

def load_fx_rates_running_averages():
    table_writer = get_table_transformer(get_raw_fx_rates, 'eur_fx_rate_averages',
                                         ['currency'], get_average_rates)
    loader = get_dolt_loader(table_writer, True,
                             'Updated averages for date {}'.format(datetime.now()))
    load_to_dolthub(loader, clone=True, push=True, remote_url=FX_RATES_REPO)

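# get_table_transformer (spelled get_table_transfomer in some of the older
# snippets above) takes a reader for an existing table, a target table name,
# the target's primary keys, and a function mapping the source DataFrame to
# the target DataFrame. A hedged sketch of what a transformer like
# get_average_rates might look like; the 'rate' column name is an assumption:
import pandas as pd


def get_average_rates(raw_rates: pd.DataFrame) -> pd.DataFrame:
    # Collapse the raw rate time series to one average per currency;
    # 'currency' matches the pk_cols passed above, 'rate' is assumed
    return (raw_rates
            .groupby('currency', as_index=False)['rate']
            .mean()
            .rename(columns={'rate': 'average_rate'}))
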
def get_loaders():
    writer = get_df_table_writer('great_players',
                                 get_data_builder(),
                                 pk_cols=['name'],
                                 import_mode='create')
    return [get_dolt_loader([writer], True, 'Added some great players!')]

def load(git_hash: str, github_actions_run_url: str):
    table_writers = [
        get_df_table_writer('eod_data', get_data, ['date', 'ticker'],
                            import_mode='update')
    ]
    loaders = [
        get_dolt_loader(table_writers, True,
                        get_commit_message(git_hash, github_actions_run_url))
    ]
    load_to_dolthub(loaders, clone=True, push=True,
                    remote_name='origin', remote_url=REMOTE_DB)

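# None of the snippets above show their imports. A self-contained, minimal
# version of the pattern they all share, assuming these helpers come from
# doltpy's ETL module; the table name, primary key, builder, and remote are
# all illustrative placeholders:
from datetime import datetime

import pandas as pd
from doltpy.etl import get_df_table_writer, get_dolt_loader, load_to_dolthub


def build_df() -> pd.DataFrame:
    # Stand-in for the dataset fetchers and builders used above
    return pd.DataFrame([{'id': 1, 'value': 'example'}])


def load():
    writer = get_df_table_writer('example_table', build_df, pk_cols=['id'])
    loader = get_dolt_loader([writer], True,
                             'Example load {}'.format(datetime.now()))
    load_to_dolthub([loader], clone=True, push=True,
                    remote_name='origin', remote_url='some-org/example-db')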