Ejemplo n.º 1
0
def load(date_string: str, dump_target: str):
    """
    Build case-sensitive and case-insensitive ngram loaders for a Wikipedia
    dump and push everything to DoltHub.

    :param date_string: date identifier of the dump being processed
    :param dump_target: dump to fetch the article count from
    :return: None
    """
    article_count = fetch_data(dump_target)
    loaders = []

    # Case-sensitive ngrams: commit on the current branch, then snapshot it
    # under a dedicated '<date>/case-sensitive' branch.
    sensitive_writers = get_writers(date_string, article_count)
    sensitive_message = 'Update case-sensitive ngrams for dump date {}'.format(
        date_string)
    loaders.append(get_dolt_loader(sensitive_writers, True, sensitive_message))
    loaders.append(get_branch_creator('{}/case-sensitive'.format(date_string)))

    # Case-insensitive ngrams: committed straight to their own branch.
    insensitive_message = 'Update case-insensitive ngrams for dump date {}'.format(
        date_string)
    insensitive_writers = get_writers(date_string, article_count, lower='_lower')
    loaders.append(
        get_dolt_loader(insensitive_writers, True, insensitive_message,
                        '{}/case-insensitive'.format(date_string)))

    load_to_dolthub(loaders,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=REPO_PATH)
Ejemplo n.º 2
0
def get_wikipedia_loaders(branch_date: str):
    """
    Build Dolt loaders for Wikipedia word frequencies: one commit to master,
    a branch for the dump date, and one commit per configured filter branch.

    :param branch_date: date identifier of the XML dump being processed
    :return: list of loader callables to be executed against a Dolt repo
    """
    loaders = []
    master_writer = get_df_table_writer('word_frequency',
                                        get_master_df_builder(),
                                        pk_cols=['word'],
                                        import_mode='replace')
    message = 'Update Wikipedia word frequencies for {} XML dump'.format(
        branch_date)
    loaders.append(get_dolt_loader([master_writer], True, message, 'master'))

    loaders.append(get_branch_creator(branch_date))

    for filter_name in FILTER_NAMES:
        filter_writer = get_df_table_writer('word_frequency',
                                            get_filter_df_builder(filter_name),
                                            pk_cols=['word'],
                                            import_mode='replace')
        branch_name = '{}/filter_{}'.format(branch_date, filter_name)
        # BUG FIX: the format args were passed as (branch_date, filter_name),
        # producing commit messages like 'with 2020-01-01 filter for lower XML
        # dump'. The template expects the filter name first, then the date.
        filter_message = 'Update Wikipedia word frequencies with {} filter for {} XML dump'.format(
            filter_name, branch_date)
        loaders.append(
            get_dolt_loader([filter_writer], True, filter_message,
                            branch_name))

    return loaders
Ejemplo n.º 3
0
def load(branch_date: str):
    """
    Load Wikipedia word frequencies into DoltHub: commit to master, create a
    branch for the dump date, commit one filtered variant per filter branch,
    then push everything to the remote.

    :param branch_date: date identifier of the XML dump being processed
    :return: None
    """
    loaders = []
    master_writer = get_df_table_writer('word_frequency',
                                        get_master_df_builder(),
                                        pk_cols=['word'],
                                        import_mode='replace')
    message = 'Update Wikipedia word frequencies for {} XML dump'.format(
        branch_date)
    loaders.append(get_dolt_loader([master_writer], True, message, 'master'))

    loaders.append(get_branch_creator(branch_date))

    for filter_name in FILTER_NAMES:
        filter_writer = get_df_table_writer('word_frequency',
                                            get_filter_df_builder(filter_name),
                                            pk_cols=['word'],
                                            import_mode='replace')
        branch_name = '{}/filter_{}'.format(branch_date, filter_name)
        # BUG FIX: the format args were passed as (branch_date, filter_name),
        # swapped relative to the 'with {} filter for {} XML dump' template.
        filter_message = 'Update Wikipedia word frequencies with {} filter for {} XML dump'.format(
            filter_name, branch_date)
        loaders.append(
            get_dolt_loader([filter_writer], True, filter_message,
                            branch_name))

    load_to_dolthub(loaders,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=REPO_PATH)
Ejemplo n.º 4
0
def _populate_derived_data_helper(repo: Dolt, import_mode: str):
    """
    Run the average-major-count table transformer against *repo*, committing
    the result, and hand the repo back for chaining.

    :param repo: Dolt repository to populate
    :param import_mode: import mode forwarded to the table transformer
    :return: the same repo instance
    """
    transformers = [
        get_table_transfomer(get_raw_data, AVERAGE_MAJOR_COUNT, ['gender'],
                             averager, import_mode)
    ]
    loader = get_dolt_loader(transformers, True,
                             'Updated {}'.format(AVERAGE_MAJOR_COUNT))
    loader(repo)
    return repo
Ejemplo n.º 5
0
def test_get_unique_key_update_writer(init_empty_test_repo):
    """
    Verify the unique-key writer: duplicate rows are counted on create, and a
    later load replaces the table contents with the new row set.
    """
    repo = init_empty_test_repo

    def generate_initial_data():
        return pd.DataFrame([{
            'name': 'Roger',
            'id': 1
        }, {
            'name': 'Rafael',
            'id': 2
        }, {
            'name': 'Rafael',
            'id': 2
        }, {
            'name': 'Novak',
            'id': 3
        }])

    test_table = 'test_data'
    get_dolt_loader([
        get_unique_key_table_writer(
            test_table, generate_initial_data, import_mode='create')
    ], True, 'Create test data')(repo)

    # Test that we have what we expect
    data = read_pandas(repo, test_table)
    # BUG FIX: `assert [<comprehension>]` asserts a non-empty list, which is
    # always truthy, so the per-player checks were never evaluated. all()
    # makes each comparison actually count.
    assert all(
        data.loc[data['name'] == player, 'count'].astype(float).iloc[0] == 1
        for player in ['Roger', 'Novak'])
    assert data.loc[data['name'] == 'Rafael',
                    'count'].astype(float).iloc[0] == 2

    def generate_updated_data():
        return pd.DataFrame([{
            'name': 'Rafael',
            'id': 2
        }, {
            'name': 'Novak',
            'id': 3
        }, {
            'name': 'Andy',
            'id': 4
        }])

    get_dolt_loader(
        [get_unique_key_table_writer(test_table, generate_updated_data)], True,
        'Updating data')(repo)
    data = read_pandas(repo, test_table)
    # BUG FIX: same always-true list-literal assertion as above.
    assert all(
        data.loc[data['name'] == player, 'count'].astype(float).iloc[0] == 1
        for player in ['Rafael', 'Novak', 'Andy'])
Ejemplo n.º 6
0
def _populate_test_data_helper(repo: Dolt,
                               mens: pd.DataFrame,
                               womens: pd.DataFrame,
                               branch: str = 'master'):
    """
    Load the men's and women's major-count tables into *repo* on the given
    branch and return the repo for chaining.

    :param repo: Dolt repository to populate
    :param mens: men's major-count data
    :param womens: women's major-count data
    :param branch: branch to commit to (defaults to 'master')
    :return: the same repo instance
    """
    writers = [
        get_df_table_writer(MENS_MAJOR_COUNT, lambda: mens, ['name']),
        get_df_table_writer(WOMENS_MAJOR_COUNT, lambda: womens, ['name']),
    ]
    commit_message = 'Loaded {} and {}'.format(MENS_MAJOR_COUNT,
                                               WOMENS_MAJOR_COUNT)
    loader = get_dolt_loader(writers, True, commit_message, branch=branch)
    loader(repo)
    return repo
Ejemplo n.º 7
0
def test_insert_unique_key(init_repo):
    """
    Verify that the insert_unique_key transformer collapses duplicate rows,
    records their multiplicity in 'count', and adds a 'hash_id' column.
    """
    repo = init_repo

    def generate_data():
        return pd.DataFrame({'id': [1, 1, 2], 'value': ['foo', 'foo', 'baz']})

    test_table = 'test_data'
    writer = get_df_table_writer(test_table,
                                 generate_data, ['hash_id'],
                                 transformers=[insert_unique_key])
    get_dolt_loader([writer], True, 'Updating test data')(repo)
    result = repo.read_table(test_table)
    # The two identical id==1 rows must collapse to one row with count == 2.
    assert result.loc[result['id'] == 1,
                      'count'].iloc[0] == 2 and 'hash_id' in result.columns
Ejemplo n.º 8
0
def get_dolt_datasets():
    """
    Build one Dolt loader that writes every IP-to-country dataset table in a
    single timestamped commit.

    :return: single-element list containing the loader
    """
    writers = [
        get_df_table_writer(dataset.name, get_df_builder(dataset),
                            dataset.pk_cols)
        for dataset in ip_to_country_datasets
    ]
    return [get_dolt_loader(writers, True, 'Update IP to Country for date {}'.format(datetime.now()))]
Ejemplo n.º 9
0
def get_loaders():
    """
    Build one Dolt loader refreshing every poll dataset table in a single
    timestamped commit.

    :return: single-element list containing the loader
    """
    writers = []
    for poll in DATASETS:
        writers.append(
            get_df_table_writer(poll.name, poll.get_dataset_fetcher(),
                                poll.primary_keys))
    commit_message = 'Updated poll data {}'.format(datetime.now())
    return [get_dolt_loader(writers, True, commit_message)]
Ejemplo n.º 10
0
def get_transformed_table_loaders():
    """
    Build one Dolt loader that derives per-currency average EUR FX rates from
    the raw rates table and commits them with a timestamped message.

    :return: single-element list containing the loader
    """
    averages_transformer = get_table_transfomer(get_raw_fx_rates,
                                                'eur_fx_rate_averages',
                                                ['currency'],
                                                get_average_rates)
    commit_message = 'Updated averages for date {}'.format(datetime.now())
    return [get_dolt_loader([averages_transformer], True, commit_message)]
Ejemplo n.º 11
0
def get_game_table_loaders(date_from: datetime, date_to: datetime):
    """
    Build one Dolt loader appending games played between the two dates to the
    'games' table.

    :param date_from: start of the date window
    :param date_to: end of the date window
    :return: single-element list containing the loader
    """
    games_writer = get_df_table_writer('games',
                                       get_games_df_builder(date_from, date_to),
                                       ['GAME_ID', 'TEAM_ID'])
    commit_message = 'Append games between {} and {}'.format(date_from, date_to)
    return [get_dolt_loader([games_writer], True, commit_message)]
Ejemplo n.º 12
0
def get_raw_table_loaders():
    """
    Build one Dolt loader refreshing the raw EUR FX rates table with a
    timestamped commit message.

    :return: single-element list containing the loader
    """
    rates_writer = get_df_table_writer('eur_fx_rates', get_data,
                                       ['currency', 'timestamp'])
    commit_message = 'Updated raw FX rates for date {}'.format(datetime.now())
    return [get_dolt_loader([rates_writer], True, commit_message)]
def write_results_to_dolt(results_file: str, remote: str, branch: str):
    """
    Push benchmark results from a CSV file to the results table on the given
    branch of a DoltHub remote.

    :param results_file: path to the CSV file of results
    :param remote: remote URL to clone from and push to
    :param branch: branch to commit the results on
    :return: None
    """
    # NOTE(review): other call sites wrap the writer and loader in lists
    # before passing them on — confirm get_dolt_loader / load_to_dolthub
    # accept a bare writer/loader as done here.
    table_writer = get_df_table_writer(RESULTS_TABLE,
                                       lambda: pd.read_csv(results_file),
                                       RESULTS_TABLE_PKS,
                                       import_mode='update')
    loader = get_dolt_loader(table_writer, True, 'benchmark run', branch)
    load_to_dolthub(loader,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=remote)
Ejemplo n.º 14
0
def get_loaders():
    """
    Build one Dolt loader refreshing every NBA ELO dataset table in a single
    timestamped commit.

    :return: single-element list containing the loader
    """
    writers = []
    for elo_dataset in ELO_DATASETS:
        writers.append(
            get_df_table_writer(elo_dataset.name,
                                elo_dataset.get_dataset_fetcher(),
                                elo_dataset.primary_keys))
    commit_message = 'Updated NBA ELO data for {}'.format(datetime.now())
    return [get_dolt_loader(writers, True, commit_message)]
Ejemplo n.º 15
0
def get_loaders(start_year: int, end_year: int):
    """
    Build one Dolt loader writing public holidays for each year in the
    requested span.

    :param start_year: first year to load (inclusive)
    :param end_year: end of the year range
    :return: single-element list containing the loader
    """
    lookup = _get_codename_lookup()
    # NOTE(review): range(start_year, end_year) excludes end_year, but the
    # commit message reads 'years {start} to {end}' — confirm whether the
    # final year is meant to be loaded.
    writers = []
    for year in range(start_year, end_year):
        writers.append(
            get_df_table_writer('public_holidays',
                                _get_holidays_for_year(year, lookup), PK_COLS))

    return [
        get_dolt_loader(
            writers, True, 'Update public holidays for years {} to {}'.format(
                start_year, end_year))
    ]
Ejemplo n.º 16
0
def get_play_by_play_table_loaders(game_date: datetime):
    """
    Build one Dolt loader updating the play_by_play table for the games played
    on the given date.

    :param game_date: date whose games should be transformed
    :return: single-element list containing the loader
    """
    play_by_play_transformer = get_table_transformer(
        get_games_for_date_builder(game_date),
        'play_by_play',
        # figure out primary key for play-by-play
        [],
        games_to_play_by_play)
    commit_message = 'Updated play_by_play for game date {}'.format(
        game_date.strftime('%Y-%m-%d'))
    return [get_dolt_loader([play_by_play_transformer], True, commit_message)]
Ejemplo n.º 17
0
def write_results_to_dolt(results_dir: str, remote: str, branch: str):
    """
    Concatenate every CSV in *results_dir* and push the combined benchmark
    results to the results table on the given branch of a DoltHub remote.

    :param results_dir: directory containing result CSV files
    :param remote: remote URL to clone from and push to
    :param branch: branch to commit the results on
    :return: None
    """
    frames = []
    for filename in os.listdir(results_dir):
        frames.append(pd.read_csv(os.path.join(results_dir, filename)))
    table_writer = get_df_table_writer(RESULTS_TABLE,
                                       lambda: pd.concat(frames),
                                       RESULTS_TABLE_PKS,
                                       import_mode='update')
    loader = get_dolt_loader(table_writer, True, 'benchmark run', branch)
    load_to_dolthub(loader,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=remote)
Ejemplo n.º 18
0
def get_loaders():
    """
    Build one Dolt loader refreshing every MTA dataset table in a single
    timestamped commit. Datasets without natural primary keys get a synthetic
    'hash_id' key via the insert_unique_key transformer.

    :return: single-element list containing the loader
    """
    writers = []
    for dataset in DATASETS:
        if dataset.pk_cols:
            transformers, pk_cols = [], dataset.pk_cols
        else:
            # No natural key: derive a hash-based one from the row contents.
            transformers, pk_cols = [insert_unique_key], ['hash_id']

        writers.append(
            get_df_table_writer(dataset.table_name,
                                get_mta_data_as_df(
                                    get_mta_url(dataset.dataset_id)),
                                pk_cols,
                                transformers=transformers))

    return [
        get_dolt_loader(writers, True,
                        'Update MTA data for date {}'.format(datetime.now()))
    ]
Ejemplo n.º 19
0
def load():
    """
    Refresh every MTA dataset table in a single timestamped commit and push
    the result to DoltHub. Datasets without natural primary keys get a
    synthetic 'hash_id' key via the insert_unique_key transformer.

    :return: None
    """
    writers = []
    for dataset in DATASETS:
        if dataset.pk_cols:
            transformers, pk_cols = [], dataset.pk_cols
        else:
            # No natural key: derive a hash-based one from the row contents.
            transformers, pk_cols = [insert_unique_key], ['hash_id']

        writers.append(
            get_df_table_writer(dataset.table_name,
                                get_mta_data_as_df(
                                    get_mta_url(dataset.dataset_id)),
                                pk_cols,
                                transformers=transformers))

    loaders = [
        get_dolt_loader(writers, True,
                        'Update MTA data for date {}'.format(datetime.now()))
    ]
    load_to_dolthub(loaders,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=REPO_PATH)
Ejemplo n.º 20
0
def load_raw_fx_rates():
    """
    Refresh the raw EUR FX rates table with a timestamped commit and push it
    to the FX rates DoltHub repo.

    :return: None
    """
    table_writer = get_df_table_writer('eur_fx_rates',
                                       get_raw_data,
                                       ['currency', 'timestamp'])
    loader = get_dolt_loader(
        table_writer,
        commit=True,
        message='Updated raw FX rates for date {}'.format(datetime.now()))
    load_to_dolthub(loader, clone=True, push=True, remote_url=FX_RATES_REPO)
Ejemplo n.º 21
0
def load_dataset(repo_path: str, datasets: List[FiveThirtyEightDataset], message: str):
    """
    Load the given FiveThirtyEight datasets into a DoltHub repo in a single
    commit and push.

    :param repo_path: remote URL of the DoltHub repo
    :param datasets: datasets to write, one table each
    :param message: commit message for the load
    :return: None
    """
    table_writers = []
    for ds in datasets:
        table_writers.append(
            get_df_table_writer(ds.name, ds.get_dataset_fetcher(),
                                ds.primary_keys))
    loaders = [get_dolt_loader(table_writers, True, message)]
    load_to_dolthub(loaders, clone=True, push=True, remote_name='origin', remote_url=repo_path)
Ejemplo n.º 22
0
def load_fx_rates_running_averages():
    """
    Derive per-currency average EUR FX rates from the raw rates table, commit
    with a timestamped message, and push to the FX rates DoltHub repo.

    :return: None
    """
    averages_transformer = get_table_transformer(get_raw_fx_rates,
                                                 'eur_fx_rate_averages',
                                                 ['currency'],
                                                 get_average_rates)
    commit_message = 'Updated averages for date {}'.format(datetime.now())
    loader = get_dolt_loader(averages_transformer, True, commit_message)
    load_to_dolthub(loader, clone=True, push=True, remote_url=FX_RATES_REPO)
Ejemplo n.º 23
0
def get_loaders():
    """
    Build one Dolt loader that creates the 'great_players' table keyed by
    player name.

    :return: single-element list containing the loader
    """
    players_writer = get_df_table_writer('great_players',
                                         get_data_builder(),
                                         pk_cols=['name'],
                                         import_mode='create')
    return [get_dolt_loader([players_writer], True, 'Added some great players!')]
Ejemplo n.º 24
0
def load(git_hash: str, github_actions_run_url: str):
    """
    Update the end-of-day data table and push it to the remote DoltHub repo,
    recording the triggering commit and CI run in the commit message.

    :param git_hash: git commit hash that triggered this load
    :param github_actions_run_url: URL of the GitHub Actions run
    :return: None
    """
    eod_writer = get_df_table_writer('eod_data', get_data,
                                     ['date', 'ticker'], 'update')
    commit_message = get_commit_message(git_hash, github_actions_run_url)
    loaders = [get_dolt_loader([eod_writer], True, commit_message)]
    load_to_dolthub(loaders, clone=True, push=True, remote_name='origin', remote_url=REMOTE_DB)