Example #1
def load(date_string: str, dump_target: str):
    """
    Builds a case-sensitive and a case-insensitive loader for the given Wikipedia dump.
    Each loader has a writer for each table.
    :param date_string: dump date string, used in commit messages and branch names
    :param dump_target: Wikipedia dump to fetch the article count from
    :return:
    """
    loaders = []
    article_count = fetch_data(dump_target)

    # Get case-sensitive ngrams
    writers = get_writers(date_string, article_count)
    message = 'Update case-sensitive ngrams for dump date {}'.format(
        date_string)
    loaders.append(get_dolt_loader(writers, True, message))
    loaders.append(get_branch_creator('{}/case-sensitive'.format(date_string)))

    # Get case-insensitive ngrams
    l_message = 'Update case-insensitive ngrams for dump date {}'.format(
        date_string)
    l_writers = get_writers(date_string, article_count, lower='_lower')
    loaders.append(
        get_dolt_loader(l_writers, True, l_message,
                        '{}/case-insensitive'.format(date_string)))

    load_to_dolthub(loaders,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=REPO_PATH)
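The helpers used above (fetch_data, get_writers) are specific to this job and are not shown. As a minimal, hedged sketch of the same pattern, assuming the doltpy.etl helpers used throughout these examples and a made-up table name and builder, a writer only needs a zero-argument callable that returns a DataFrame:

from datetime import datetime

import pandas as pd
from doltpy.etl import get_df_table_writer, get_dolt_loader, load_to_dolthub  # assumed import path

def build_word_counts() -> pd.DataFrame:
    # Hypothetical builder: any zero-argument callable returning a DataFrame
    # can back a table writer.
    return pd.DataFrame([{'word': 'the', 'frequency': 42}])

def load_word_counts(remote_url: str):
    writer = get_df_table_writer('word_counts',
                                 build_word_counts,
                                 pk_cols=['word'],
                                 import_mode='replace')
    message = 'Update word counts for {}'.format(datetime.now())
    loaders = [get_dolt_loader([writer], True, message)]
    load_to_dolthub(loaders,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=remote_url)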
Example #2
def load(branch_date: str):
    loaders = []
    master_writer = get_df_table_writer('word_frequency',
                                        get_master_df_builder(),
                                        pk_cols=['word'],
                                        import_mode='replace')
    message = 'Update Wikipedia word frequencies for {} XML dump'.format(
        branch_date)
    loaders.append(get_dolt_loader([master_writer], True, message, 'master'))

    loaders.append(get_branch_creator(branch_date))

    for filter_name in FILTER_NAMES:
        filter_writer = get_df_table_writer('word_frequency',
                                            get_filter_df_builder(filter_name),
                                            pk_cols=['word'],
                                            import_mode='replace')
        branch_name = '{}/filter_{}'.format(branch_date, filter_name)
        filter_message = 'Update Wikipedia word frequencies with {} filter for {} XML dump'.format(
            filter_name, branch_date)
        loaders.append(
            get_dolt_loader([filter_writer], True, filter_message,
                            branch_name))

    load_to_dolthub(loaders,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=REPO_PATH)
Example #3
def write_results_to_dolt(results_file: str, remote: str, branch: str):
    table_writer = get_df_table_writer(RESULTS_TABLE,
                                       lambda: pd.read_csv(results_file),
                                       RESULTS_TABLE_PKS,
                                       import_mode='update')
    loader = get_dolt_loader(table_writer, True, 'benchmark run', branch)
    load_to_dolthub(loader,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=remote)
Example #4
def write_results_to_dolt(results_dir: str, remote: str, branch: str):
    dfs = [
        pd.read_csv(os.path.join(results_dir, filename))
        for filename in os.listdir(results_dir)
    ]
    table_writer = get_df_table_writer(RESULTS_TABLE,
                                       lambda: pd.concat(dfs),
                                       RESULTS_TABLE_PKS,
                                       import_mode='update')
    loader = get_dolt_loader(table_writer, True, 'benchmark run', branch)
    load_to_dolthub(loader,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=remote)
Example #5
def load():
    table_writers = []
    for dataset in DATASETS:
        transformers = [] if dataset.pk_cols else [insert_unique_key]
        pk_cols = ['hash_id'] if not dataset.pk_cols else dataset.pk_cols

        writer = get_df_table_writer(dataset.table_name,
                                     get_mta_data_as_df(
                                         get_mta_url(dataset.dataset_id)),
                                     pk_cols,
                                     transformers=transformers)

        table_writers.append(writer)

    loaders = [
        get_dolt_loader(table_writers, True,
                        'Update MTA data for date {}'.format(datetime.now()))
    ]
    load_to_dolthub(loaders,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=REPO_PATH)
Example #6
def load_dataset(repo_path: str, datasets: List[FiveThirtyEightDataset], message: str):
    table_writers = [get_df_table_writer(ds.name, ds.get_dataset_fetcher(), ds.primary_keys) for ds in datasets]
    loaders = [get_dolt_loader(table_writers, True, message)]
    load_to_dolthub(loaders, clone=True, push=True, remote_name='origin', remote_url=repo_path)
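The FiveThirtyEightDataset class is not shown here; load_dataset only relies on each dataset exposing a name, a primary-key list, and a get_dataset_fetcher() method that returns a zero-argument DataFrame builder. A hedged stand-in, with made-up field names and URL, might look like:

from dataclasses import dataclass
from typing import Callable, List

import pandas as pd

@dataclass
class DatasetSpec:
    # Hypothetical stand-in for FiveThirtyEightDataset, which is not shown above.
    name: str
    csv_url: str
    primary_keys: List[str]

    def get_dataset_fetcher(self) -> Callable[[], pd.DataFrame]:
        # Zero-argument builder, matching what get_df_table_writer expects.
        return lambda: pd.read_csv(self.csv_url)

# Illustrative usage only (repo path, dataset name, and URL are made up):
# load_dataset('org/five-thirty-eight',
#              [DatasetSpec('polls', 'https://example.com/polls.csv', ['poll_id'])],
#              'Refresh polls data')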
Example #7
def load_fx_rates_running_averages():
    table_writer = get_table_transformer(get_raw_fx_rates, 'eur_fx_rate_averages', ['currency'], get_average_rates)
    loader = get_dolt_loader(table_writer, True, 'Updated averages for date {}'.format(datetime.now()))
    load_to_dolthub(loader, clone=True, push=True, remote_url=FX_RATES_REPO)
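Neither get_raw_fx_rates nor get_average_rates is shown above. In this pattern the transformer is just a DataFrame-to-DataFrame function applied to whatever the source reader returns; a hedged sketch, assuming the raw rates frame carries a 'rate' column (a made-up name) alongside 'currency', could be:

import pandas as pd

def get_average_rates(raw_rates: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical transformer: collapses the raw rates to one row per currency,
    # matching the ['currency'] primary key of eur_fx_rate_averages.
    return (raw_rates
            .groupby('currency', as_index=False)['rate']
            .mean()
            .rename(columns={'rate': 'average_rate'}))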
Example #8
def load_raw_fx_rates():
    table_writer = get_df_table_writer('eur_fx_rates', get_raw_data, ['currency', 'timestamp'])
    message = 'Updated raw FX rates for date {}'.format(datetime.now())
    loader = get_dolt_loader(table_writer, commit=True, message=message)
    load_to_dolthub(loader, clone=True, push=True, remote_url=FX_RATES_REPO)
Example #9
def load(git_hash: str, github_actions_run_url: str):
    table_writers = [get_df_table_writer('eod_data', get_data, ['date', 'ticker'], 'update')]
    loaders = [get_dolt_loader(table_writers, True, get_commit_message(git_hash, github_actions_run_url))]
    load_to_dolthub(loaders, clone=True, push=True, remote_name='origin', remote_url=REMOTE_DB)