Example #1
def get_wikipedia_loaders(branch_date: str):
    loaders = []
    master_writer = get_df_table_writer('word_frequency',
                                        get_master_df_builder(),
                                        pk_cols=['word'],
                                        import_mode='replace')
    message = 'Update Wikipedia word frequencies for {} XML dump'.format(
        branch_date)
    loaders.append(get_dolt_loader([master_writer], True, message, 'master'))

    loaders.append(get_branch_creator(branch_date))

    for filter_name in FILTER_NAMES:
        filter_writer = get_df_table_writer('word_frequency',
                                            get_filter_df_builder(filter_name),
                                            pk_cols=['word'],
                                            import_mode='replace')
        branch_name = '{}/filter_{}'.format(branch_date, filter_name)
        filter_message = 'Update Wikipedia word frequencies with {} filter for {} XML dump'.format(
            filter_name, branch_date)
        loaders.append(
            get_dolt_loader([filter_writer], True, filter_message,
                            branch_name))

    return loaders
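
Each object returned by get_dolt_loader is itself a callable applied to a Dolt repo (Example #3 below shows this pattern inline). A minimal sketch of running the loaders above against a local database, assuming the doltpy 1.x layout where Dolt lives in doltpy.core, with a hypothetical repo path and dump date:

from doltpy.core import Dolt

# Apply each loader in order; each one commits its table writes,
# and the branch creator sets up the per-filter branches.
repo = Dolt('path/to/word-frequency-repo')
for loader in get_wikipedia_loaders('20210101'):
    loader(repo)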
Example #2
def load(branch_date: str):
    loaders = []
    master_writer = get_df_table_writer('word_frequency',
                                        get_master_df_builder(),
                                        pk_cols=['word'],
                                        import_mode='replace')
    message = 'Update Wikipedia word frequencies for {} XML dump'.format(
        branch_date)
    loaders.append(get_dolt_loader([master_writer], True, message, 'master'))

    loaders.append(get_branch_creator(branch_date))

    for filter_name in FILTER_NAMES:
        filter_writer = get_df_table_writer('word_frequency',
                                            get_filter_df_builder(filter_name),
                                            pk_cols=['word'],
                                            import_mode='replace')
        branch_name = '{}/filter_{}'.format(branch_date, filter_name)
        filter_message = 'Update Wikipedia word frequencies with {} filter for {} XML dump'.format(
            filter_name, branch_date)
        loaders.append(
            get_dolt_loader([filter_writer], True, filter_message,
                            branch_name))

    load_to_dolthub(loaders,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=REPO_PATH)
Example #3
def _populate_test_data_helper(repo: Dolt,
                               mens: pd.DataFrame,
                               womens: pd.DataFrame,
                               branch: str = 'master'):
    table_loaders = [
        get_df_table_writer(MENS_MAJOR_COUNT, lambda: mens, ['name']),
        get_df_table_writer(WOMENS_MAJOR_COUNT, lambda: womens, ['name'])
    ]
    get_dolt_loader(table_loaders,
                    True,
                    'Loaded {} and {}'.format(MENS_MAJOR_COUNT,
                                              WOMENS_MAJOR_COUNT),
                    branch=branch)(repo)
    return repo
Example #4
    def writeData(self, dataFrame: pd.DataFrame, requiredKeys: list):
        # Prepare Data Writer
        raw_data_writer = get_df_table_writer(config.ARCHIVE_TWEETS_TABLE,
                                              lambda: dataFrame, requiredKeys)

        # Write Data To Repo
        raw_data_writer(self.repo)
Example #5
def writeData(repo: Dolt, table: str, dataFrame: pd.DataFrame,
              requiredKeys: list):
    # Prepare Data Writer
    raw_data_writer = get_df_table_writer(table, lambda: dataFrame,
                                          requiredKeys)

    # Write Data To Repo
    raw_data_writer(repo)
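
A quick usage sketch for the function above; the repo path, table name, and key column here are hypothetical:

import pandas as pd
from doltpy.core import Dolt  # doltpy 1.x layout

repo = Dolt('path/to/repo')
df = pd.DataFrame({'id': [1, 2], 'value': ['a', 'b']})
# requiredKeys become the table's primary key, so 'id' must be unique
writeData(repo, 'example_table', df, ['id'])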
Example #6
def get_loaders():
    loaders = [
        get_df_table_writer(poll.name, poll.get_dataset_fetcher(),
                            poll.primary_keys) for poll in DATASETS
    ]
    return [
        get_dolt_loader(loaders, True,
                        'Updated poll data {}'.format(datetime.now()))
    ]
Example #7
def get_dolt_datasets():
    table_writers = []
    for ip_to_country_dataset in ip_to_country_datasets:
        writer = get_df_table_writer(ip_to_country_dataset.name,
                                     get_df_builder(ip_to_country_dataset),
                                     ip_to_country_dataset.pk_cols)
        table_writers.append(writer)

    return [get_dolt_loader(table_writers, True, 'Update IP to Country for date {}'.format(datetime.now()))]
Example #8
def get_raw_table_loaders():
    raw_table_loaders = [
        get_df_table_writer('eur_fx_rates', get_data,
                            ['currency', 'timestamp'])
    ]
    return [
        get_dolt_loader(
            raw_table_loaders, True,
            'Updated raw FX rates for date {}'.format(datetime.now()))
    ]
Example #9
def get_game_table_loaders(date_from: datetime, date_to: datetime):
    games_loaders = [
        get_df_table_writer('games', get_games_df_builder(date_from, date_to),
                            ['GAME_ID', 'TEAM_ID'])
    ]
    return [
        get_dolt_loader(
            games_loaders, True,
            'Append games between {} and {}'.format(date_from, date_to))
    ]
Example #10
def write_results_to_dolt(results_file: str, remote: str, branch: str):
    table_writer = get_df_table_writer(RESULTS_TABLE,
                                       lambda: pd.read_csv(results_file),
                                       RESULTS_TABLE_PKS,
                                       import_mode='update')
    loader = get_dolt_loader(table_writer, True, 'benchmark run', branch)
    load_to_dolthub(loader,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=remote)
Example #11
def get_loaders():
    loaders = [
        get_df_table_writer(elo_dataset.name,
                            elo_dataset.get_dataset_fetcher(),
                            elo_dataset.primary_keys)
        for elo_dataset in ELO_DATASETS
    ]
    return [
        get_dolt_loader(loaders, True,
                        'Updated NBA ELO data for {}'.format(datetime.now()))
    ]
Example #12
def get_loaders(start_year: int, end_year: int):
    writers = [
        get_df_table_writer(
            'public_holidays',
            _get_holidays_for_year(year, _get_codename_lookup()), PK_COLS)
        for year in range(start_year, end_year)
    ]

    return [
        get_dolt_loader(
            writers, True, 'Update public holidays for years {} to {}'.format(
                start_year, end_year))
    ]
Example #13
def get_writers(date_string: str, article_count: int, lower=''):
    writers = []

    for ngram_name in NGRAM_DICTS.keys():
        logging.info('Starting merge for {}s'.format(ngram_name))
        merge_csvs(ngram_name, lower)
        logging.info('Successfully merged all {} {} csvs'.format(
            lower, ngram_name))
        table_name = ngram_name + '_counts'
        writers.append(
            get_df_table_writer(table_name,
                                get_ngram_df_builder(ngram_name, lower),
                                [ngram_name],
                                import_mode='replace'))
    if len(lower) == 0:
        writers.append(
            get_df_table_writer('total_counts',
                                get_counts_df_builder(date_string,
                                                      article_count),
                                ['dump_date'],
                                import_mode='update'))

    return writers
Example #14
def test_insert_unique_key(init_repo):
    repo = init_repo

    def generate_data():
        return pd.DataFrame({'id': [1, 1, 2], 'value': ['foo', 'foo', 'baz']})

    test_table = 'test_data'
    get_dolt_loader([
        get_df_table_writer(test_table,
                            generate_data, ['hash_id'],
                            transformers=[insert_unique_key])
    ], True, 'Updating test data')(repo)
    result = repo.read_table(test_table)
    assert result.loc[result['id'] == 1,
                      'count'].iloc[0] == 2 and 'hash_id' in result.columns
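
Note what this test pins down about the insert_unique_key transformer: it deduplicates the input frame, adds a hash_id column that serves as the primary key, and records each row's multiplicity in a count column, which is why the duplicated id == 1 row ends up with count == 2.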
Example #15
def write_results_to_dolt(results_dir: str, remote: str, branch: str):
    dfs = [
        pd.read_csv(os.path.join(results_dir, filename))
        for filename in os.listdir(results_dir)
    ]
    table_writer = get_df_table_writer(RESULTS_TABLE,
                                       lambda: pd.concat(dfs),
                                       RESULTS_TABLE_PKS,
                                       import_mode='update')
    loader = get_dolt_loader(table_writer, True, 'benchmark run', branch)
    load_to_dolthub(loader,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=remote)
Example #16
def get_loaders():
    table_writers = []
    for dataset in DATASETS:
        transformers = [] if dataset.pk_cols else [insert_unique_key]
        pk_cols = ['hash_id'] if not dataset.pk_cols else dataset.pk_cols

        writer = get_df_table_writer(dataset.table_name,
                                     get_mta_data_as_df(
                                         get_mta_url(dataset.dataset_id)),
                                     pk_cols,
                                     transformers=transformers)

        table_writers.append(writer)

    return [
        get_dolt_loader(table_writers, True,
                        'Update MTA data for date {}'.format(datetime.now()))
    ]
Example #17
def load():
    table_writers = []
    for dataset in DATASETS:
        transformers = [] if dataset.pk_cols else [insert_unique_key]
        pk_cols = ['hash_id'] if not dataset.pk_cols else dataset.pk_cols

        writer = get_df_table_writer(dataset.table_name,
                                     get_mta_data_as_df(
                                         get_mta_url(dataset.dataset_id)),
                                     pk_cols,
                                     transformers=transformers)

        table_writers.append(writer)

    loaders = [
        get_dolt_loader(table_writers, True,
                        'Update MTA data for date {}'.format(datetime.now()))
    ]
    load_to_dolthub(loaders,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=REPO_PATH)
Example #18
def load_dataset(repo_path: str, datasets: List[FiveThirtyEightDataset], message: str):
    table_writers = [get_df_table_writer(ds.name, ds.get_dataset_fetcher(), ds.primary_keys) for ds in datasets]
    loaders = [get_dolt_loader(table_writers, True, message)]
    load_to_dolthub(loaders, clone=True, push=True, remote_name='origin', remote_url=repo_path)
Example #19
def load_raw_fx_rates():
    table_writer = get_df_table_writer('eur_fx_rates', get_raw_data, ['currency', 'timestamp'])
    message = 'Updated raw FX rates for date {}'.format(datetime.now())
    loader = get_dolt_loader(table_writer, commit=True, message=message)
    load_to_dolthub(loader, clone=True, push=True, remote_url=FX_RATES_REPO)
Example #20
def get_loaders():
    writer = get_df_table_writer('great_players',
                                 get_data_builder(),
                                 pk_cols=['name'],
                                 import_mode='create')
    return [get_dolt_loader([writer], True, 'Added some great players!')]
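
Between them, the examples in this listing exercise all three import_mode values: 'create' builds a fresh table (as here), 'replace' swaps out the table's contents wholesale (Examples #1 and #2), and 'update' upserts rows against the primary keys (Examples #10 and #13). These mirror dolt table import's create, replace, and update modes; examples that omit the argument fall back to the writer's default.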
Example #21
def load(git_hash: str, github_actions_run_url: str):
    table_writers = [get_df_table_writer('eod_data', get_data, ['date', 'ticker'], 'update')]
    loaders = [get_dolt_loader(table_writers, True, get_commit_message(git_hash, github_actions_run_url))]
    load_to_dolthub(loaders, clone=True, push=True, remote_name='origin', remote_url=REMOTE_DB)
Example #22
def read_large_npi_file():
    # Columns to keep: NPI, the organization's legal business name, the
    # practice-location address fields, the country code (for filtering),
    # the NPI de/reactivation dates, and taxonomy codes 1-15.
    print("Reading In NPI Data")

    npi_columns: List[str] = [
        "NPI",
        "Provider Organization Name (Legal Business Name)",
        # "Provider Last Name (Legal Name)",
        # "Provider First Name",
        # "Provider Middle Name",
        # "Provider Name Prefix Text",
        # "Provider Name Suffix Text",
        "Provider First Line Business Practice Location Address",
        "Provider Second Line Business Practice Location Address",
        "Provider Business Practice Location Address City Name",
        "Provider Business Practice Location Address State Name",
        "Provider Business Practice Location Address Postal Code",
        "Provider Business Practice Location Address Country Code (If outside U.S.)",
        "NPI Deactivation Date",
        "NPI Reactivation Date",
        # "Provider Enumeration Date",
    ]

    # Taxonomy codes 1-15; reused below for dtypes and for the hospital filter
    taxonomy_columns: List[str] = [
        "Healthcare Provider Taxonomy Code_{}".format(i) for i in range(1, 16)
    ]
    npi_columns += taxonomy_columns

    npi_column_types: dict = {
        'NPI': 'uint64',
        'Provider Organization Name (Legal Business Name)': 'string',
        # 'Provider Last Name (Legal Name)': 'string',
        # 'Provider First Name': 'string',
        # 'Provider Middle Name': 'string',
        # 'Provider Name Prefix Text': 'string',
        # 'Provider Name Suffix Text': 'string',
        'Provider First Line Business Practice Location Address': 'string',
        'Provider Second Line Business Practice Location Address': 'string',
        'Provider Business Practice Location Address City Name': 'string',
        'Provider Business Practice Location Address State Name': 'string',
        'Provider Business Practice Location Address Postal Code': 'string',  # uint32
        'Provider Business Practice Location Address Country Code (If outside U.S.)': 'string',
        'NPI Deactivation Date': 'string',  # datetime
        'NPI Reactivation Date': 'string',  # datetime
        # 'Provider Enumeration Date': 'string'  # datetime
    }
    # All fifteen taxonomy code columns are plain strings
    npi_column_types.update({col: 'string' for col in taxonomy_columns})

    npi_data: pd.DataFrame = pd.read_csv(filepath_or_buffer=npi_data_path,
                                         usecols=npi_columns,
                                         dtype=npi_column_types)  # , nrows=100

    print("Dropping Non-US Businesses")
    # Drop All Not Null Rows Of "Provider Business Practice Location Address Country Code (If outside U.S.)"
    # TODO: Only Two Null Values To Care About Checking '421 RT 59', 'RUNDU INTERMEDIATE HOSPITAL'
    # npi_data = npi_data[
    #             npi_data["Provider Business Practice Location Address Country Code (If outside U.S.)"].isnull() or
    #             npi_data["Provider Business Practice Location Address Country Code (If outside U.S.)"] == "US"]
    npi_data = npi_data.loc[npi_data[
        "Provider Business Practice Location Address Country Code (If outside U.S.)"
    ] == "US"]

    print("Dropping Non-Family Owned Businesses")
    npi_data = npi_data[
        ~npi_data['Provider Organization Name (Legal Business Name)'].isnull()]

    print("Dropping Country Column")
    npi_data.drop(columns=[
        "Provider Business Practice Location Address Country Code (If outside U.S.)"
    ],
                  inplace=True)

    print("Dropping Non-Matching Taxonomic Codes")
    npi_data = npi_data[npi_data[taxonomy_columns].isin(
        hospital_taxonomic_codes).any(axis=1)]

    print("Displaying Current State of DataFrame")
    print(npi_data)
    npi_data.info(verbose=False, memory_usage="deep")

    # print("Unique Values For Outside U.S.")
    # print(npi_data['Provider Business Practice Location Address Country Code (If outside U.S.)'].unique())

    # print("Unique Values For Address First Line")
    # print(npi_data['Provider First Line Business Practice Location Address'].unique())

    # print("Saving File To CSV For Backup")
    # npi_data.to_csv("working/npi_trimmed_backup.csv")

    print("Dropping Deactivated NPIs That Were Never Reactivated")
    npi_data = npi_data[~((~npi_data['NPI Deactivation Date'].isnull()) &
                          (npi_data['NPI Reactivation Date'].isnull()))]

    print("Filling All Null Data With Empty String")
    npi_data.fillna('', inplace=True)

    # name_columns: List[str] = ["Provider Organization Name (Legal Business Name)",
    #                            "Provider Name Prefix Text",
    #                            "Provider First Name",
    #                            "Provider Middle Name",
    #                            "Provider Last Name (Legal Name)",
    #                            "Provider Name Suffix Text"]

    # This Only Works Because Organization Name Will Always Be Null If Other Name Is Not Null And Vice Versa
    # print("Combining Name Columns To One Column")
    # npi_data["name"] = npi_data[name_columns].agg(' '.join, axis=1)

    # Destroy All Old Columns To Save Memory
    print("Dropping All Old Name Columns")
    # npi_data.drop(columns=name_columns, inplace=True)

    # Fix For Only One Space Between Columns
    # print("Ensuring Only One Space Between Words In Name")
    # npi_data["name"] = npi_data["name"].map(lambda x: ' '.join(x.split()))  # For Loop In Disguise :(
    # npi_data["name"] = ' '.join(npi_data["name"].str.split())

    # Uppercase All Names
    print("Uppercase Name Column")
    # npi_data["name"] = npi_data["name"].str.upper()
    npi_data["Provider Organization Name (Legal Business Name)"] = npi_data[
        "Provider Organization Name (Legal Business Name)"].str.upper()

    print("Displaying Current State of DataFrame")
    print(npi_data)
    npi_data.to_csv("working/npi_trimmed_name_backup.csv")

    print("Dropping Re/Deactivation Date Columns")
    npi_data.drop(columns=["NPI Deactivation Date", "NPI Reactivation Date"],
                  inplace=True)

    address_columns: List[str] = [
        "Provider First Line Business Practice Location Address",
        "Provider Second Line Business Practice Location Address"
    ]

    print("Combining Address Columns To One Column")
    npi_data["street_address"] = npi_data[address_columns].agg(', '.join,
                                                               axis=1)

    print("Dropping All Old Address Columns")
    npi_data.drop(columns=address_columns, inplace=True)

    print("Ensuring Only One Space Between Words In Address")
    npi_data["street_address"] = npi_data["street_address"].map(
        lambda x: ' '.join(x.split()))  # For Loop In Disguise :(
    # npi_data["street_address"] = ' '.join(npi_data["street_address"].str.split())

    print("Uppercase Address Column")
    npi_data["street_address"] = npi_data["street_address"].str.upper()

    rename_columns: dict = {
        "NPI": "npi_number",
        "Provider Organization Name (Legal Business Name)": "name",
        "Provider Business Practice Location Address City Name": "city",
        "Provider Business Practice Location Address State Name": "state",
        "Provider Business Practice Location Address Postal Code": "zip_code",
        # "Provider Enumeration Date": "publish_date"
    }

    print("Renaming Columns For Dolt Repo")
    npi_data.rename(columns=rename_columns, inplace=True)

    print("Replacing Empty Cells With Null")
    npi_data.replace(r'^\s*$', np.nan, regex=True, inplace=True)

    print("Fixing Addresses")
    npi_data['street_address'].replace(to_replace=r',$',
                                       value='',
                                       inplace=True,
                                       regex=True)

    print("Fixing Postal Codes")  # (\d{5})(\d{4})
    # npi_data['zip_code'].replace(to_replace=r'(\d{5})(\d{4})', value=r'\1-\2', inplace=True, regex=True)
    npi_data['zip_code'] = npi_data['zip_code'].str.replace(r'(\d{5})(\d{4})',
                                                            r'\1-\2',
                                                            regex=True)

    # print("Fixing Dates")  # mm/dd/yyyy -> yyyy-mm-dd
    # npi_data['publish_date'] = npi_data['publish_date'].str.replace(r'(\d{2})/(\d{2})/(\d{4})', r'\3-\1-\2', regex=True)

    # print("Resetting Index")
    # npi_data.reset_index(drop=True, inplace=True)

    print("Performing One Last Backup")
    npi_data.to_csv("working/npi_trimmed_final_backup.csv", index=False)

    print("Writing To Dolt Repo")
    raw_data_writer = get_df_table_writer('hospitals', lambda: npi_data,
                                          ['npi_number'])

    repo: Dolt = Dolt('working/hospital-price-transparency/')
    raw_data_writer(repo)

    print("Done")