Example 1
def write_metadata(self, data: List[DoltMeta]):
    """Important that the metadata commit is recorded immediately after the data commit"""
    meta_df = pd.DataFrame.from_records(
        [x.dict() for x in self.table_reads + self.table_writes])
    import_df(repo=self.meta_doltdb,
              table_name="metadata",
              data=meta_df,
              primary_keys=meta_df.columns.tolist())
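
Taken together, these snippets pin down the `import_df` signature: a Dolt repo, a table name, a pandas DataFrame, a list of primary-key columns, and an optional import mode. A minimal standalone call might look like this; the `doltpy.core` import paths are an assumption based on the identifiers used in Examples 5, 7, and 8:

import pandas as pd
from doltpy.core import Dolt              # assumed import path
from doltpy.core.write import import_df   # assumed import path

# Hypothetical repo location and toy data
repo = Dolt('path/to/repo')
df = pd.DataFrame({'id': [1, 2], 'name': ['alice', 'bob']})
import_df(repo=repo,
          table_name='people',
          data=df,
          primary_keys=['id'])  # import_mode omitted, as in Examples 1 and 3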
Example 2
def inner(repo: Dolt):
    input_data = get_data(repo)
    transformed_data = transformer(input_data)
    import_df(repo,
              target_table,
              transformed_data,
              target_pk_cols,
              import_mode=import_mode)
    return target_table
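
This `inner` closes over `get_data`, `transformer`, `target_table`, `target_pk_cols`, and `import_mode`, so it is presumably returned by an enclosing factory that the snippet omits. A plausible reconstruction of that wrapper; the factory name and signature are assumptions, and only the inner body comes from the example:

# Hedged sketch of the implied enclosing factory
def get_table_loader(target_table, get_data, transformer, target_pk_cols,
                     import_mode='create'):
    def inner(repo: Dolt):
        input_data = get_data(repo)
        transformed_data = transformer(input_data)
        import_df(repo, target_table, transformed_data, target_pk_cols,
                  import_mode=import_mode)
        return target_table
    return inner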
Example 3
def write_table(self, table_name: str, df: pd.DataFrame, pks: List[str]):
    """
    Writes the contents of the given DataFrame to the specified table. If the table exists it is updated;
    if it does not, it is created.
    """
    assert current.is_running_flow, 'Writes and commits are only supported in a running Flow'
    import_df(repo=self.doltdb,
              table_name=table_name,
              data=df,
              primary_keys=pks)
    self.table_writes.append(self._get_table_write(table_name))
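
`current.is_running_flow` points at Metaflow's `current` singleton, so `write_table` is evidently a method on a Dolt helper object used inside a running flow. A hedged usage sketch; `DoltClient` and its constructor are hypothetical stand-ins for whatever class this method belongs to:

import pandas as pd
from metaflow import FlowSpec, step

class LoadFlow(FlowSpec):
    @step
    def start(self):
        client = DoltClient('path/to/repo')  # hypothetical wrapper class
        players = pd.DataFrame({'id': [1], 'name': ['Bianca']})
        client.write_table('players', players, pks=['id'])
        self.next(self.end)

    @step
    def end(self):
        pass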
Example 4
def _import_and_commit(dolt: Dolt, table: str, data: pd.DataFrame,
                       primary_keys: Optional[List[str]], import_mode: str):
    dolt_write.import_df(dolt, table, pd.DataFrame(data), primary_keys,
                         import_mode)
    dolt.add(table)
    dolt.commit('Executed import on table {} in import mode "{}"'.format(
        table, import_mode))
    commit = dolt.log()[0]

    return {
        'commit_hash': commit.hash,
        'timestamp': commit.ts,
        'author': commit.author,
        'message': commit.message
    }
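
A hypothetical call site for `_import_and_commit`; the dict keys in the result come straight from the snippet above:

repo = Dolt('path/to/repo')  # hypothetical repo
df = pd.DataFrame({'id': [1], 'score': [10]})
result = _import_and_commit(repo, 'scores', df, ['id'], 'create')
print(result['commit_hash'], result['timestamp'], result['message'])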
Example 5
def create_test_table(init_empty_test_repo,
                      create_test_data) -> Tuple[Dolt, str]:
    repo, test_data_path = init_empty_test_repo, create_test_data
    repo.sql(query='''
        CREATE TABLE `test_players` (
            `name` LONGTEXT NOT NULL COMMENT 'tag:0',
            `id` BIGINT NOT NULL COMMENT 'tag:1',
            PRIMARY KEY (`id`)
        );
    ''')
    import_df(repo, 'test_players', pd.read_csv(test_data_path), ['id'],
              UPDATE)
    yield repo, 'test_players'

    if 'test_players' in [table.name for table in repo.ls()]:
        _execute(['table', 'rm', 'test_players'], repo.repo_dir())
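
`create_test_table` yields and then cleans up, which reads like a pytest fixture (its `init_empty_test_repo` and `create_test_data` parameters would themselves be fixtures). A sketch of a test consuming it, assuming that pytest wiring:

def test_players_has_ids(create_test_table):
    repo, table = create_test_table
    df = read_table(repo, table)  # read_table also appears in Examples 6 and 10
    assert df['id'].notnull().all()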
Example 6
def inner(repo: Dolt):
    _transformers = (transformers + [insert_unique_key]
                     if transformers else [insert_unique_key])
    data = _apply_df_transformers(get_data(), _transformers)
    if table not in [t.name for t in repo.ls()]:
        raise ValueError('Missing table')

    # Get existing PKs
    existing = read_table(repo, table)
    existing_pks = existing[INSERTED_ROW_HASH_COL].to_list()

    # Get proposed PKs
    proposed_pks = data[INSERTED_ROW_HASH_COL].to_list()
    to_drop = [pk for pk in existing_pks if pk not in proposed_pks]

    if to_drop:
        # Delete stale rows in batches of 30,000 IDs so no single DELETE
        # statement grows unbounded
        iterator = iter(to_drop)
        while True:
            batch = list(itertools.islice(iterator, 30000))
            if len(batch) == 0:
                break

            logger.info('Dropping batch of {} IDs from table {}'.format(
                len(batch), table))
            drop_statement = '''
            DELETE FROM {table} WHERE {pk} in ("{pks_to_drop}")
            '''.format(table=table,
                       pk=INSERTED_ROW_HASH_COL,
                       pks_to_drop='","'.join(batch))
            repo.sql(query=drop_statement)

    new_data = data[~(data[INSERTED_ROW_HASH_COL].isin(existing_pks))]
    if not new_data.empty:
        logger.info('Importing {} records'.format(len(new_data)))
        import_df(repo, table, new_data, [INSERTED_ROW_HASH_COL], 'update')

    return table
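
The deletion loop drains an iterator in 30,000-ID chunks so that no single DELETE statement grows unbounded. The same idiom in isolation, as a reusable generator:

import itertools
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar('T')

def batches(items: Iterable[T], size: int = 30000) -> Iterator[List[T]]:
    # Drain the iterator in fixed-size chunks until it is exhausted
    iterator = iter(items)
    while True:
        batch = list(itertools.islice(iterator, size))
        if not batch:
            return
        yield batch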
Example 7
def _insert_row_helper(repo, table, row):
    import_df(repo, table, row, ['id'], import_mode=UPDATE)
Example 8
def test_get_dirty_tables(create_test_table):
    repo, test_table = create_test_table
    message = 'Committing test data'

    # Some test data
    initial = pd.DataFrame({
        'id': [1],
        'name': ['Bianca'],
        'role': ['Champion']
    })
    appended_row = pd.DataFrame({
        'name': ['Serena'],
        'id': [2],
        'role': ['Runner-up']
    })

    def _insert_row_helper(repo, table, row):
        import_df(repo, table, row, ['id'], import_mode=UPDATE)

    # existing, not modified
    repo.add(test_table)
    repo.commit(message)

    # existing, modified, staged
    modified_staged = 'modified_staged'
    import_df(repo, modified_staged, initial, ['id'])
    repo.add(modified_staged)

    # existing, modified, unstaged
    modified_unstaged = 'modified_unstaged'
    import_df(repo, modified_unstaged, initial, ['id'])
    repo.add(modified_unstaged)

    # Commit and modify data
    repo.commit(message)
    _insert_row_helper(repo, modified_staged, appended_row)
    repo.add(modified_staged)
    import_df(repo, modified_unstaged, appended_row, ['id'], UPDATE)

    # created, staged
    created_staged = 'created_staged'
    import_df(repo, created_staged, initial, ['id'])
    repo.add(created_staged)

    # created, unstaged
    created_unstaged = 'created_unstaged'
    import_df(repo, created_unstaged, initial, ['id'])

    status = repo.status()

    expected_new_tables = {'created_staged': True, 'created_unstaged': False}
    expected_changes = {'modified_staged': True, 'modified_unstaged': False}

    assert status.added_tables == expected_new_tables
    assert status.modified_tables == expected_changes
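
The closing assertions imply that `repo.status()` returns an object whose `added_tables` and `modified_tables` attributes map table names to a staged/unstaged boolean. A quick inspection sketch under that assumption:

status = repo.status()
for table, staged in {**status.added_tables, **status.modified_tables}.items():
    print(f'{table}: {"staged" if staged else "unstaged"}')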
Example 9
def load_to_dolt(df, table):
    repo = Dolt(DOLT_REPO)
    import_df(repo, table, df, ['year', 'month'] + crosswalk_table_to_pk[table], import_mode='update')
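
Every table here shares the ('year', 'month') key prefix and appends per-table columns from `crosswalk_table_to_pk`, whose definition the snippet omits. A hypothetical shape for that mapping:

# Hypothetical; the real mapping lives elsewhere in the source module
crosswalk_table_to_pk = {
    'state_crosswalk': ['state'],
    'county_crosswalk': ['state', 'county'],
}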
Example 10
    # Tail of a truncated subprocess helper; the full definition (presumably
    # the scrape_document used below) is not shown in this snippet
    exit_code = process.wait()

    return output


repo_name = 'Liquidata/online-services'
root = '.'
repo = Dolt.clone(repo_name, root)

documents_df = read_table(repo, 'documents')
documents_df['terms_raw'] = documents_df['terms_raw'].astype(str)
documents_df['privacy_raw'] = documents_df['privacy_raw'].astype(str)

for index, row in documents_df.iterrows():
    print(f'Processing {index}')
    documents_df.at[index, 'terms_raw'] = scrape_document(row['terms_url'])
    documents_df.at[index, 'privacy_raw'] = scrape_document(row['privacy_url'])

import_df(repo, 'documents', documents_df, ['product_id'])

if repo.status().is_clean:
    print('No changes to repo. Exiting')
else:
    print('Committing and pushing to DoltHub')
    repo.add('documents')

    now = datetime.datetime.now()
    print(f'Latest documents downloaded {now}')
    repo.commit(f'Latest data downloaded {now}')
    repo.push('origin', 'master')
Example 11
def inner(repo: Dolt):
    _import_mode = import_mode or (
        'create' if table not in [t.name for t in repo.ls()] else 'update')
    data_to_load = _apply_df_transformers(get_data(), transformers)
    import_df(repo, table, data_to_load, pk_cols, import_mode=_import_mode)
    return table
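
The first statement infers the import mode when the caller passes none: 'create' for a table that does not exist yet, 'update' otherwise. The same inference extracted into a helper; the function name is an assumption, the logic is lifted from the example:

def infer_import_mode(repo: Dolt, table: str) -> str:
    # 'create' for brand-new tables, 'update' for existing ones
    return 'create' if table not in [t.name for t in repo.ls()] else 'update'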