def test_dataframe_table_loader_create(initial_test_data):
    """The create-mode loader should seed both majors tables with row one intact."""
    repo = initial_test_data
    womens_data = read_table(repo, WOMENS_MAJOR_COUNT)
    mens_data = read_table(repo, MENS_MAJOR_COUNT)
    assert womens_data.iloc[0]['name'] == 'Serena'
    assert mens_data.iloc[0]['name'] == 'Roger'
def test_dataframe_table_loader_update(update_test_data):
    """The update-mode loader should append the new players to both tables."""
    repo = update_test_data
    womens_data = read_table(repo, WOMENS_MAJOR_COUNT)
    mens_data = read_table(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' in womens_data['name'].to_list()
    assert 'Rafael' in mens_data['name'].to_list()
def test_get_unique_key_update_writer(init_empty_test_repo):
    """The unique-key writer should dedupe rows on create and replace on update.

    BUG FIX: the original assertions were `assert [<list comprehension>]` —
    a non-empty list literal is always truthy, so those asserts could never
    fail regardless of the data. They are rewritten with `all(...)` so each
    per-player comparison is actually checked.
    """
    repo = init_empty_test_repo

    def generate_initial_data():
        return pd.DataFrame([
            {'name': 'Roger', 'id': 1},
            {'name': 'Rafael', 'id': 2},
            {'name': 'Rafael', 'id': 2},
            {'name': 'Novak', 'id': 3},
        ])

    test_table = 'test_data'
    get_dolt_loader([
        get_unique_key_table_writer(
            test_table, generate_initial_data, import_mode='create')
    ], True, 'Create test data')(repo)

    # Test that we have what we expect
    data = read_table(repo, test_table)
    assert all(
        data.loc[data['name'] == player, 'count'].iloc[0] == 1
        for player in ['Roger', 'Novak'])
    # The duplicated Rafael row should have been collapsed with count == 2.
    assert data.loc[data['name'] == 'Rafael', 'count'].iloc[0] == 2

    def generate_updated_data():
        return pd.DataFrame([
            {'name': 'Rafael', 'id': 2},
            {'name': 'Novak', 'id': 3},
            {'name': 'Andy', 'id': 4},
        ])

    get_dolt_loader(
        [get_unique_key_table_writer(test_table, generate_updated_data)],
        True, 'Updating data')(repo)
    data = read_table(repo, test_table)
    # After the update every remaining player appears exactly once.
    assert all(
        data.loc[data['name'] == player, 'count'].iloc[0] == 1
        for player in ['Rafael', 'Novak', 'Andy'])
def test_table_transfomer_update(update_derived_data):
    """The derived averages table must reflect the updated majors data."""
    repo = update_derived_data
    avg_df = read_table(repo, AVERAGE_MAJOR_COUNT)
    expected_averages = {
        'mens': (20 + 19) / 2,
        'womens': (23 + 24) / 2,
    }
    for gender, expected in expected_averages.items():
        actual = avg_df.loc[avg_df['gender'] == gender, 'average'].iloc[0]
        assert actual == expected
def read_table(self,
               table_name: str,
               commit: str = None,
               flow_name: str = None,
               run_id: str = None) -> pd.DataFrame:
    """
    Returns the specified table as a DataFrame, recording the read for lineage.

    Resolution order for which commit to read at:
      1. ``commit`` — read the table as of that explicit commit hash.
      2. ``flow_name`` + ``run_id`` — look up, in the metadata database, the
         commit that the given flow run read from, and read as of that commit.
      3. neither — read at the current HEAD of ``self.doltdb``.

    :param table_name: name of the Dolt table to read.
    :param commit: optional commit hash to pin the read to.
    :param flow_name: optional flow name used with ``run_id`` to resolve a commit.
    :param run_id: optional run id used with ``flow_name`` to resolve a commit.
    :return: the table contents as a ``pd.DataFrame``.
    :raises ValueError: if called outside of a running Flow.
    """
    # Reads are only supported while a flow is executing, where the
    # Metaflow `current` singleton carries run context.
    if not current.is_running_flow:
        raise ValueError("read_table is only supported in a running Flow")
    # Start a read-audit record; the commit it resolves to is filled in below.
    read_meta = self._get_table_read(table_name)
    if commit:
        # Pin the read to the caller-supplied commit.
        table = self._get_dolt_table_asof(self.doltdb, table_name, commit)
        read_meta.commit = commit
    elif flow_name and run_id:
        # Resolve which database/commit the given flow run read from, via
        # the recorded 'read' actions in the metadata database.
        # NOTE(review): assumes the actions query returns at least one row
        # with `database` and `commit` columns — confirm against the schema.
        df = read_table_sql(self.meta_doltdb,
                            _get_actions_query(flow_name, run_id, 'read'))
        database = df.database.values[0]
        commit = df.commit.values[0]
        # checkout database and get table ASOF commit
        db = Dolt(database)
        table = self._get_dolt_table_asof(db, table_name, commit)
        read_meta.commit = commit
    else:
        # No pin requested: read at HEAD and record the latest commit hash.
        # NOTE(review): this resolves to a module-level `read_table` helper,
        # not a recursive call to this method — verify the file's imports.
        table = read_table(self.doltdb, table_name)
        read_meta.commit = self._get_latest_commit_hash()
    # Append the audit record only after the read succeeded.
    self.table_reads.append(read_meta)
    return table
def test_branching(initial_test_data):
    """Data loaded on a feature branch must not be visible from master."""
    repo = initial_test_data
    test_branch = 'new-branch'

    # Create and switch to the new branch, then load the update data onto it.
    repo.branch(branch_name=test_branch)
    repo.checkout(test_branch)
    _populate_test_data_helper(repo, UPDATE_MENS, UPDATE_WOMENS, test_branch)
    active_branch, _ = repo.branch()
    assert active_branch.name == test_branch

    # On the feature branch the new players are present.
    womens_data = read_table(repo, WOMENS_MAJOR_COUNT)
    mens_data = read_table(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' in womens_data['name'].to_list()
    assert 'Rafael' in mens_data['name'].to_list()

    # Back on master they must be absent.
    repo.checkout('master')
    womens_data = read_table(repo, WOMENS_MAJOR_COUNT)
    mens_data = read_table(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' not in womens_data['name'].to_list()
    assert 'Rafael' not in mens_data['name'].to_list()
def test_sql(create_test_table):
    """A row inserted via repo.sql should be visible through read_table."""
    repo, test_table = create_test_table
    insert_statement = '''
    INSERT INTO {table} (name, id)
    VALUES ('Roger', 3)
    '''.format(table=test_table)
    repo.sql(query=insert_statement)
    names = read_table(repo, test_table)['name'].to_list()
    assert 'Roger' in names
def read():
    """Read a Dolt table on a given branch and return its rows as JSON.

    Expects a JSON payload containing string fields 'table' and 'branch'.
    Returns the table contents as a JSON list of row dicts, or a 400 with
    the extraction errors when either parameter is missing/invalid.
    """
    payload = request.get_json()
    table, table_error = _extract_parameter(payload, 'table', str)
    branch, branch_error = _extract_parameter(payload, 'branch', str)
    # BUG FIX: the extraction errors were previously ignored even though the
    # original comment said to validate them; reject bad requests up front
    # instead of failing inside the checkout/read.
    errors = [error for error in (table_error, branch_error) if error]
    if errors:
        return jsonify({'errors': errors}), 400
    with DoltCheckoutContext(DOLT, branch, False):
        # BUG FIX: 'rows' was only ever a deprecated abbreviation of the
        # 'records' orient (matched by first letter in old pandas) and is
        # rejected by current pandas — use the full orient name.
        data = dolt_read.read_table(DOLT, table).to_dict('records')
    return jsonify(data)
def test_multi_branch_load(initial_test_data):
    """Each branch should see only the players loaded onto that branch."""
    repo = initial_test_data
    first_branch, second_branch = 'first-branch', 'second-branch'
    _populate_test_data_helper(repo, UPDATE_MENS, UPDATE_WOMENS, first_branch)
    _populate_test_data_helper(repo, SECOND_UPDATE_MENS, SECOND_UPDATE_WOMENS,
                               second_branch)

    # master: neither branch's players are visible.
    womens_names = read_table(repo, WOMENS_MAJOR_COUNT)['name'].to_list()
    mens_names = read_table(repo, MENS_MAJOR_COUNT)['name'].to_list()
    assert 'Margaret' not in womens_names and 'Rafael' not in mens_names
    assert 'Steffi' not in womens_names and 'Novak' not in mens_names

    # first branch: only the first update's players appear.
    repo.checkout(first_branch)
    womens_names = read_table(repo, WOMENS_MAJOR_COUNT)['name'].to_list()
    mens_names = read_table(repo, MENS_MAJOR_COUNT)['name'].to_list()
    assert 'Margaret' in womens_names and 'Rafael' in mens_names

    # second branch: only the second update's players appear.
    repo.checkout(second_branch)
    womens_names = read_table(repo, WOMENS_MAJOR_COUNT)['name'].to_list()
    mens_names = read_table(repo, MENS_MAJOR_COUNT)['name'].to_list()
    assert 'Steffi' in womens_names and 'Novak' in mens_names
def test_load_to_dolt_new_branch(initial_test_data):
    """Loading to a new branch must create it without switching off master."""
    repo = initial_test_data
    test_branch = 'new-branch'

    # check we have only the expected branches in the sample data
    _, branches = repo.branch()
    assert [b.name for b in branches] == ['master']

    # load some data to a new branch
    _populate_test_data_helper(repo, UPDATE_MENS, UPDATE_WOMENS, test_branch)

    # check that we are still on the branch we started on
    current_branch, current_branches = repo.branch()
    branch_names = [b.name for b in current_branches]
    assert current_branch.name == 'master'
    assert branch_names == ['master', test_branch]

    # check out our new branch and confirm our data is present
    repo.checkout(test_branch)
    womens_names = read_table(repo, WOMENS_MAJOR_COUNT)['name'].to_list()
    mens_names = read_table(repo, MENS_MAJOR_COUNT)['name'].to_list()
    assert 'Margaret' in womens_names and 'Rafael' in mens_names
def test_insert_unique_key(init_empty_test_repo):
    """insert_unique_key should collapse duplicate rows and add a hash_id PK."""
    repo = init_empty_test_repo

    def generate_data():
        return pd.DataFrame({'id': [1, 1, 2], 'value': ['foo', 'foo', 'baz']})

    test_table = 'test_data'
    writer = get_df_table_writer(test_table,
                                 generate_data,
                                 ['hash_id'],
                                 transformers=[insert_unique_key])
    get_dolt_loader([writer], True, 'Updating test data')(repo)

    result = read_table(repo, test_table)
    assert 'hash_id' in result.columns
    # The duplicated id==1 row is collapsed into one row with count == 2.
    assert result.loc[result['id'] == 1, 'count'].iloc[0] == 2
def inner(repo: Dolt):
    """Sync the generated data into `table` keyed on the inserted row hash.

    Rows whose hash is present in the table but absent from the new data are
    deleted (in batches); rows whose hash is new are imported in update mode.

    :param repo: the Dolt repo to write to; `table` must already exist.
    :return: the table name, for the loader chain.
    :raises ValueError: if `table` does not exist in the repo.

    Fixes vs. the original:
      - `proposed_pks` is a set, turning the stale-row scan from O(n*m)
        list membership into O(n) hash lookups.
      - the drop-batch loop used `while iterator:` (an iterator object is
        always truthy, so it only exited via `break`) — now `while True`.
      - the comprehension no longer shadows the `existing` DataFrame.
    """
    _transformers = (transformers + [insert_unique_key]
                     if transformers else [insert_unique_key])
    data = _apply_df_transformers(get_data(), _transformers)
    if table not in [t.name for t in repo.ls()]:
        raise ValueError('Missing table')

    # Get existing PKs
    existing = read_table(repo, table)
    existing_pks = existing[INSERTED_ROW_HASH_COL].to_list()

    # Get proposed PKs
    proposed_pks = set(data[INSERTED_ROW_HASH_COL].to_list())

    # Hashes present in the table but not in the new data are stale.
    to_drop = [pk for pk in existing_pks if pk not in proposed_pks]

    if to_drop:
        iterator = iter(to_drop)
        # Delete in chunks so each statement stays a manageable size.
        while True:
            batch = list(itertools.islice(iterator, 30000))
            if len(batch) == 0:
                break
            logger.info('Dropping batch of {} IDs from table {}'.format(
                len(batch), table))
            drop_statement = '''
            DELETE FROM {table} WHERE {pk} in ("{pks_to_drop}")
            '''.format(table=table,
                       pk=INSERTED_ROW_HASH_COL,
                       pks_to_drop='","'.join(batch))
            repo.sql(query=drop_statement)

    # Only import rows whose hash the table has not seen before.
    new_data = data[~(data[INSERTED_ROW_HASH_COL].isin(existing_pks))]
    if not new_data.empty:
        logger.info('Importing {} records'.format(len(new_data)))
        import_df(repo, table, new_data, [INSERTED_ROW_HASH_COL], 'update')

    return table
def test_get_bulk_table_loader(init_empty_test_repo):
    """The bulk writer should run the cleaner and import only well-formed rows."""
    repo = init_empty_test_repo
    table = 'test_table'

    def get_data():
        return io.StringIO(CORRUPT_CSV)

    def cleaner(data: io.StringIO) -> io.StringIO:
        # Keep only lines with the same field count as the header.
        output = io.StringIO()
        header_line = data.readline()
        column_count = len(header_line.split(','))
        output.write(header_line)
        for line in data.readlines():
            if len(line.split(',')) == column_count:
                output.write(line)
            else:
                print('Corrupt line, discarding:\n{}'.format(line))
        output.seek(0)
        return output

    get_bulk_table_writer(table,
                          get_data,
                          ['player_name'],
                          import_mode=CREATE,
                          transformers=[cleaner])(repo)

    actual = read_table(repo, table)
    expected = io.StringIO(CLEANED_CSV)
    headers = [col.rstrip() for col in expected.readline().split(',')]
    assert all(headers == actual.columns)

    players_to_week_counts = actual.set_index(
        'player_name')['weeks_at_number_1'].to_dict()
    for line in expected.readlines():
        player_name, weeks_at_number_1 = line.split(',')
        assert player_name in players_to_week_counts
        assert players_to_week_counts[player_name] == int(
            weeks_at_number_1.rstrip())
def get_raw_fx_rates(repo: Dolt):
    """Return the raw 'eur_fx_rates' table from the repo as a DataFrame."""
    fx_rates = read.read_table(repo, 'eur_fx_rates')
    return fx_rates
def test_table_transfomer_create(initial_derived_data):
    """The derived averages table must match the initial majors data."""
    repo = initial_derived_data
    avg_df = read_table(repo, AVERAGE_MAJOR_COUNT)
    expected_averages = {'mens': 20, 'womens': 23}
    for gender, expected in expected_averages.items():
        actual = avg_df.loc[avg_df['gender'] == gender, 'average'].iloc[0]
        assert actual == expected
def get_raw_data(repo: Dolt):
    """Concatenate the mens' and womens' majors tables, tagged with a gender column."""
    frames = [
        read_table(repo, table).assign(gender=gender)
        for table, gender in ((MENS_MAJOR_COUNT, 'mens'),
                              (WOMENS_MAJOR_COUNT, 'womens'))
    ]
    return pd.concat(frames)
CHROME, '--headless', '--disable-gpu', '--dump-dom', '--crash-dumps-dir=/tmp', url ] process = Popen(headless_chrome, stdout=PIPE) (output, err) = process.communicate() exit_code = process.wait() return output repo_name = 'Liquidata/online-services' root = '.' repo = Dolt.clone(repo_name, root) documents_df = read_table(repo, 'documents') documents_df['terms_raw'] = documents_df['terms_raw'].astype(str) documents_df['privacy_raw'] = documents_df['privacy_raw'].astype(str) for index, row in documents_df.iterrows(): print(f'Processing {index}') documents_df.at[index, 'terms_raw'] = scrape_document(row['terms_url']) documents_df.at[index, 'privacy_raw'] = scrape_document(row['privacy_url']) import_df(repo, 'documents', documents_df, ['product_id']) if repo.status().is_clean: print('No changes to repo. Exiting') else: print('Commiting and pushing to DoltHub') repo.add('documents')