コード例 #1
0
ファイル: test_tools.py プロジェクト: lucab/doltpy
def test_dataframe_table_loader_create(initial_test_data):
    """Check the first row's name in the women's and men's major-count tables.

    Uses the repo supplied by the ``initial_test_data`` fixture.
    """
    repo = initial_test_data

    womens_data = read_table(repo, WOMENS_MAJOR_COUNT)
    mens_data = read_table(repo, MENS_MAJOR_COUNT)

    first_womens_row = womens_data.iloc[0]
    assert first_womens_row['name'] == 'Serena'
    first_mens_row = mens_data.iloc[0]
    assert first_mens_row['name'] == 'Roger'
コード例 #2
0
ファイル: test_tools.py プロジェクト: lucab/doltpy
def test_dataframe_table_loader_update(update_test_data):
    """Check that the updated repo contains 'Margaret' and 'Rafael'.

    Uses the repo supplied by the ``update_test_data`` fixture and looks the
    names up in the women's and men's major-count tables respectively.
    """
    repo = update_test_data

    womens_data = read_table(repo, WOMENS_MAJOR_COUNT)
    mens_data = read_table(repo, MENS_MAJOR_COUNT)

    womens_names = list(womens_data['name'])
    assert 'Margaret' in womens_names
    mens_names = list(mens_data['name'])
    assert 'Rafael' in mens_names
コード例 #3
0
ファイル: test_tools.py プロジェクト: lucab/doltpy
def test_get_unique_key_update_writer(init_empty_test_repo):
    """Exercise the unique-key table writer for a create followed by an update.

    First loads four rows (one of them duplicated) in ``'create'`` mode and
    checks the per-name ``count`` column, then loads a second data set through
    the writer's default mode and checks the counts again.
    """
    repo = init_empty_test_repo

    def generate_initial_data():
        # 'Rafael' appears twice on purpose so its count should be 2.
        return pd.DataFrame([
            {'name': 'Roger', 'id': 1},
            {'name': 'Rafael', 'id': 2},
            {'name': 'Rafael', 'id': 2},
            {'name': 'Novak', 'id': 3},
        ])

    test_table = 'test_data'
    get_dolt_loader([
        get_unique_key_table_writer(
            test_table, generate_initial_data, import_mode='create')
    ], True, 'Create test data')(repo)

    # Test that we have what we expect.
    # NOTE: these checks use all(...); asserting on a bare list is always
    # truthy for a non-empty list and would verify nothing.
    data = read_table(repo, test_table)
    assert all(
        data.loc[data['name'] == player, 'count'].iloc[0] == 1
        for player in ['Roger', 'Novak']
    )
    assert data.loc[data['name'] == 'Rafael', 'count'].iloc[0] == 2

    def generate_updated_data():
        return pd.DataFrame([
            {'name': 'Rafael', 'id': 2},
            {'name': 'Novak', 'id': 3},
            {'name': 'Andy', 'id': 4},
        ])

    get_dolt_loader(
        [get_unique_key_table_writer(test_table, generate_updated_data)], True,
        'Updating data')(repo)
    data = read_table(repo, test_table)
    assert all(
        data.loc[data['name'] == player, 'count'].iloc[0] == 1
        for player in ['Rafael', 'Novak', 'Andy']
    )
コード例 #4
0
ファイル: test_tools.py プロジェクト: lucab/doltpy
def test_table_transfomer_update(update_derived_data):
    """Check the per-gender 'average' values in the derived averages table.

    Uses the repo supplied by the ``update_derived_data`` fixture.
    """
    repo = update_derived_data
    avg_df = read_table(repo, AVERAGE_MAJOR_COUNT)

    is_mens = avg_df['gender'] == 'mens'
    mens_average = avg_df.loc[is_mens, 'average'].iloc[0]
    assert mens_average == (20 + 19) / 2

    is_womens = avg_df['gender'] == 'womens'
    womens_average = avg_df.loc[is_womens, 'average'].iloc[0]
    assert womens_average == (23 + 24) / 2
コード例 #5
0
    def read_table(self,
                   table_name: str,
                   commit: str = None,
                   flow_name: str = None,
                   run_id: str = None) -> pd.DataFrame:
        """
        Return the named table as a DataFrame and record the read.

        The table is resolved in this order of precedence:

        1. ``commit`` given: read from ``self.doltdb`` as of that commit.
        2. both ``flow_name`` and ``run_id`` given: look up the database and
           commit from the first row of the 'read' actions query against
           ``self.meta_doltdb`` and read the table as of that commit.
        3. otherwise: read the current table from ``self.doltdb`` and record
           the latest commit hash.

        The read metadata (with its ``commit`` set) is appended to
        ``self.table_reads`` before returning.

        Raises:
            ValueError: if called outside a running Flow.
        """
        if not current.is_running_flow:
            raise ValueError("read_table is only supported in a running Flow")

        read_meta = self._get_table_read(table_name)

        pinned_to_run = bool(flow_name and run_id)
        if not commit and not pinned_to_run:
            # No explicit version requested: read whatever is current.
            table = read_table(self.doltdb, table_name)
            read_meta.commit = self._get_latest_commit_hash()
        else:
            if commit:
                source_db = self.doltdb
            else:
                actions = read_table_sql(
                    self.meta_doltdb,
                    _get_actions_query(flow_name, run_id, 'read'))
                database = actions.database.values[0]
                commit = actions.commit.values[0]
                # checkout database and get table ASOF commit
                source_db = Dolt(database)
            table = self._get_dolt_table_asof(source_db, table_name, commit)
            read_meta.commit = commit

        self.table_reads.append(read_meta)
        return table
コード例 #6
0
ファイル: test_tools.py プロジェクト: lucab/doltpy
def test_branching(initial_test_data):
    """Check that data loaded on a new branch is not visible on 'master'.

    Creates and checks out 'new-branch', populates it through
    ``_populate_test_data_helper``, verifies the names there, then checks out
    'master' and verifies the same names are absent.
    """
    repo = initial_test_data
    test_branch = 'new-branch'
    repo.branch(branch_name=test_branch)
    repo.checkout(test_branch)
    _populate_test_data_helper(repo, UPDATE_MENS, UPDATE_WOMENS, test_branch)

    active_branch, _ = repo.branch()
    assert active_branch.name == test_branch

    womens_data = read_table(repo, WOMENS_MAJOR_COUNT)
    mens_data = read_table(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' in list(womens_data['name'])
    assert 'Rafael' in list(mens_data['name'])

    repo.checkout('master')

    womens_data = read_table(repo, WOMENS_MAJOR_COUNT)
    mens_data = read_table(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' not in list(womens_data['name'])
    assert 'Rafael' not in list(mens_data['name'])
コード例 #7
0
def test_sql(create_test_table):
    """Insert one row through ``repo.sql`` and check it can be read back."""
    repo, test_table = create_test_table
    insert_template = '''
        INSERT INTO {table} (name, id)
        VALUES ('Roger', 3)
    '''
    repo.sql(query=insert_template.format(table=test_table))

    test_data = read_table(repo, test_table)
    stored_names = test_data['name'].to_list()
    assert 'Roger' in stored_names
コード例 #8
0
ファイル: app.py プロジェクト: dolthub/dolt-rest-example
def read():
    """Return the rows of a table on a given branch as a JSON list.

    Reads ``table`` and ``branch`` from the JSON request payload, reads the
    table inside a ``DoltCheckoutContext`` for that branch, and responds with
    one JSON object per row.
    """
    payload = request.get_json()

    table, table_error = _extract_parameter(payload, 'table', str)
    branch, branch_error = _extract_parameter(payload, 'branch', str)

    # validate that we don't have errors
    # NOTE(review): table_error / branch_error are never inspected, so a
    # missing or mistyped parameter is not rejected here. The shape of these
    # error values is not visible from this block -- TODO add handling.

    with DoltCheckoutContext(DOLT, branch, False):
        # 'records' is the documented orient for a list of row dicts; the
        # short alias 'rows' was deprecated and is rejected by pandas >= 2.0.
        data = dolt_read.read_table(DOLT, table).to_dict('records')
        return jsonify(data)
コード例 #9
0
ファイル: test_tools.py プロジェクト: lucab/doltpy
def test_multi_branch_load(initial_test_data):
    """Check that loads to two branches stay isolated from each other.

    Populates 'first-branch' and 'second-branch' through
    ``_populate_test_data_helper``, checks that none of the new names are
    visible before any checkout, then checks each branch for its own names.
    """
    repo = initial_test_data
    first_branch, second_branch = 'first-branch', 'second-branch'

    def _names(frame):
        return list(frame['name'])

    _populate_test_data_helper(repo, UPDATE_MENS, UPDATE_WOMENS, first_branch)
    _populate_test_data_helper(repo, SECOND_UPDATE_MENS, SECOND_UPDATE_WOMENS,
                               second_branch)

    # Before checking out either branch, none of the new names are expected.
    womens_data = read_table(repo, WOMENS_MAJOR_COUNT)
    mens_data = read_table(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' not in _names(womens_data)
    assert 'Rafael' not in _names(mens_data)
    assert 'Steffi' not in _names(womens_data)
    assert 'Novak' not in _names(mens_data)

    repo.checkout(first_branch)
    womens_data = read_table(repo, WOMENS_MAJOR_COUNT)
    mens_data = read_table(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' in _names(womens_data)
    assert 'Rafael' in _names(mens_data)

    repo.checkout(second_branch)
    womens_data = read_table(repo, WOMENS_MAJOR_COUNT)
    mens_data = read_table(repo, MENS_MAJOR_COUNT)
    assert 'Steffi' in _names(womens_data)
    assert 'Novak' in _names(mens_data)
コード例 #10
0
ファイル: test_tools.py プロジェクト: lucab/doltpy
def test_load_to_dolt_new_branch(initial_test_data):
    """Check that loading to a new branch creates it without switching to it.

    Verifies the branch list before and after the load, that the current
    branch is still 'master', and that the loaded names are present once the
    new branch is checked out.
    """
    repo = initial_test_data
    test_branch = 'new-branch'

    # check we have only the expected branches in the sample data
    _, branches = repo.branch()
    starting_names = [b.name for b in branches]
    assert starting_names == ['master']

    # load some data to a new branch
    _populate_test_data_helper(repo, UPDATE_MENS, UPDATE_WOMENS, test_branch)

    # check that we are still on the branch we started on
    current_branch, current_branches = repo.branch()
    assert current_branch.name == 'master'
    assert [b.name for b in current_branches] == ['master', test_branch]

    # check out our new branch and confirm our data is present
    repo.checkout(test_branch)
    womens_data = read_table(repo, WOMENS_MAJOR_COUNT)
    mens_data = read_table(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' in list(womens_data['name'])
    assert 'Rafael' in list(mens_data['name'])
コード例 #11
0
ファイル: test_tools.py プロジェクト: lucab/doltpy
def test_insert_unique_key(init_empty_test_repo):
    """Load data through the ``insert_unique_key`` transformer and inspect it.

    The input has a duplicated ``id == 1`` row; the test expects that row's
    ``count`` to be 2 and a ``hash_id`` column to be present in the result.
    """
    repo = init_empty_test_repo

    def generate_data():
        return pd.DataFrame({'id': [1, 1, 2], 'value': ['foo', 'foo', 'baz']})

    test_table = 'test_data'
    writer = get_df_table_writer(test_table,
                                 generate_data, ['hash_id'],
                                 transformers=[insert_unique_key])
    loader = get_dolt_loader([writer], True, 'Updating test data')
    loader(repo)

    result = read_table(repo, test_table)
    duplicated_row_count = result.loc[result['id'] == 1, 'count'].iloc[0]
    assert duplicated_row_count == 2
    assert 'hash_id' in result.columns
コード例 #12
0
    def inner(repo: Dolt):
        """Synchronise ``table`` in ``repo`` with the freshly generated data.

        Steps:

        1. Run ``get_data()`` through the configured transformers followed by
           ``insert_unique_key``.
        2. Delete existing rows whose ``INSERTED_ROW_HASH_COL`` value is not
           in the new data, in batches of 30000 keys per DELETE statement.
        3. Import the rows whose key is not already present, in 'update' mode.

        Returns:
            The table name.

        Raises:
            ValueError: if ``table`` does not exist in ``repo``.
        """
        if transformers:
            _transformers = transformers + [insert_unique_key]
        else:
            _transformers = [insert_unique_key]
        data = _apply_df_transformers(get_data(), _transformers)
        if table not in [t.name for t in repo.ls()]:
            raise ValueError('Missing table')

        # Get existing PKs
        existing = read_table(repo, table)
        existing_pks = existing[INSERTED_ROW_HASH_COL].to_list()

        # Get proposed PKs (a set, so the membership test below is O(1))
        proposed_pks = set(data[INSERTED_ROW_HASH_COL].to_list())
        to_drop = [pk for pk in existing_pks if pk not in proposed_pks]

        # Issue one DELETE per batch. The statement must run inside the loop:
        # running it after the loop would only ever see the final, empty batch.
        iterator = iter(to_drop)
        while True:
            batch = list(itertools.islice(iterator, 30000))
            if not batch:
                break

            logger.info('Dropping batch of {} IDs from table {}'.format(
                len(batch), table))
            # NOTE(review): keys are interpolated into the SQL text unescaped;
            # presumably they are generated hashes -- confirm they can never
            # contain a double quote.
            drop_statement = '''
            DELETE FROM {table} WHERE {pk} in ("{pks_to_drop}")
            '''.format(table=table,
                       pk=INSERTED_ROW_HASH_COL,
                       pks_to_drop='","'.join(batch))
            repo.sql(query=drop_statement)

        new_data = data[~(data[INSERTED_ROW_HASH_COL].isin(existing_pks))]
        if not new_data.empty:
            logger.info('Importing {} records'.format(len(new_data)))
            import_df(repo, table, new_data, [INSERTED_ROW_HASH_COL], 'update')

        return table
コード例 #13
0
ファイル: test_tools.py プロジェクト: lucab/doltpy
def test_get_bulk_table_loader(init_empty_test_repo):
    """Load a corrupt CSV through the bulk writer with a cleaning transformer.

    The cleaner drops every line whose comma-separated field count differs
    from the header's; the loaded table is then compared, header and rows,
    against ``CLEANED_CSV``.
    """
    repo = init_empty_test_repo
    table = 'test_table'

    def get_data():
        return io.StringIO(CORRUPT_CSV)

    def cleaner(data: io.StringIO) -> io.StringIO:
        # Keep the header plus every line with the same number of fields.
        output = io.StringIO()
        header_line = data.readline()
        expected_width = len(header_line.split(','))
        output.write(header_line)
        for row in data.readlines():
            if len(row.split(',')) == expected_width:
                output.write(row)
            else:
                print('Corrupt line, discarding:\n{}'.format(row))

        output.seek(0)
        return output

    write_table = get_bulk_table_writer(table,
                                        get_data, ['player_name'],
                                        import_mode=CREATE,
                                        transformers=[cleaner])
    write_table(repo)

    actual = read_table(repo, table)
    expected = io.StringIO(CLEANED_CSV)
    headers = [col.rstrip() for col in expected.readline().split(',')]
    assert all(headers == actual.columns)

    weeks_by_player = actual.set_index(
        'player_name')['weeks_at_number_1'].to_dict()
    for line in expected.readlines():
        player_name, weeks_at_number_1 = line.split(',')
        assert player_name in weeks_by_player
        assert weeks_by_player[player_name] == int(weeks_at_number_1.rstrip())
コード例 #14
0
def get_raw_fx_rates(repo: Dolt):
    """Read the 'eur_fx_rates' table from ``repo`` via ``read.read_table``."""
    fx_rates = read.read_table(repo, 'eur_fx_rates')
    return fx_rates
コード例 #15
0
ファイル: test_tools.py プロジェクト: lucab/doltpy
def test_table_transfomer_create(initial_derived_data):
    """Check the per-gender 'average' values in the derived averages table.

    Uses the repo supplied by the ``initial_derived_data`` fixture.
    """
    repo = initial_derived_data
    avg_df = read_table(repo, AVERAGE_MAJOR_COUNT)

    is_mens = avg_df['gender'] == 'mens'
    mens_average = avg_df.loc[is_mens, 'average'].iloc[0]
    assert mens_average == 20

    is_womens = avg_df['gender'] == 'womens'
    womens_average = avg_df.loc[is_womens, 'average'].iloc[0]
    assert womens_average == 23
コード例 #16
0
ファイル: test_tools.py プロジェクト: lucab/doltpy
def get_raw_data(repo: Dolt):
    """Concatenate the men's and women's major-count tables.

    Each table gets a ``gender`` column ('mens' / 'womens') before the two
    frames are stacked, men's rows first.
    """
    mens = read_table(repo, MENS_MAJOR_COUNT).assign(gender='mens')
    womens = read_table(repo, WOMENS_MAJOR_COUNT).assign(gender='womens')
    return pd.concat([mens, womens])
コード例 #17
0
        CHROME, '--headless', '--disable-gpu', '--dump-dom',
        '--crash-dumps-dir=/tmp', url
    ]

    process = Popen(headless_chrome, stdout=PIPE)
    (output, err) = process.communicate()
    exit_code = process.wait()

    return output


# Clone the source repo into the current directory.
repo_name = 'Liquidata/online-services'
root = '.'
repo = Dolt.clone(repo_name, root)

# Load the documents table and make sure both raw-text columns are strings
# before scraped content is written into them.
documents_df = read_table(repo, 'documents')
for raw_column in ('terms_raw', 'privacy_raw'):
    documents_df[raw_column] = documents_df[raw_column].astype(str)

# Re-scrape both documents for every row, terms first, then privacy.
for index, row in documents_df.iterrows():
    print(f'Processing {index}')
    scraped_terms = scrape_document(row['terms_url'])
    documents_df.at[index, 'terms_raw'] = scraped_terms
    scraped_privacy = scrape_document(row['privacy_url'])
    documents_df.at[index, 'privacy_raw'] = scraped_privacy

# Write the refreshed frame back, keyed on product_id.
import_df(repo, 'documents', documents_df, ['product_id'])

if repo.status().is_clean:
    print('No changes to repo. Exiting')
else:
    print('Commiting and pushing to DoltHub')
    repo.add('documents')