Example #1
def _import_helper(repo: Dolt, table_name: str,
                   write_import_file: Callable[[str], None],
                   primary_keys: List[str], import_mode: str) -> None:
    import_modes = IMPORT_MODES_TO_FLAGS.keys()
    if import_mode is not None:
        assert import_mode in import_modes, 'import_mode must be one of: {}'.format(
            import_modes)
    else:
        if table_name in [table.name for table in repo.ls()]:
            logger.info(
                'No import mode specified, table exists, using "{}"'.format(
                    UPDATE))
            import_mode = UPDATE
        else:
            logger.info(
                'No import mode specified, table does not exist, using "{}"'.format(
                    CREATE))
            import_mode = CREATE

    import_flags = IMPORT_MODES_TO_FLAGS[import_mode]
    logger.info(
        'Importing to table {} in dolt directory located in {}, import mode {}'
        .format(table_name, repo.repo_dir(), import_mode))
    fp = tempfile.NamedTemporaryFile(suffix='.csv')
    write_import_file(fp.name)
    args = [
        'table', 'import', table_name, '--pk={}'.format(','.join(primary_keys))
    ] + import_flags
    repo.execute(args + [fp.name])
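A minimal usage sketch for the helper above; the repo handle, the 'players' table, and the write_players callback are hypothetical illustrations, not part of the original example:

import csv

def write_players(path: str) -> None:
    # Hypothetical callback: materialize the rows the import will consume
    with open(path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['id', 'name'])
        writer.writeheader()
        writer.writerows([{'id': 1, 'name': 'Roger'},
                          {'id': 2, 'name': 'Rafael'}])

_import_helper(repo, 'players', write_players,
               primary_keys=['id'], import_mode='create')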
Example #2
def test_init_new_repo(tmp_path):
    repo_path, repo_data_dir = get_repo_path_tmp_path(tmp_path)
    assert not os.path.exists(repo_data_dir)
    dolt = Dolt(repo_path)
    dolt.init_new_repo()
    assert os.path.exists(repo_data_dir)
    shutil.rmtree(repo_data_dir)
Example #3
def init_repo(tmp_path) -> Dolt:
    repo_path, repo_data_dir = get_repo_path_tmp_path(tmp_path)
    assert not os.path.exists(repo_data_dir)
    repo = Dolt(repo_path)
    repo.init_new_repo()
    yield repo
    if os.path.exists(repo_data_dir):
        shutil.rmtree(repo_data_dir)
Example #4
    def inner(table_name: str, repo: Dolt) -> DoltTableUpdate:
        current_branch, _ = repo.branch()
        if branch and branch != current_branch.name:
            repo.checkout(branch)

        query_commit = commit_ref or list(repo.log().keys())[0]
        table = get_table_metadata(repo.engine, table_name)
        from_commit, to_commit = get_from_commit_to_commit(repo, query_commit)
        pks_to_drop = get_dropped_pks(repo.engine, table, from_commit,
                                      to_commit)
        result = _read_from_dolt_history(repo.engine, table, query_commit)
        return pks_to_drop, result
Example #5
    def inner(repo: Dolt):
        _, current_branches = repo.branch()
        branches = [branch.name for branch in current_branches]
        assert new_branch_name not in branches, 'Branch {} already exists'.format(
            new_branch_name)
        logger.info(
            'Creating new branch on repo in {} named {} at refspec {}'.format(
                repo.repo_dir(), new_branch_name, refspec))
        repo.branch(new_branch_name)

        return new_branch_name
Example #6
def write_to_table(repo: Dolt,
                   table: Table,
                   data: List[dict],
                   commit: bool = False,
                   message: str = None):
    """
    Given a repo, table, and data, will try to use the repo's MySQL Server instance to write the provided data to the
    table. Since Dolt does not yet support the ON DUPLICATE KEY clause on INSERT statements, we have to separate
    updates from inserts and run them as separate sets of statements.
    :param repo:
    :param table:
    :param data:
    :param commit:
    :param message:
    :return:
    """
    coerced_data = list(clean_types(data))
    inserts, updates = get_inserts_and_updates(repo.engine, table,
                                               coerced_data)
    if inserts:
        logger.info('Inserting {} rows'.format(len(inserts)))
        with repo.engine.connect() as conn:
            conn.execute(table.insert(), inserts)

    # We need to prefix the columns with "_" in order to use bindparam properly
    from copy import deepcopy
    _updates = deepcopy(updates)
    for dic in _updates:
        for col in list(dic.keys()):
            dic['_{}'.format(col)] = dic.pop(col)

    if _updates:
        logger.info('Updating {} rows'.format(len(_updates)))
        with repo.engine.connect() as conn:
            statement = table.update()
            for pk_col in [
                    col.name for col in table.columns if col.primary_key
            ]:
                statement = statement.where(
                    table.c[pk_col] == bindparam('_{}'.format(pk_col)))
            non_pk_cols = [
                col.name for col in table.columns if not col.primary_key
            ]
            statement = statement.values(
                {col: bindparam('_{}'.format(col))
                 for col in non_pk_cols})
            conn.execute(statement, _updates)

    if commit:
        repo.add(str(table.name))
        message = message or 'Inserting {} records at {}'.format(
            len(data), datetime.now())
        repo.commit(message)
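A hedged usage sketch for write_to_table; the reflected 'players' table and the row values are assumptions for illustration:

from sqlalchemy import MetaData

metadata = MetaData(bind=repo.engine)
metadata.reflect()
players = metadata.tables['players']  # hypothetical table name

write_to_table(repo, players,
               data=[{'id': 1, 'name': 'Roger'},
                     {'id': 2, 'name': 'Rafael'}],
               commit=True,
               message='Upserting player rows')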
Example #7
def test_config_local(init_empty_test_repo):
    repo = init_empty_test_repo
    current_global_config = Dolt.config_global(list=True)
    test_username, test_email = 'test_user', 'test_email'
    repo.config_local(add=True, name='user.name', value=test_username)
    repo.config_local(add=True, name='user.email', value=test_email)
    local_config = repo.config_local(list=True)
    global_config = Dolt.config_global(list=True)
    assert local_config['user.name'] == test_username and local_config[
        'user.email'] == test_email
    assert global_config['user.name'] == current_global_config['user.name']
    assert global_config['user.email'] == current_global_config['user.email']
Example #8
def read_table(repo: Dolt, table_name: str, delimiter: str = ',') -> pd.DataFrame:
    """
    Reads the contents of a table and returns it as a Pandas `DataFrame`. Under the hood this uses export and the
    filesystem; in short order we are likely to replace this with use of the MySQL Server.
    :param repo:
    :param table_name:
    :param delimiter:
    :return:
    """
    fp = tempfile.NamedTemporaryFile(suffix='.csv')
    repo.execute(['table', 'export', table_name, fp.name, '-f'])
    result = pd.read_csv(fp.name, delimiter=delimiter)
    return result
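For example, assuming repo points at a repository containing a hypothetical 'players' table:

df = read_table(repo, 'players')
print(df.head())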
Example #9
    def inner(table_name: str, repo: Dolt) -> DoltTableUpdate:
        current_branch, _ = repo.branch()
        if branch and branch != current_branch.name:
            repo.checkout(branch)

        from_commit, to_commit = get_from_commit_to_commit(repo, commit_ref)
        metadata = MetaData(bind=repo.engine)
        metadata.reflect()
        table = metadata.tables[table_name]
        pks_to_drop = get_dropped_pks(repo.engine, table, from_commit,
                                      to_commit)
        result = _read_from_dolt_diff(repo.engine, table, from_commit,
                                      to_commit)
        return pks_to_drop, result
Example #10
def init_empty_test_repo(tmp_path) -> Dolt:
    repo_path, repo_data_dir = get_repo_path_tmp_path(tmp_path)
    assert not os.path.exists(repo_data_dir)
    repo = Dolt.init(repo_path, ServerConfig(loglevel='trace', timeout=1000000))
    yield repo
    if os.path.exists(repo_data_dir):
        shutil.rmtree(repo_data_dir)
Example #11
def init_other_empty_test_repo(tmp_path) -> Dolt:
    repo_path, repo_data_dir = get_repo_path_tmp_path(tmp_path, 'other')
    assert not os.path.exists(repo_data_dir)
    os.mkdir(repo_path)
    repo = Dolt.init(repo_path, ServerConfig(port=3307))
    yield repo
    if os.path.exists(repo_data_dir):
        shutil.rmtree(repo_data_dir)
Example #12
    def inner(repo: Dolt):
        _import_mode = import_mode or (
            'create' if table not in [t.name for t in repo.ls()] else 'update')
        data_to_load = _apply_file_transformers(get_data(), transformers)
        bulk_import(repo,
                    table,
                    data_to_load,
                    pk_cols,
                    import_mode=_import_mode)
        return table
Example #13
    def inner(repo: Dolt):
        _transformers = (transformers + [insert_unique_key]
                         if transformers else [insert_unique_key])
        data = _apply_df_transformers(get_data(), _transformers)
        if table not in [t.name for t in repo.ls()]:
            raise ValueError('Table {} does not exist'.format(table))

        # Get existing PKs
        existing = read_table(repo, table)
        existing_pks = existing[INSERTED_ROW_HASH_COL].to_list()

        # Get proposed PKs
        proposed_pks = data[INSERTED_ROW_HASH_COL].to_list()
        to_drop = [
            existing for existing in existing_pks
            if existing not in proposed_pks
        ]

        if to_drop:
            iterator = iter(to_drop)
            while True:
                batch = list(itertools.islice(iterator, 30000))
                if not batch:
                    break

                logger.info('Dropping batch of {} IDs from table {}'.format(
                    len(batch), table))
                drop_statement = '''
                DELETE FROM {table} WHERE {pk} in ("{pks_to_drop}")
                '''.format(table=table,
                           pk=INSERTED_ROW_HASH_COL,
                           pks_to_drop='","'.join(batch))
                repo.sql(query=drop_statement)

        new_data = data[~(data[INSERTED_ROW_HASH_COL].isin(existing_pks))]
        if not new_data.empty:
            logger.info('Importing {} records'.format(len(new_data)))
            import_df(repo, table, new_data, [INSERTED_ROW_HASH_COL], 'update')

        return table
Example #14
def sync_schema_to_dolt(source_engine: Engine, repo: Dolt, table_map: Mapping[str, str], type_mapping: dict):
    """

    :param source_engine:
    :param repo:
    :param table_map:
    :param type_mapping:
    :return:
    """
    source_metadata = MetaData(bind=source_engine)
    source_metadata.reflect()
    target_metadata = MetaData(bind=repo.get_engine())
    target_metadata.reflect()
    for source_table_name, target_table_name in table_map.items():
        source_table = source_metadata.tables[source_table_name]
        target_table = coerce_schema_to_dolt(target_table_name, source_table, type_mapping)
        if target_table_name in target_metadata.tables.keys():
            target_table.drop(repo.get_engine())

        target_table.create(repo.get_engine())
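A sketch of invoking the sync, assuming a reachable source database; the DSN, table names, and type substitution below are illustrative assumptions, not part of the original example:

from sqlalchemy import create_engine
from sqlalchemy.dialects import mysql

source_engine = create_engine('mysql+pymysql://user:pass@source-host/db')  # hypothetical DSN
sync_schema_to_dolt(source_engine,
                    repo,
                    table_map={'players': 'players'},
                    type_mapping={mysql.JSON: mysql.LONGTEXT})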
Example #15
def _import_helper(repo: Dolt, table_name: str,
                   write_import_file: Callable[[str], None],
                   primary_keys: List[str], import_mode: str) -> None:
    import_modes = IMPORT_MODES_TO_FLAGS.keys()
    if import_mode is not None:
        assert import_mode in import_modes, 'import_mode must be one of: {}'.format(
            import_modes)
    else:
        if table_name in [table.name for table in repo.ls()]:
            logger.info(
                'No import mode specified, table exists, using "{}"'.format(
                    UPDATE))
            import_mode = UPDATE
        else:
            logger.info(
                'No import mode specified, table does not exist, using "{}"'.format(
                    CREATE))
            import_mode = CREATE

    if import_mode == CREATE and primary_keys is None:
        raise ValueError(
            'Import mode CREATE requires a primary key to be specified')

    import_flags = IMPORT_MODES_TO_FLAGS[import_mode]
    logger.info(
        'Importing to table {} in dolt directory located in {}, import mode {}'
        .format(table_name, repo.repo_dir(), import_mode))

    fname = tempfile.mktemp(suffix='.csv')
    try:
        write_import_file(fname)
        args = ['table', 'import', table_name] + import_flags
        if import_mode == CREATE:
            args += ['--pk={}'.format(','.join(primary_keys))]

        repo.execute(args + [fname])
    finally:
        if os.path.exists(fname):
            os.remove(fname)
Example #16
def _create_table_inferred(repo: Dolt, table_name: str,
                           data: Mapping[str,
                                         List[Any]], primary_keys: List[str]):
    # generate and execute a create table statement
    cols_to_types = {}
    for col_name, list_of_values in data.items():
        # Use the first non-null value to infer the column type
        first_non_null = None
        for val in list_of_values:
            if val is not None:
                first_non_null = val
                break
        if first_non_null is None:
            raise ValueError(
                'Cannot provide an empty list, types cannot be inferred')
        cols_to_types[col_name] = _get_col_type(first_non_null, list_of_values)

    metadata = MetaData(bind=repo.get_engine())
    table = _get_table_def(metadata, table_name, cols_to_types, primary_keys)
    table.create()
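A minimal sketch of the inferred-schema path, assuming a column-major dict like the one import_dict (Example #25) consumes:

data = {'id': [1, 2], 'name': ['Roger', 'Rafael']}
_create_table_inferred(repo, 'players', data, primary_keys=['id'])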
Example #17
def _test_dolt_table_reader_helper(repo: Dolt,
                                   table: Table,
                                   build_table_reader: Callable[[str], Callable[[str, Dolt], DoltTableUpdate]],
                                   get_expected: Callable[[int], Tuple[List[dict], List[dict]]]):
    commits = list(repo.log().keys())
    update_to_commit = {
        FIRST_UPDATE: commits[4],
        SECOND_UPDATE: commits[3],
        THIRD_UPDATE: commits[2],
        FOURTH_UPDATE: commits[1],
        FIFTH_UPDATE: commits[0]
    }

    for update_num, commit in update_to_commit.items():
        logger.info('comparison for commit/update_num {}/{}'.format(commit, update_num))
        dropped_pks, dolt_data = build_table_reader(commit)(str(table.name), repo)
        expected_dropped_pks, expected_data = get_expected(update_num)
        assert expected_dropped_pks == dropped_pks
        assert_rows_equal(expected_data, list(dolt_data))
Example #18
def get_from_commit_to_commit(repo: Dolt,
                              commit_ref: str = None) -> Tuple[str, str]:
    """
    Given a repo and a commit, returns that commit and its parent; if no commit is provided, the head and the parent
    of head are returned.
    :param repo:
    :param commit_ref:
    :return:
    """
    commits = list(repo.log().keys())
    commit_ref_index = None
    if not commit_ref:
        commit_ref_index = 0
    else:
        for i, commit in enumerate(commits):
            if commit == commit_ref:
                commit_ref_index = i
                break
    assert commit_ref_index is not None, 'commit_ref not found in commit index'
    return commits[commit_ref_index + 1], commits[commit_ref_index]
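For instance, to get the commit pair spanning the most recent change (this assumes the repo log contains at least two commits):

from_commit, to_commit = get_from_commit_to_commit(repo)
print('diffing {}..{}'.format(from_commit, to_commit))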
Example #19
def _create_table_from_schema_import_helper(
        repo: Dolt,
        table: str,
        pks: List[str],
        path: str,
        transformers: List[DataframeTransformer] = None,
        commit: bool = True,
        commit_message: str = None):
    if transformers:
        fp = tempfile.NamedTemporaryFile(suffix='.csv')
        temp = pd.read_csv(path)
        transformed = _apply_df_transformers(temp, transformers)
        transformed.to_csv(fp.name, index=False)
        path = fp.name

    repo.schema_import(table=table, pks=pks, filename=path, create=True)

    if commit:
        message = commit_message or 'Creating table {}'.format(table)
        repo.add(table)
        repo.commit(message)
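A hedged invocation sketch; the CSV path and table details are hypothetical:

_create_table_from_schema_import_helper(repo,
                                        table='players',
                                        pks=['id'],
                                        path='data/players.csv',  # hypothetical local CSV
                                        commit=True)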
Example #20
def test_bad_repo_path(tmp_path):
    bad_repo_path = tmp_path
    with pytest.raises(AssertionError):
        Dolt(bad_repo_path)
Example #21
def test_config_global(init_empty_test_repo):
    _ = init_empty_test_repo
    current_global_config = Dolt.config_global(list=True)
    test_username, test_email = 'test_user', 'test_email'
    Dolt.config_global(add=True, name='user.name', value=test_username)
    Dolt.config_global(add=True, name='user.email', value=test_email)
    updated_config = Dolt.config_global(list=True)
    assert updated_config['user.name'] == test_username and updated_config[
        'user.email'] == test_email
    Dolt.config_global(add=True,
                       name='user.name',
                       value=current_global_config['user.name'])
    Dolt.config_global(add=True,
                       name='user.email',
                       value=current_global_config['user.email'])
    reset_config = Dolt.config_global(list=True)
    assert reset_config['user.name'] == current_global_config['user.name']
    assert reset_config['user.email'] == current_global_config['user.email']
Example #22
def _verify_branches(repo: Dolt, branch_list: List[str]):
    _, branches = repo.branch()
    assert set(branch.name for branch in branches) == set(branch_list)
Example #23
    "Joseph R. Biden and\nKamala D. Harris (D)": "JOSEPH BIDEN",
    "Jo Jorgensen and\nJeremy \"Spike\" Cohen (L)": "JO JORGENSEN",
    "Donald J. Trump and\nMichael R. Pence (R)": "DONALD TRUMP",
    "Brock Pierce and\nKarla Ballard (I)": "BROCK PIERCE",
    "Write-Ins": "WRITE-IN"
}

party_chart: dict = {
    "JOSEPH BIDEN": "DEMOCRATIC",
    "JO JORGENSEN": "LIBERTARIAN",
    "DONALD TRUMP": "REPUBLICAN",
    "BROCK PIERCE": "NEW YORK INDEPENDENCE PARTY",
    "WRITE-IN": "None"
}

repo = Dolt("working/us-president-precinct-results")
sql_file = open(
    "working/us-president-precinct-results/sql-import-me-wyoming.sql",
    mode="w")

wyoming_precincts: List[str] = []
with open("./working/us-president-precinct-results/wyoming-precincts.csv",
          'r') as f:
    for line in f:
        # print(line, end='')
        wyoming_precincts.append(line)

for file in os.listdir("working/Wyoming_General_CSV/"):
    if "County" not in file:
        continue
Example #24
def get_raw_data(repo: Dolt):
    return pd.concat([
        repo.read_table(MENS_MAJOR_COUNT).assign(gender='mens'),
        repo.read_table(WOMENS_MAJOR_COUNT).assign(gender='womens')
    ])
Example #25
def import_dict(repo: Dolt,
                table_name: str,
                data: Mapping[str, List[Any]],
                primary_keys: List[str] = None,
                import_mode: str = None,
                batch_size: int = DEFAULT_BATCH_SIZE):
    """
    Provides a column major interface for writing Python data structures to Dolt, specifically data should be a dict
    where the keys are column names and the values are equal length lists of values to be written to Dolt. The lists
    must consist of:
        - values that match the type of the table in the schema of the table being written to
        - values of the same type that can be coalesced to a Python type by the (very limited) type inference logic
          for generating a schema from a data structure

    Note it is necessary for all lists to be of the same length since we must coalesce the lists into rows, and that
    doesn't really make sense when the lists are not of the same length.

    Let's proceed with the example of creating a simple table and showing how to write some data structures:
        CREATE TABLE players (id INT, name VARCHAR(16), PRIMARY KEY (id))

    Now write in update mode:
    >>> dict_of_lists = {'id': [1, 2], 'name': ['Roger', 'Rafael']}
    >>> import_dict(repo, 'players', dict_of_lists, import_mode='update')

    Alternatively we can let the Python code infer a schema:
    >>> import_dict(repo, 'players', dict_of_lists, ['id'], import_mode='create')

    Assertions:
        - all list values are of equal length
        - when inferring a schema each list value has elements of a type that can be mapped to a SQL type, the logic is
          currently very limited
        - when inferring a schema, primary keys must be provided

    This function requires the Dolt SQL server to be running on the host and port provided, defaulting to
    127.0.0.1:3306.

    :param repo:
    :param table_name:
    :param data:
    :param primary_keys:
    :param import_mode:
    :param batch_size:
    :return:
    """
    assert import_mode in [UPDATE, CREATE]

    # Grab some basic information about the data
    assert data, 'Cannot provide an empty dictionary'
    row_count = len(list(data.values())[0])
    assert row_count > 0, 'Must provide at least a single row'
    assert all(len(val_list) == row_count for val_list in
               data.values()), 'Must provide value lists of uniform length'

    # If the table does not exist, create it using type inference to build a create statement
    if import_mode == CREATE:
        assert primary_keys, 'primary_keys need to be provided when inferring a schema'
        _create_table_inferred(repo, table_name, data, primary_keys)

    rows = []
    for i in range(row_count):
        rows.append({col: data[col][i] for col in data.keys()})

    clean_rows = coerce_dates(rows)

    logger.info('Inserting {row_count} rows into table {table_name}'.format(
        row_count=row_count, table_name=table_name))

    metadata = MetaData(bind=repo.get_engine())
    metadata.reflect()
    table = metadata.tables[table_name]

    for i in range(max(1, math.ceil(len(clean_rows) / batch_size))):
        batch_start = i * batch_size
        batch_end = min((i + 1) * batch_size, len(clean_rows))
        batch = clean_rows[batch_start:batch_end]
        logger.info('Writing records {} through {} of {} rows to Dolt'.format(
            batch_start, batch_end, len(clean_rows)))
        with repo.get_engine().connect() as conn:
            conn.execute(table.insert(), batch)
Example #26
    def inner(repo: Dolt):
        current_branch, current_branch_list = repo.branch()
        original_branch = current_branch.name

        if branch != original_branch and not commit:
            raise ValueError(
                'If writes are to another branch, and commit is not True, writes will be lost'
            )

        if current_branch.name != branch:
            logger.info('Current branch is {}, checking out {}'.format(
                current_branch.name, branch))
            if branch not in [b.name for b in current_branch_list]:
                logger.info('{} does not exist, creating'.format(branch))
                repo.branch(branch_name=branch)
            repo.checkout(branch)

        if transaction_mode:
            raise NotImplementedError(
                'transaction_mode is not yet implemented')

        tables_updated = [writer(repo) for writer in writers]

        if commit:
            if not repo.status().is_clean:
                logger.info(
                    'Committing to repo located in {} for tables:\n{}'.format(
                        repo.repo_dir(), tables_updated))
                for table in tables_updated:
                    repo.add(table)
                repo.commit(message)

            else:
                logger.warning('No changes to repo in:\n{}'.format(
                    repo.repo_dir()))

        current_branch, branches = repo.branch()
        if original_branch != current_branch.name:
            logger.info(
                'Checked out {} from {}, checking out {} to restore state'.
                format(current_branch.name, original_branch, original_branch))
            repo.checkout(original_branch)

        return branch
Example #27
def _dolt_table_read_helper(repo: Dolt, table_name: str):
    table = get_table_metadata(repo.get_engine(), table_name)
    with repo.get_engine().connect() as conn:
        result = conn.execute(table.select())
        return [dict(row) for row in result]
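Usage is a one-liner (the 'players' table name is hypothetical):

rows = _dolt_table_read_helper(repo, 'players')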