# Example No. 1
def run_command(command, env=None):
    "Runs a shell command and streams output to the log"

    env = env or {}

    # The child inherits the full parent environment, with any
    # caller-supplied variables layered on top.
    child_env = {**os.environ, **env}

    with subprocess.Popen(
            shlex.split(command),
            env=child_env,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # interleave stderr into the same stream
    ) as proc:
        # Relay the child's output line by line until EOF and process exit.
        while True:
            line = proc.stdout.readline().decode("utf-8")
            if line:
                LOG.info(line.strip())
            elif proc.poll() is not None:
                # Empty read + finished process means the stream is drained.
                break

        return_code = proc.poll()
    if return_code != 0:
        raise RunCommandError(
            "Error running shell command, please see log for details.")
# Example No. 2
def clean_target_test_data(engine, api):
    "Removes target data from the test database that might be left over from a previous test run"
    inspector = reflection.Inspector.from_engine(engine)
    spec_targets = api.spec["targets"].keys()

    # Target names appear to be dot-qualified; the second component is the
    # schema/namespace — TODO confirm against the spec format.
    schemas = {target.split(".")[1] for target in spec_targets}

    for schema in schemas:
        execute_sqls(engine, [f"CREATE SCHEMA IF NOT EXISTS {schema}"])

        existing_tables = engine.table_names(schema=schema)
        LOG.debug("Found existing tables: %s", existing_tables)

        existing_views = list(inspector.get_view_names(schema=schema))
        LOG.debug("Found existing views: %s", existing_views)

        # Drop only the targets whose unqualified name currently exists,
        # distinguishing tables from views.
        drop_tables = [
            f"DROP TABLE IF EXISTS {target}"
            for target in spec_targets
            if target.split(".")[-1] in existing_tables
        ]
        execute_sqls(engine, drop_tables)

        drop_views = [
            f"DROP VIEW IF EXISTS {target}"
            for target in spec_targets
            if target.split(".")[-1] in existing_views
        ]
        execute_sqls(engine, drop_views)
# Example No. 3
def load_test_data(source_engines, api, schemas_path):
    "Loads test data generated by dtspec into the test databases"

    schema_metadata = read_sa_metadata(schemas_path)
    source_fqn_to_sa = _source_fqn_to_sa(source_engines, schema_metadata)

    # Batch TRUNCATE and INSERT statements per source environment so each
    # engine gets exactly one execute_sqls call per statement kind.
    truncate_by_env_sqls = {env: [] for env in source_engines}
    insert_by_env_sqls = {env: [] for env in source_engines}

    for source_name, data in api.spec["sources"].items():
        try:
            this_source_meta = source_fqn_to_sa[source_name]
        except KeyError as err:
            raise KeyError(
                f"Unable to find source {source_name} in schema metadata: {source_fqn_to_sa.keys()}"
            ) from err

        source_insert = (
            this_source_meta["sa_table"]
            .insert(bind=this_source_meta["engine"])
            .values(sa_serialize(data.serialize()))
        )

        source_env = this_source_meta["env"]
        truncate_by_env_sqls[source_env].append(
            f"TRUNCATE {source_name}; ")

        # Skip the insert entirely when the spec provides no rows.
        if len(data.serialize()) > 0:
            insert_by_env_sqls[source_env].append(source_insert)

    for env, source_engine in source_engines.items():
        LOG.info("Loading test data into source test environment %s", env)
        execute_sqls(engine=source_engine, sqls=truncate_by_env_sqls[env])
        execute_sqls(engine=source_engine, sqls=insert_by_env_sqls[env])
# Example No. 4
def _init_test_db(config, env=None, clean=False):
    """Initialize the test database for one source environment.

    Looks up the environment's "test" connection config and delegates the
    actual initialization to dtspec.db.init_test_db.
    """
    LOG.info("initializing test db env: %s", env)
    test_engine = _engine_from_config(
        config["source_environments"][env]["test"])

    dtspec.db.init_test_db(
        env=env,
        engine=test_engine,
        schemas_path=SCHEMAS_PATH,
        clean=clean,
    )
# Example No. 5
def _reflect_table(metadata, engine, namespace, table_name):
    """Autoload a single table's schema from the live database into metadata."""
    LOG.info("Reflecting table %s.%s", namespace, table_name)
    reflect_kwargs = {
        "autoload": True,
        "autoload_with": engine,
        "schema": namespace,
        # Don't chase foreign keys into tables we weren't asked to reflect.
        "resolve_fks": False,
    }
    return sa.Table(table_name, metadata, **reflect_kwargs)
# Example No. 6
def reflect(env, engine, output_path, namespace="public", tables=None):
    "Reflects all specified tables and saves the table schemas as yaml files"

    metadata = sa.MetaData()

    # Intersect the caller's table list (empty means "none selected") with
    # what actually exists in the namespace.
    available_tables = _reflect_table_names(engine, namespace)
    chosen_tables = _select_tables(tables or [], available_tables)
    LOG.debug("Reflecting tables: %s", chosen_tables)

    _reflect_tables(metadata, engine, namespace, chosen_tables)
    _write_yaml(output_path, env, namespace, metadata)
# Example No. 7
def get_actuals(engine, api):
    """Extracts data from the targets of the data transformation and serializes
    them for comparison with expected values.

    Returns a dict keyed by target name; each value holds the stringified
    "records" and the target's column names.
    """

    serialized_actuals = {}
    with engine.connect() as conn:
        for target in api.spec["targets"].keys():
            LOG.info("Fetching actual data for target %s", target)
            # NOTE(review): target names are interpolated straight into SQL;
            # the spec is assumed to be trusted input.
            result = conn.execute(f"SELECT * FROM {target}")
            # Take column names from the cursor metadata instead of
            # sa_results[0].keys(), which raised IndexError for empty targets.
            columns = list(result.keys())
            sa_results = result.fetchall()
            serialized_actuals[target] = {
                "records": [
                    {key: _stringify_sa_value(val) for key, val in row.items()}
                    for row in sa_results
                ],
                # "columnns" is a typo, but it is a runtime dict key that
                # consumers may depend on — kept as-is; rename in coordination
                # with callers.
                "columnns": columns,
            }
    return serialized_actuals
# Example No. 8
def read_sa_metadata(schema_path):
    """
    Reads SQLAlchemy schema metadata saved in yaml files.  Returns a dictionary with the
    following structure:
    {
        'environment name 1': {
            'namespace 1': {
                'table 1': sqlalchemy.Table object,
                'table 2': sqlalchemy.Table object,
            },
            'namespace 2': {
                ...
            },
        },
        'environment name 2': {
            ...
        },
    }
    """

    LOG.debug("Reading schema metadata from path %s", schema_path)

    metadata = sa.MetaData()
    schemas = {}
    # Schema files are named "<env>.<namespace>.schema.yml".  Dots are escaped
    # (the original pattern used bare "." wildcards) and the pattern is
    # anchored so unexpected filenames fail loudly below instead of producing
    # an opaque AttributeError on a None match.
    filename_re = re.compile(r"^([^.]+)\.([^.]+)\.schema\.yml$")

    for yaml_file in glob.glob(os.path.join(schema_path, "*.schema.yml")):
        LOG.debug("Reading schema metadata from %s", yaml_file)
        yaml_basename = os.path.basename(yaml_file)

        parsed_filename = filename_re.match(yaml_basename)
        if parsed_filename is None:
            raise ValueError(
                f"Unexpected schema filename (want <env>.<namespace>.schema.yml): {yaml_basename}"
            )
        env, namespace = parsed_filename.groups()

        with open(yaml_file, "r") as yfile:
            yaml_txt = yfile.read()

        # SECURITY: unsafe_load can instantiate arbitrary Python objects.
        # These files are generated locally by `reflect`, so they are treated
        # as trusted — never point this function at untrusted input.
        schema_def = yaml.unsafe_load(yaml_txt)

        schemas.setdefault(env, {})[namespace] = {
            table_name: _sa_table_from_yaml(metadata, namespace, table_name,
                                            table_def)
            for table_name, table_def in schema_def.items()
        }

    return schemas
# Example No. 9
def _fetch_schema(config, env):
    """Reflect and persist table schemas for one source environment."""
    LOG.info("fetching schemas for env: %s", env)

    env_config = config["source_environments"][env]
    engine = _engine_from_config(env_config["schema"])

    # Ensure the schemas directory exists before writing yaml files into it.
    schemas_dir = os.path.join(DTSPEC_ROOT, "schemas")
    pathlib.Path(schemas_dir).mkdir(parents=True, exist_ok=True)

    for namespace, tables in env_config["tables"].items():
        dtspec.db.reflect(
            env=env,
            engine=engine,
            output_path=schemas_dir,
            namespace=namespace,
            tables=tables,
        )
# Example No. 10
def run_dbt(
    cmd="run",
    profiles_dir=None,
    target="dev",
    models=None,
    exclude=None,
    full_refresh=False,
    env=None,
    partial_parse=False,
):
    "Construct common dbt parameters and runs dbt in a shell"

    # Fall back to the DBT_PROFILES_DIR env var, then dbt's default location.
    profiles_dir = profiles_dir or os.environ.get("DBT_PROFILES_DIR",
                                                  "~/.dbt/")
    env = env or {}

    # Each optional flag renders to its dbt CLI form, or to an empty string
    # when unused (shlex in run_command collapses the resulting extra spaces).
    models_cmd = f"--models {models}" if models else ""
    exclude_cmd = f"--exclude {exclude}" if exclude else ""
    full_refresh_cmd = "--full-refresh" if full_refresh else ""
    partial_parse_cmd = "--partial-parse" if partial_parse else ""

    shell_cmd = f"dbt {partial_parse_cmd} {cmd} --profiles-dir={profiles_dir} --target={target} {full_refresh_cmd} {models_cmd} {exclude_cmd}"
    LOG.info("Running dbt via: %s", shell_cmd)

    try:
        run_command(shell_cmd, env=env)
    except RunCommandError as err:
        raise DbtRunError(
            f"dbt failed to {cmd} successfully, please see log for details"
        ) from err
# Example No. 11
 def worker_execute_sqls(engine, worker_sqls):
     with engine.connect().begin() as trans:
         for worker_sql in worker_sqls:
             LOG.debug("Executing sql: %s", worker_sql)
             trans.connection.execute(worker_sql)
         trans.commit()
# Example No. 12
def _get_actuals(config, api, target):
    """Fetch actual transformed data from one target test environment."""
    engine = _engine_from_config(config["target_environments"][target])
    LOG.info("Fetching results of run from target test environment %s", target)
    return dtspec.db.get_actuals(engine, api)
# Example No. 13
def _clean_target_test_data(config, api, target):
    """Remove leftover target data from one target test environment."""
    engine = _engine_from_config(config["target_environments"][target])
    LOG.info("Cleaning out target test data for target test environment %s", target)
    dtspec.db.clean_target_test_data(engine, api)