Example #1
def _get_rules_file_keys(
    key_type: str, matches: Dict[str, Dict[str, bool]], rules_file: str,
) -> Dict[str, List[str]]:
    result: Dict[str, List[str]] = {}
    if key_type not in ["primary-key", "replication-key"]:
        raise ValueError(
            f"Unexpected key type '{key_type}'. "
            "Expected: 'replication-key' or 'primary-key'"
        )
    # Check rules_file to fill `matches`
    plan_file_lines = uio.get_text_file_contents(rules_file).splitlines()
    key_overrides = [
        line.split("->")[0].rstrip()
        for line in plan_file_lines
        if "->" in line and line.split("->")[1].lstrip().rstrip() == key_type
    ]
    for key_spec in key_overrides:
        if len(key_spec.split(".")) != 2 or "*" in key_spec:
            raise ValueError(
                f"Expected '{key_type}' indicator with an exact two-part key, separated "
                f"by '.'. Found '{key_spec}'"
            )
        table_name, key_col_name = key_spec.split(".")
        if table_name not in matches:
            raise ValueError(f"Could not locate table '{table_name}' in selected list.")
        if key_col_name not in matches[table_name]:
            raise ValueError(f"Key column '{key_spec}' is not in column list.")
        elif not matches[table_name][key_col_name]:
            raise ValueError(f"Key column '{key_spec}' is not a selected column.")
        result[table_name] = [key_col_name]
    return result
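
A minimal standalone sketch (not the project's own API) of the `table.column -> key-type` override syntax parsed above. The rules text, table names, and column names are made up for illustration; the real implementation reads the rules file via uio and validates against the selected-columns matches.

from typing import Dict, List

rules_text = """
customers.customer_id -> primary-key
orders.updated_at     -> replication-key
"""
matches = {
    "customers": {"customer_id": True, "name": True},
    "orders": {"order_id": True, "updated_at": True},
}

def get_key_overrides(key_type: str, text: str) -> Dict[str, List[str]]:
    result: Dict[str, List[str]] = {}
    for line in text.splitlines():
        if "->" not in line or line.split("->")[1].strip() != key_type:
            continue
        table, column = line.split("->")[0].strip().split(".")
        if column not in matches.get(table, {}):
            raise ValueError(f"Key column '{table}.{column}' is not in column list.")
        result[table] = [column]
    return result

print(get_key_overrides("primary-key", rules_text))      # {'customers': ['customer_id']}
print(get_key_overrides("replication-key", rules_text))  # {'orders': ['updated_at']}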
Example #2
def change_upstream_source(
    dir_to_update=".",
    git_repo="https://github.com/slalom-ggp/dataops-infra",
    branch="master",
    relative_path="../../dataops-infra",
    to_relative=False,
    to_git=False,
    dry_run=False,
):
    """Change Terraform source"""
    if (to_relative and to_git) or not (to_relative or to_git):
        raise ValueError(
            "Must specify `--to_git` or `--to_relative`, but not both.")
    for tf_file in uio.list_local_files(dir_to_update, recursive=False):
        if tf_file.endswith(".tf"):
            # print(tf_file)
            new_lines = []
            for line in uio.get_text_file_contents(tf_file).splitlines():
                new_line = line
                if line.lstrip().startswith("source "):
                    current_path = line.lstrip().split('"')[1]
                    start_pos = max([
                        current_path.find("catalog/"),
                        current_path.find("components/")
                    ])
                    if start_pos > 0:
                        module_path = current_path[start_pos:].split(
                            "?ref=")[0]
                        if to_relative:
                            local_pattern = "{relative_path}/{path}"
                            new_path = local_pattern.format(
                                relative_path=relative_path, path=module_path)
                        elif to_git:
                            git_pattern = "git::{git_repo}//{path}?ref={branch}"
                            new_path = git_pattern.format(git_repo=git_repo,
                                                          path=module_path,
                                                          branch=branch)
                        if current_path == new_path:
                            print(f"{current_path} \n\t\t\t-> (unchanged)")
                        else:
                            print(f"{current_path} \n\t\t\t-> {new_path}")
                        new_line = f'  source = "{new_path}"'
                new_lines.append(new_line)
            new_file_text = "\n".join(new_lines)
            if dry_run:
                print(f"\n\n------------\n-- {tf_file}\n------------")
                print(new_file_text)
            else:
                uio.create_text_file(tf_file, new_file_text)
    if not dry_run:
        runnow.run("terraform fmt -recursive", dir_to_update)
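
A standalone sketch of the source-path rewrite performed above: locate the "catalog/" or "components/" module path inside a Terraform `source` value, strip any "?ref=" suffix, and re-render it as either a relative path or a git path. The paths below are made-up illustrations.

current_path = "git::https://github.com/slalom-ggp/dataops-infra//catalog/aws/redshift?ref=master"
start_pos = max(current_path.find("catalog/"), current_path.find("components/"))
module_path = current_path[start_pos:].split("?ref=")[0]  # "catalog/aws/redshift"

relative = "{relative_path}/{path}".format(
    relative_path="../../dataops-infra", path=module_path)
git = "git::{git_repo}//{path}?ref={branch}".format(
    git_repo="https://github.com/slalom-ggp/dataops-infra", path=module_path, branch="master")
print(relative)  # ../../dataops-infra/catalog/aws/redshift
print(git)       # git::https://github.com/slalom-ggp/dataops-infra//catalog/aws/redshift?ref=master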
Example #3
def start_jupyter(nb_directory="/home/jovyan/work", nb_token="qwerty123"):
    jupyter_run_command = (
        f"jupyter lab"
        f" --NotebookApp.notebook_dir='{nb_directory}'"
        f" --NotebookApp.token='{nb_token}'"
        f" --allow-root"
    )
    log_file = "jupyter_log.txt"
    runnow.run(jupyter_run_command, daemon=True, log_file_path=log_file)
    time.sleep(5)
    logging.info("\nJUPYTER_LOG:".join(uio.get_text_file_contents(log_file).splitlines()))
    logging.info(
        f"Jupyter notebooks server started at: https://localhost:8888/?token={nb_token}"
    )
Example #4
def _get_plugins_list(
    plugins_index: Optional[str] = None,
) -> List[Tuple[str, str, str]]:
    plugins_index = plugins_index or SINGER_PLUGINS_INDEX
    if not uio.file_exists(plugins_index):
        raise RuntimeError(f"No file found at '{plugins_index}'. "
                           "Please set SINGER_PLUGINS_INDEX and try again.")
    yml_doc = yaml.safe_load(uio.get_text_file_contents(plugins_index))
    taps = yml_doc["singer-taps"]
    targets = yml_doc["singer-targets"]
    list_of_tuples = []
    plugins = taps + targets
    for plugin in plugins:
        list_of_tuples.append((
            plugin["name"],
            plugin.get("source", None),
            plugin.get("alias", None),
        ))
    return list_of_tuples
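
Judging from the keys read above, the plugins index is a YAML document with "singer-taps" and "singer-targets" lists whose entries carry a "name" plus optional "source" and "alias" fields. A minimal standalone sketch with an inline, made-up index:

import textwrap

import yaml

yml_doc = yaml.safe_load(textwrap.dedent(
    """
    singer-taps:
      - name: tap-covid-19
        source: tap-covid-19
    singer-targets:
      - name: target-csv
        alias: csv
    """
))
plugins = yml_doc["singer-taps"] + yml_doc["singer-targets"]
list_of_tuples = [
    (plugin["name"], plugin.get("source"), plugin.get("alias")) for plugin in plugins
]
print(list_of_tuples)
# [('tap-covid-19', 'tap-covid-19', None), ('target-csv', None, 'csv')]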
Example #5
def _discover(
    tap_name: str,
    taps_dir: str,
    *,
    config_file: str,
    catalog_dir: str,
    dockerized: bool,
    tap_exe: str,
) -> None:
    catalog_file = config.get_raw_catalog_file(
        taps_dir, catalog_dir, tap_name, allow_custom=False
    )
    uio.create_folder(catalog_dir)
    img = f"{docker.BASE_DOCKER_REPO}:{tap_exe}"
    hide_cmd = False
    if dockerized:
        cdw = os.getcwd().replace("\\", "/")
        tap_config = json.loads(uio.get_text_file_contents(config_file))
        tap_docker_args = ""
        # TODO: Replace with logic to parse from AWS_SHARED_CREDENTIALS_FILE env var:
        for k in ["aws_access_key_id", "aws_secret_access_key", "aws_session_token"]:
            if k in tap_config:
                key = f"TAP_{tap_name}_{k}".replace("-", "_").upper()
                os.environ[key] = tap_config[k]
                tap_docker_args += f' -e {k.upper()}="{tap_config[k]}"'
                hide_cmd = True
        _, _ = runnow.run(f"docker pull {img}")
        _, output_text = runnow.run(
            f"docker run --rm -i "
            f"-v {cdw}:/home/local {tap_docker_args} "
            f"{img} --config {config.dockerize_cli_args(config_file)} --discover",
            echo=False,
            capture_stderr=False,
            hide=hide_cmd,
        )
        if not _is_valid_json(output_text):
            raise RuntimeError(f"Could not parse json file from output:\n{output_text}")
        uio.create_text_file(catalog_file, output_text)
    else:
        runnow.run(
            f"{tap_exe} --config {config_file} --discover > {catalog_file}",
            hide=hide_cmd,
        )
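
The credential-forwarding loop above (reused in later examples) copies AWS keys found in the tap config into environment variables and into `-e` docker arguments, and suppresses command echoing so the secrets are not logged. A standalone sketch with a made-up tap name and config:

import os

tap_name = "s3-csv"  # made-up tap name for illustration
tap_config = {"aws_access_key_id": "AKIA...", "start_date": "2020-01-01"}
tap_docker_args, hide_cmd = "", False
for k in ["aws_access_key_id", "aws_secret_access_key", "aws_session_token"]:
    if k in tap_config:
        env_key = f"TAP_{tap_name}_{k}".replace("-", "_").upper()
        os.environ[env_key] = tap_config[k]
        tap_docker_args += f' -e {k.upper()}="{tap_config[k]}"'
        hide_cmd = True  # suppress echoing the command so the secret is not logged
print(tap_docker_args)  # prints: -e AWS_ACCESS_KEY_ID="AKIA..."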
Example #6
def _smart_split(dockerfile_path, image_name, addl_args=None):
    """
    Create two Dockerfiles from a single file.

    1. The first 'core' image will contain all statements until the first COPY or ADD.
    2. The second 'derived' image will pull from 'core' and complete the build using
       local files or artifacts required by ADD or COPY commands.

    Returns a list of tuples: [
        (partial_image_name, partial_dockerfile_text),
        (derived_image_name, derived_dockerfile_text),
    ]
    """
    orig_text = uio.get_text_file_contents(dockerfile_path)
    addl_args = addl_args or ""
    core_dockerfile = ""
    derived_dockerfile = ""
    requires_context = False  # Whether we need file context to determine output
    for line in orig_text.split("\n"):
        if any([line.startswith("COPY"), line.startswith("ADD")]):
            requires_context = True
        if not requires_context:
            core_dockerfile += line + "\n"
        else:
            derived_dockerfile += line + "\n"
    core_md5 = hashlib.md5(
        (addl_args + core_dockerfile).encode("utf-8")).hexdigest()
    full_md5 = hashlib.md5((addl_args + orig_text).encode("utf-8")).hexdigest()
    core_image_name = f"{image_name}:core-md5-{core_md5}"
    derived_image_name = f"{image_name}:md5-{full_md5}"

    core_dockerfile = (
        f"# DO NOT EDIT - file is generated automatically from `Dockerfile`\n\n"
        f"# Dockerfile.core - will be created and pushed as:\n"
        f"# \t{core_image_name}\n\n{core_dockerfile}")
    if derived_dockerfile:
        derived_dockerfile = (
            f"# DO NOT EDIT - file is generated automatically from `Dockerfile`\n\n"
            f"FROM {core_image_name}\n\n{derived_dockerfile}")
    else:
        derived_dockerfile = None  # No additional work to do.
    return [(core_image_name, core_dockerfile),
            (derived_image_name, derived_dockerfile)]
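
A standalone sketch of the split rule applied above: everything before the first COPY or ADD goes into the 'core' Dockerfile, everything from that point on goes into the 'derived' Dockerfile, and the image tags embed MD5 hashes of the respective contents (the extra-args handling is omitted here). The Dockerfile text and image name are made-up illustrations.

import hashlib

orig_text = (
    "FROM python:3.10-slim\n"
    "RUN pip install boto3\n"
    "COPY . /app\n"
    "RUN pip install -e /app\n"
)
core, derived, requires_context = "", "", False
for line in orig_text.split("\n"):
    if line.startswith(("COPY", "ADD")):
        requires_context = True  # from here on, the build needs local file context
    if not requires_context:
        core += line + "\n"
    else:
        derived += line + "\n"
core_md5 = hashlib.md5(core.encode("utf-8")).hexdigest()
full_md5 = hashlib.md5(orig_text.encode("utf-8")).hexdigest()
print(f"myimage:core-md5-{core_md5}")  # cacheable 'core' image, no build context needed
print(f"myimage:md5-{full_md5}")       # 'derived' image, built FROM the core image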
Example #7
def _create_selected_catalog(
    tap_name: str,
    plan_file: str,
    raw_catalog_file: str,
    output_file: str,
    replication_strategy: str,
    skip_senseless_validators: bool,
) -> None:
    taps_dir = config.get_taps_dir()
    catalog_dir = config.get_tap_output_dir(tap_name, taps_dir)
    output_file = output_file or os.path.join(catalog_dir, "selected-catalog.json")
    catalog_full = json.loads(Path(raw_catalog_file).read_text())
    plan_file = plan_file or config.get_plan_file(tap_name)
    plan = yaml.safe_load(uio.get_text_file_contents(plan_file))
    if ("selected_tables" not in plan) or (plan["selected_tables"] is None):
        raise ValueError(f"No selected tables found in plan file '{plan_file}'.")
    included_table_objects = []
    for tbl in sorted(catalog_full["streams"], key=lambda x: _get_stream_name(x)):
        stream_name = _get_stream_name(tbl)
        stream_id = _get_stream_id(tbl)
        if stream_name in plan["selected_tables"].keys() and stream_id == plan[
            "selected_tables"
        ][stream_name].get("stream_id", stream_name):
            _set_catalog_file_keys(tbl, plan["selected_tables"][stream_name])
            _select_table(tbl, replication_strategy=replication_strategy)
            for col_name in _get_catalog_table_columns(tbl):
                col_selected = col_name in (
                    (plan["selected_tables"][stream_name]["selected_columns"] or [])
                    + (plan["selected_tables"][stream_name]["replication_key"] or [])
                    + (plan["selected_tables"][stream_name]["primary_key"] or [])
                )
                _select_table_column(tbl, col_name, col_selected)
            if skip_senseless_validators:
                _remove_senseless_validators(tbl)
            included_table_objects.append(tbl)
    catalog_new = {"streams": included_table_objects}
    with open(output_file, "w") as f:
        json.dump(catalog_new, f, indent=2)
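
A standalone sketch of the column-selection test applied above: a column is selected if it appears in the plan entry's selected_columns, replication_key, or primary_key lists, any of which may be null. The plan entry and column names are made-up illustrations.

plan_entry = {
    "selected_columns": ["name", "created_at"],
    "replication_key": ["updated_at"],
    "primary_key": ["id"],
}
for col_name in ["id", "name", "updated_at", "internal_notes"]:
    col_selected = col_name in (
        (plan_entry["selected_columns"] or [])
        + (plan_entry["replication_key"] or [])
        + (plan_entry["primary_key"] or [])
    )
    print(col_name, col_selected)
# id True, name True, updated_at True, internal_notes False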
Example #8
def make_aggregate_state_file(raw_json_lines_file: str,
                              output_json_file: str) -> None:
    """
    Create a valid json state file from one or more json lines ('jsonl' format).

    Parameters
    ----------
    raw_json_lines_file : str
        Path to a jsonl (json lines) file containing one or more json documents to
        aggregate.
    output_json_file : str
        Path to use when saving the aggregated json file.
    """
    try:
        uio.create_text_file(
            output_json_file,
            get_aggregate_state(
                uio.get_text_file_contents(raw_json_lines_file)),
        )
    except ValueError as ex:
        raise ValueError(
            f"State file from '{raw_json_lines_file}' is not valid JSON or JSONL. "
            f"Please either delete or fix the file and then retry. {ex}")
Example #9
def _sync_one_table(
    tap_name: str,
    table_name: str,
    taps_dir: str,
    config_file: str,
    target_name: str,
    target_config_file: str,
    table_catalog_file: str,
    table_state_file: str,
    log_dir: str,
    dockerized: bool,
    tap_exe: str,
    target_exe: str,
) -> None:
    if not tap_exe:
        tap_exe = f"tap-{tap_name}"
    pipeline_version_num = config.get_pipeline_version_number()
    table_state_file = config.replace_placeholders(
        {"table_state_file": table_state_file},
        tap_name,
        table_name,
        pipeline_version_num,
    )["table_state_file"]
    tap_args = f"--config {config_file} --catalog {table_catalog_file} "
    if uio.file_exists(table_state_file):
        local_state_file_in = os.path.join(
            config.get_tap_output_dir(tap_name, taps_dir),
            f"{tap_name}-{table_name}-state.json",
        )
        if not uio.get_text_file_contents(table_state_file):
            logging.warning(
                f"Ignoring blank state file from '{table_state_file}'.")
        else:
            states.make_aggregate_state_file(table_state_file,
                                             local_state_file_in)
            tap_args += f" --state {local_state_file_in}"
        local_state_file_out = (
            f"{'.'.join(local_state_file_in.split('.')[:-1])}-new.json")
    else:
        local_state_file_out = os.path.join(
            config.get_tap_output_dir(tap_name, taps_dir),
            f"{tap_name}-{table_name}-state-new.json",
        )

    tmp_target_config = config.get_single_table_target_config_file(
        target_name,
        target_config_file,
        tap_name=tap_name,
        table_name=table_name,
        pipeline_version_num=pipeline_version_num,
    )
    target_args = f"--config {tmp_target_config} "
    hide_cmd = False
    if dockerized:
        cdw = os.getcwd().replace("\\", "/")
        tap_image_name = docker._get_docker_tap_image(tap_exe)
        target_image_name = docker._get_docker_tap_image(target_exe=target_exe)
        _, _ = runnow.run(f"docker pull {tap_image_name}")
        _, _ = runnow.run(f"docker pull {target_image_name}")

        tap_config = json.loads(uio.get_text_file_contents(config_file))
        target_config = json.loads(
            uio.get_text_file_contents(target_config_file))
        tap_docker_args = ""
        target_docker_args = ""
        # TODO: Replace with logic to parse from AWS_SHARED_CREDENTIALS_FILE env var:
        for k in [
                "aws_access_key_id", "aws_secret_access_key",
                "aws_session_token"
        ]:
            if k in tap_config:
                key = f"TAP_{tap_name}_{k}".replace("-", "_").upper()
                os.environ[key] = tap_config[k]
                tap_docker_args += f' -e {k.upper()}="{tap_config[k]}"'
                hide_cmd = True
            if k in target_config:
                key = f"TARGET_{target_name}_{k}".replace("-", "_").upper()
                os.environ[key] = target_config[k]
                target_docker_args += f' -e {k.upper()}="{target_config[k]}"'
                hide_cmd = True
        sync_cmd = (
            f"docker run --rm -i -v {cdw}:/home/local {tap_docker_args} {tap_image_name} "
            f"{config.dockerize_cli_args(tap_args)} "
            "| "
            f"docker run --rm -i -v {cdw}:/home/local {target_docker_args} {target_image_name} "
            f"{config.dockerize_cli_args(target_args)} "
            ">> "
            f"{local_state_file_out}")
    else:
        sync_cmd = (f"{tap_exe} "
                    f"{tap_args} "
                    "| "
                    f"{target_exe} "
                    f"{target_args} "
                    "> "
                    f"{local_state_file_out}")
    runnow.run(sync_cmd, hide=hide_cmd)
    if not uio.file_exists(local_state_file_out):
        logging.warning(
            f"State file does not exist at path '{local_state_file_out}'. Skipping upload. "
            f"This can be caused by having no data, or no new data, in the source table."
        )
    else:
        uio.upload_file(local_state_file_out, table_state_file)
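
Stripped of the docker wrapping and state handling, the sync command assembled above is a plain Singer pipe: the tap writes records to stdout, the target consumes them and emits state, which is redirected to a file. A sketch with made-up executable names and file paths:

tap_exe, target_exe = "tap-covid-19", "target-csv"  # made-up plugin names
tap_args = "--config tap-config.json --catalog selected-catalog.json"
target_args = "--config target-config.json"
local_state_file_out = "state-new.json"
sync_cmd = (
    f"{tap_exe} {tap_args} "
    f"| {target_exe} {target_args} "
    f"> {local_state_file_out}"
)
print(sync_cmd)
# tap-covid-19 --config tap-config.json --catalog selected-catalog.json | target-csv --config target-config.json > state-new.json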
Example #10
def update_module_docs(
    tf_dir: str,
    *,
    recursive: bool = True,
    readme: str = "README.md",
    footer: bool = True,
    header: bool = True,
    special_case_words: List[str] = None,
    extra_docs_names: List[str] = ["USAGE.md", "NOTES.md"],
    git_repo: str = "https://github.com/slalom-ggp/dataops-infra",
):
    """
    Replace all README.md files with auto-generated documentation, a wrapper
    around the `terraform-docs` tool.

    Parameters:
    ----------
    tf_dir: Directory of terraform scripts to document.
    recursive : Optional (default=True). 'True' to run on all subdirectories, recursively.
    readme : Optional (default="README.md"). The filename to create when generating docs.
    footer: Optional (default=True). 'True' to include the standard footer.
    header: Optional (default=True). 'True' to include the standard header.
    special_case_words: Optional. A list of words to override special casing rules.
    extra_docs_names: (Optional.) A list of filenames which, if found, will be appended
      to each module's README.md file.
    git_repo: Optional. The git repo path to use in rendering 'source' paths.

    Returns:
    -------
    None
    """
    markdown_text = ""
    if ".git" not in tf_dir and ".terraform" not in tf_dir:
        tf_files = [
            x for x in uio.list_local_files(tf_dir, recursive=False)
            if x.endswith(".tf")
        ]
        extra_docs = [
            x for x in uio.list_local_files(tf_dir, recursive=False)
            if extra_docs_names and os.path.basename(x) in extra_docs_names
        ]
        if tf_files:
            module_title = _proper(os.path.basename(tf_dir),
                                   special_case_words=special_case_words)
            parent_dir_name = os.path.basename(Path(tf_dir).parent)
            if parent_dir_name != ".":
                module_title = _proper(
                    f"{parent_dir_name} {module_title}",
                    special_case_words=special_case_words,
                )
            module_path = tf_dir.replace(".",
                                         "").replace("//",
                                                     "/").replace("\\", "/")
            _, markdown_output = runnow.run(
                f"terraform-docs md --no-providers --sort-by-required {tf_dir}",
                echo=False,
            )
            if header:
                markdown_text += DOCS_HEADER.format(module_title=module_title,
                                                    module_path=module_path)
            markdown_text += markdown_output
            for extra_file in extra_docs:
                markdown_text += uio.get_text_file_contents(extra_file) + "\n"
            if footer:
                markdown_text += DOCS_FOOTER.format(src="\n".join([
                    "* [{file}]({repo}/tree/master/{dir}/{file})".format(
                        repo=git_repo,
                        dir=module_path,
                        file=os.path.basename(tf_file),
                    ) for tf_file in tf_files
                ]))
            uio.create_text_file(f"{tf_dir}/{readme}", markdown_text)
    if recursive:
        for folder in uio.list_local_files(tf_dir, recursive=False):
            if os.path.isdir(folder):
                update_module_docs(folder, recursive=recursive, readme=readme)
Example #11
def update_module_docs(
    tf_dir: str,
    *,
    recursive: bool = True,
    readme: str = "README.md",
    footer: bool = True,
    header: bool = True,
    special_case_words: List[str] = None,
    extra_docs_names: List[str] = ["USAGE.md", "NOTES.md"],
    git_repo: str = "https://github.com/slalom-ggp/dataops-infra",
) -> None:
    """
    Replace all README.md files with auto-generated documentation

    This is a wrapper around the `terraform-docs` tool.

    Parameters
    ----------
    tf_dir : str
        Directory of terraform scripts to document.
    recursive : bool, optional
        Run on all subdirectories, recursively. By default True.
    readme : str, optional
        The filename to create when generating docs, by default "README.md".
    footer : bool, optional
        Include the standard footer, by default True.
    header : bool, optional
        Include the standard header, by default True.
    special_case_words : List[str], optional
        A list of words to override special casing rules, by default None.
    extra_docs_names : List[str], optional
        A list of filenames which, if found, will be appended to each
        module's README.md file, by default ["USAGE.md", "NOTES.md"].
    git_repo : str, optional
        The git repo path to use in rendering 'source' paths, by
        default "https://github.com/slalom-ggp/dataops-infra".
    """
    markdown_text = ""
    if ".git" not in tf_dir and ".terraform" not in tf_dir:
        tf_files = [
            x for x in uio.list_local_files(tf_dir, recursive=False)
            if x.endswith(".tf")
        ]
        extra_docs = [
            x for x in uio.list_local_files(tf_dir, recursive=False)
            if extra_docs_names and os.path.basename(x) in extra_docs_names
        ]
        if tf_files:
            module_title = _proper(os.path.basename(tf_dir),
                                   special_case_words=special_case_words)
            parent_dir_name = os.path.basename(Path(tf_dir).parent)
            # parent_title = _proper(
            #     parent_dir_name, special_case_words=special_case_words,
            # )
            module_title = _proper(
                f"{parent_dir_name} {module_title}",
                special_case_words=special_case_words,
            )
            module_path = tf_dir.replace(".",
                                         "").replace("//",
                                                     "/").replace("\\", "/")
            _, markdown_output = runnow.run(
                f"terraform-docs markdown document --sort=false {tf_dir}",
                # " --no-requirements"
                echo=False,
            )
            if "components" in module_path.lower():
                module_type = "Components"
            elif "catalog" in module_path.lower():
                module_type = "Catalog"
            else:
                module_type = "Other"
            if header:
                markdown_text += DOCS_HEADER.format(
                    module_title=module_title,
                    module_path=module_path,
                    module_type=module_type,
                )
            markdown_text += markdown_output
            for extra_file in extra_docs:
                markdown_text += uio.get_text_file_contents(extra_file) + "\n"
            if footer:
                markdown_text += DOCS_FOOTER.format(src="\n".join([
                    "* [{file}]({repo}/tree/main/{dir}/{file})".format(
                        repo=git_repo,
                        dir=module_path,
                        file=os.path.basename(tf_file),
                    ) for tf_file in tf_files
                ]))
            uio.create_text_file(f"{tf_dir}/{readme}", markdown_text)
    if recursive:
        for folder in uio.list_local_files(tf_dir, recursive=False):
            if os.path.isdir(folder):
                update_module_docs(folder, recursive=recursive, readme=readme)
Example #12
def _infer_schema(
    tap_name: str,
    taps_dir: str,
    raw_catalog_file: str,
    selected_catalog_file: str,
    *,
    config_file: str,
    catalog_dir: str,
    dockerized: bool,
    tap_exe: str,
) -> str:
    custom_catalog = json.loads(uio.get_text_file_contents(raw_catalog_file))
    tmp_folder = f"{catalog_dir}/tmp"
    tmp_outfile = f"{catalog_dir}/tmp/sync-dryrun.jsonl"
    uio.create_folder(catalog_dir)
    uio.create_folder(tmp_folder)
    logging.info(f"Cleaning up old files in tmp folder '{tmp_folder}'...")
    for file in uio.list_files(tmp_folder):
        if any(
            [
                file.endswith(x)
                for x in ["-config.json", "-dryrun.jsonl", "-table.inferred.json"]
            ]
        ):
            uio.delete_file(file)
    img = f"{docker.BASE_DOCKER_REPO}:{tap_exe}"
    hide_cmd = False
    if dockerized:
        cdw = os.getcwd().replace("\\", "/")
        tap_config = json.loads(uio.get_text_file_contents(config_file))
        tap_docker_args = ""
        # TODO: Replace with logic to parse from AWS_SHARED_CREDENTIALS_FILE env var:
        for k in ["aws_access_key_id", "aws_secret_access_key", "aws_session_token"]:
            if k in tap_config:
                key = f"TAP_{tap_name}_{k}".replace("-", "_").upper()
                os.environ[key] = tap_config[k]
                tap_docker_args += f' -e {k.upper()}="{tap_config[k]}"'
                hide_cmd = True
        _, _ = runnow.run(f"docker pull {img}")
        _, jsonl_out = runnow.run(
            f"docker run --rm -i "
            f"-v {cdw}:/home/local {tap_docker_args} "
            f"{img} "
            f"--config {config.dockerize_cli_args(config_file)} "
            f"--catalog {selected_catalog_file}",
            hide=hide_cmd,
            echo=False,
            capture_stderr=False,
        )
    else:
        _, jsonl_out = runnow.run(
            f"{tap_exe} "
            f"--config {config_file} "
            f"--catalog {selected_catalog_file}",
            hide=hide_cmd,
            echo=False,
            capture_stderr=False,
        )
    uio.create_text_file(tmp_outfile, jsonl_out)
    _, jsonl_out = runnow.run(
        f"cat {tmp_outfile} | singer-infer-schema --out-dir {tmp_folder}",
    )
    for file in uio.list_files(tmp_folder):
        if file.endswith(".inferred.json"):
            logging.info(f"Parsing inferred schema from '{file}'...")
            inferred_schema = json.loads(uio.get_text_file_contents(file))
            stream_name = file.split("/")[-1].split(".")[0]
            stream = (
                [x for x in custom_catalog["streams"] if x["stream"] == stream_name]
                or [None]
            )[0]
            if not stream:
                raise ValueError(
                    f"Failed to append inferred schema for stream name '{stream_name}'."
                    f" Stream not present in catalog file {selected_catalog_file}."
                )
            stream["schema"] = inferred_schema
    custom_catalog_file = config.get_custom_catalog_file(taps_dir, tap_name)
    uio.create_text_file(custom_catalog_file, json.dumps(custom_catalog, indent=2))
    return custom_catalog_file
Example #13
def _check_rules(
    tap_name: str,
    catalog_file: str,
    rules_file: str,
    replication_strategy: str,
    plan_file_out: str,
    selected_catalog_file_out: str,
    log_dir: Optional[str],
) -> None:
    """
    Create plan file and selected catalog file from provided rules and raw catalog.

    Parameters
    ----------
    tap_name : str
        Name of the tap.
    catalog_file : str
        Path to a catalog file.
    rules_file : str
        Path to a rules file.
    replication_strategy : str
        Replication strategy to apply to selected streams.
    plan_file_out : str
        Path to save the plan file.
    selected_catalog_file_out : str
        Path to save the selected catalog file.
    log_dir : Optional[str]
        Directory to which the rules, plan, and selected catalog files are pushed as logs.
    """
    select_rules = [
        line.split("#")[0].rstrip()
        for line in uio.get_text_file_contents(rules_file).splitlines()
        if line.split("#")[0].rstrip()
    ]
    matches: Dict[str, dict] = {}
    excluded_table_stream_ids: Dict[str, List[str]] = {}
    matched_stream_ids: Dict[str, str] = {}
    for stream_id, table_object in _get_catalog_tables_dict(catalog_file).items():
        table_name = _get_stream_name(table_object)
        if _table_match_check(
            table_name=table_name, stream_id=stream_id, select_rules=select_rules,
        ):
            if table_name in matched_stream_ids:
                raise RuntimeError(
                    f"Table name '{table_name}' matched multiple stream IDs: "
                    f'"{matched_stream_ids[table_name]}" and "{stream_id}". '
                    "This is most often caused by tables with the same name under "
                    "different source database schemas. Please qualify or disqualify "
                    "specific stream name patterns by using double-quoted stream IDs "
                    "in your rules file instead of or in addition to bare table names."
                )
            matched_stream_ids[table_name] = stream_id
            matches[table_name] = {}
            for col_object in _get_catalog_table_columns(table_object):
                col_name = col_object
                matches[table_name][col_name] = _col_match_check(
                    table_name, stream_id, col_name, select_rules
                )
        else:
            if table_name in excluded_table_stream_ids:
                excluded_table_stream_ids[table_name].append(stream_id)
            else:
                excluded_table_stream_ids[table_name] = [stream_id]
    all_matches_lower = [m.lower() for m in matches.keys()] + [
        f'"{m.lower()}"' for m in matched_stream_ids.values()
    ]
    declared_tables = set(
        [
            rule.split(".")[0].rstrip().lstrip("!")
            for rule in select_rules
            if rule.split(".")[0].rstrip() and ("*" not in rule.split(".")[0])
        ]
    )
    for required_table in declared_tables:
        if required_table.lower() not in all_matches_lower:
            logging.warning(
                f"The table '{required_table}' was declared in the rules file "
                "but could not be found in the catalog."
            )
    for match, match_cols in matches.items():
        if not match_cols:
            logging.warning(
                f"The table '{match}' was declared in the rules file "
                "but did not match with any columns in the catalog."
            )
    primary_keys, replication_keys = _get_table_keys(
        matches, matched_stream_ids, catalog_file, rules_file
    )
    file_text = _make_plan_file_text(
        matches,
        primary_keys,
        replication_keys,
        matched_stream_ids,
        excluded_table_stream_ids,
    )
    logging.info(f"Updating plan file: {plan_file_out}")
    uio.create_text_file(plan_file_out, file_text)
    config.push_logs(log_dir, [rules_file, plan_file_out])
    _create_selected_catalog(
        tap_name,
        plan_file=plan_file_out,
        raw_catalog_file=catalog_file,
        output_file=selected_catalog_file_out,
        replication_strategy=replication_strategy,
        skip_senseless_validators=SKIP_SENSELESS_VALIDATORS,
    )
    config.push_logs(log_dir, [selected_catalog_file_out])
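
A standalone sketch of how the rules file is reduced to select rules at the top of the function: inline "#" comments and blank lines are dropped, and what remains are the selection patterns (the rule lines below, including the "!" exclusion and the double-quoted stream ID mentioned in the error message above, are made-up illustrations of that syntax).

rules_text = """
# select everything from the customers table
customers.*
!customers.ssn        # exclusion rule
"orders-prod".order_id
"""
select_rules = [
    line.split("#")[0].rstrip()
    for line in rules_text.splitlines()
    if line.split("#")[0].rstrip()
]
print(select_rules)  # ['customers.*', '!customers.ssn', '"orders-prod".order_id']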
Example #14
def parse_from_file(filepath: str, /) -> List[Dict[str, List]]:
    return parse_from_string(uio.get_text_file_contents(filepath))