Example No. 1
def build_index(index_type: str, tf_dir: str, output_file: str,
                overview_desc: str):
    """
    Update the documentation index.

    Parameters
    ----------
    index_type (str): Either 'Components' or 'Catalog'.
    tf_dir (str): The directory to scan for modules.
    output_file (str): The location of the documentation file to create.
    overview_desc (str): The overview text that will appear at the top of the index.
    """
    content_metadata = {
        "aws": "",
        # "azure": "",
        # "gcp": "",
    }
    git_url_pattern = "git::{git_repo}/{path}?ref={branch}"
    git_repo = "https://github.com/slalom-ggp/dataops-infra"
    git_branch = "main"

    toc_str = ""
    for platform_i, platform in enumerate(content_metadata.keys(), start=1):
        toc_str += (f"{platform_i}. [{_proper(platform)} {index_type}]"
                    f"(#{platform.lower()}-{index_type.lower()})\n")

        logging.info(f"Exploring platform '{platform}'")
        catalog_modules = get_tf_metadata(f"{tf_dir}/{platform}",
                                          recursive=True)
        for module, metadata in catalog_modules.items():
            module_title = f"{_proper(platform)} {_proper(os.path.basename(module))}"
            toc_str += (
                f"    - [{module_title}](#{module_title.replace(' ', '-').lower()})\n"
            )
            logging.debug(f"Exploring module '{module}': {metadata}")
            readme_path = f"{module}/README.md"
            content_metadata[platform] += (
                f"### {module_title}\n\n"
                f"#### Overview\n\n"
                f"{metadata['header']}\n\n"
                f"#### Documentation\n\n"
                f"- [{module_title} Readme]({readme_path})\n\n"
                f"-------------------\n\n")
    content = CATALOG_TEMPLATE.format(
        toc=toc_str,
        aws=content_metadata["aws"],
        azure="_(Coming soon)_",
        gcp="_(Coming soon)_",
        index_type=index_type,
        overview=overview_desc,
    )
    uio.create_text_file(output_file, content)
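
A minimal usage sketch; the paths and overview text below are hypothetical, and `build_index` is assumed to be importable alongside its helpers (`uio`, `get_tf_metadata`, `_proper`):

# Hypothetical invocation: regenerate the catalog index for an assumed local checkout.
build_index(
    index_type="Catalog",
    tf_dir="./catalog",
    output_file="./docs/catalog_index.md",
    overview_desc="Reusable Terraform catalog modules, grouped by cloud platform.",
)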
Example No. 2
def change_upstream_source(
    dir_to_update=".",
    git_repo="https://github.com/slalom-ggp/dataops-infra",
    branch="master",
    relative_path="../../dataops-infra",
    to_relative=False,
    to_git=False,
    dry_run=False,
):
    """Change Terraform source"""
    if to_relative and to_git or not (to_relative or to_git):
        raise ValueError(
            "Must specify `--to_git` or `--to_relative`, but not both.")
    for tf_file in uio.list_local_files(dir_to_update, recursive=False):
        if tf_file.endswith(".tf"):
            # print(tf_file)
            new_lines = []
            for line in uio.get_text_file_contents(tf_file).splitlines():
                new_line = line
                if line.lstrip().startswith("source "):
                    current_path = line.lstrip().split('"')[1]
                    start_pos = max([
                        current_path.find("catalog/"),
                        current_path.find("components/")
                    ])
                    if start_pos > 0:
                        module_path = current_path[start_pos:].split(
                            "?ref=")[0]
                        if to_relative:
                            local_pattern = "{relative_path}/{path}"
                            new_path = local_pattern.format(
                                relative_path=relative_path, path=module_path)
                        elif to_git:
                            git_pattern = "git::{git_repo}//{path}?ref={branch}"
                            new_path = git_pattern.format(git_repo=git_repo,
                                                          path=module_path,
                                                          branch=branch)
                        if current_path == new_path:
                            print(f"{current_path} \n\t\t\t-> (unchanged)")
                        else:
                            print(f"{current_path} \n\t\t\t-> {new_path}")
                        new_line = f'  source = "{new_path}"'
                new_lines.append(new_line)
            new_file_text = "\n".join(new_lines)
            if dry_run:
                print(f"\n\n------------\n-- {tf_file}\n------------")
                print(new_file_text)
            else:
                uio.create_text_file(tf_file, new_file_text)
    if not dry_run:
        runnow.run("terraform fmt -recursive", dir_to_update)
Example No. 3
def _discover(
    tap_name: str,
    taps_dir: str,
    *,
    config_file: str,
    catalog_dir: str,
    dockerized: bool,
    tap_exe: str,
) -> None:
    catalog_file = config.get_raw_catalog_file(
        taps_dir, catalog_dir, tap_name, allow_custom=False
    )
    uio.create_folder(catalog_dir)
    img = f"{docker.BASE_DOCKER_REPO}:{tap_exe}"
    hide_cmd = False
    if dockerized:
        cwd = os.getcwd().replace("\\", "/")
        tap_config = json.loads(uio.get_text_file_contents(config_file))
        tap_docker_args = ""
        # TODO: Replace with logic to parse from AWS_SHARED_CREDENTIALS_FILE env var:
        for k in ["aws_access_key_id", "aws_secret_access_key", "aws_session_token"]:
            if k in tap_config:
                key = f"TAP_{tap_name}_{k}".replace("-", "_").upper()
                os.environ[key] = tap_config[k]
                tap_docker_args += f' -e {k.upper()}="{tap_config[k]}"'
                hide_cmd = True
        _, _ = runnow.run(f"docker pull {img}")
        _, output_text = runnow.run(
            f"docker run --rm -i "
            f"-v {cdw}:/home/local {tap_docker_args} "
            f"{img} --config {config.dockerize_cli_args(config_file)} --discover",
            echo=False,
            capture_stderr=False,
            hide=hide_cmd,
        )
        if not _is_valid_json(output_text):
            raise RuntimeError(f"Could not parse json file from output:\n{output_text}")
        uio.create_text_file(catalog_file, output_text)
    else:
        runnow.run(
            f"{tap_exe} --config {config_file} --discover > {catalog_file}",
            hide=hide_cmd,
        )
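
An illustrative, non-dockerized call; the tap name and file locations are assumptions:

# Hypothetical example: run discovery for a locally installed tap-salesforce executable.
_discover(
    "salesforce",
    "taps",
    config_file=".secrets/tap-salesforce-config.json",
    catalog_dir="taps/.output",
    dockerized=False,
    tap_exe="tap-salesforce",
)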
Example No. 4
def smart_split(dockerfile_path: str, tag_as, addl_args=None):
    tag_as = _to_list(tag_as)
    if tag_as:
        interim_image_name = tag_as[0].split(":")[0]
    else:
        interim_image_name = "untitled_image"
    (image_core, dockerfile_core), (image_derived, dockerfile_derived) = _smart_split(
        dockerfile_path, interim_image_name, addl_args=addl_args)
    dockerfile_path_core = os.path.realpath(f"{dockerfile_path}.core")
    dockerfile_path_derived = os.path.realpath(f"{dockerfile_path}.quick")
    uio.create_text_file(filepath=dockerfile_path_core,
                         contents=dockerfile_core)
    if dockerfile_derived:
        uio.create_text_file(filepath=dockerfile_path_derived,
                             contents=dockerfile_derived)
    else:
        uio.delete_file(dockerfile_path_derived, ignore_missing=True)
        dockerfile_path_derived = None
    return image_core, dockerfile_path_core, image_derived, dockerfile_path_derived
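
A sketch of how the returned paths might be consumed; the Dockerfile path and tag are assumptions:

# Hypothetical split of a Dockerfile into a slow 'core' build and a quick 'derived' build.
core_image, core_dockerfile, derived_image, derived_dockerfile = smart_split(
    "Dockerfile", tag_as=["myrepo/myimage:latest"]
)
print(f"Core layers: build {core_dockerfile} as {core_image}")
if derived_dockerfile:
    print(f"Derived layers: build {derived_dockerfile} as {derived_image}")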
Example No. 5
def make_aggregate_state_file(raw_json_lines_file: str,
                              output_json_file: str) -> None:
    """
    Create a valid json state file from one or more json lines ('jsonl' format).

    Parameters
    ----------
    raw_json_lines_file : str
        Path to a jsonl (json lines) file containing one or more json documents to
        aggregate.
    output_json_file : str
        Path to use when saving the aggregated json file.
    """
    try:
        uio.create_text_file(
            output_json_file,
            get_aggregate_state(
                uio.get_text_file_contents(raw_json_lines_file)),
        )
    except ValueError as ex:
        raise ValueError(
            f"State file from '{raw_json_lines_file}' is not valid JSON or JSONL. "
            f"Please either delete or fix the file and then retry. {ex}")
Example No. 6
def get_tf_metadata(tf_dir: str,
                    recursive: bool = False,
                    save_to_dir: bool = True):
    """
    Return a dictionary of Terraform module paths to JSON metadata about each module,
    a wrapper around the `terraform-docs` tool.

    Parameters:
    ----------
    tf_dir: Directory of terraform scripts to scan.
    recursive : Optional (default=True). 'True' to run on all subdirectories, recursively.

    Returns:
    -------
    dict
    """
    result = {}
    if (".git" not in tf_dir and ".terraform" not in tf_dir
            and "samples" not in tf_dir and "tests" not in tf_dir):
        if [
                x for x in uio.list_local_files(tf_dir, recursive=False)
                if x.endswith(".tf")
        ]:
            _, json_text = runnow.run(f"terraform-docs json {tf_dir}",
                                      echo=False)
            result[tf_dir] = json.loads(json_text)
            if save_to_dir:
                uio.create_folder(f"{tf_dir}/.terraform")
                uio.create_text_file(
                    f"{tf_dir}/.terraform/terraform-docs.json", json_text)
    if recursive:
        for folder in uio.list_local_files(tf_dir, recursive=False):
            folder = folder.replace("\\", "/")
            if os.path.isdir(folder):
                result.update(get_tf_metadata(folder, recursive=recursive))
    return result
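
A short sketch of consuming the returned metadata; the directory is an assumption, and the 'header' key mirrors its use in Example No. 1:

metadata_by_module = get_tf_metadata("./catalog/aws", recursive=True)
for module_dir, metadata in metadata_by_module.items():
    # Each value is the parsed output of `terraform-docs json` for that module.
    print(module_dir, metadata.get("header", ""))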
Example No. 7
def _update_var_output(output_var) -> bool:
    """Write the value of a single Terraform output variable to the 'outputs' folder."""
    return_code, output = runnow.run(f"terraform output {output_var}",
                                     echo=False)
    uio.create_text_file(os.path.join("outputs", output_var), contents=output)
    return True
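
A hedged example of exporting several Terraform outputs to files; the output names are illustrative:

# Hypothetical output variables; each is written to ./outputs/<name>.
for var_name in ["vpc_id", "public_subnet_ids"]:
    _update_var_output(var_name)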
Example No. 8
def update_module_docs(
    tf_dir: str,
    *,
    recursive: bool = True,
    readme: str = "README.md",
    footer: bool = True,
    header: bool = True,
    special_case_words: List[str] = None,
    extra_docs_names: List[str] = ["USAGE.md", "NOTES.md"],
    git_repo: str = "https://github.com/slalom-ggp/dataops-infra",
):
    """
    Replace all README.md files with auto-generated documentation, a wrapper
    around the `terraform-docs` tool.

    Parameters:
    ----------
    tf_dir: Directory of terraform scripts to document.
    recursive : Optional (default=True). 'True' to run on all subdirectories, recursively.
    readme : Optional (default="README.md"). The filename to create when generating docs.
    footnote: Optional (default=True). 'True' to include the standard footnote.
    special_case_words: Optional. A list of words to override special casing rules.
    extra_docs_names: (Optional.) A list of filenames which, if found, will be appended
      to each module's README.md file.
    git_repo: Optional. The git repo path to use in rendering 'source' paths.

    Returns:
    -------
    None
    """
    markdown_text = ""
    if ".git" not in tf_dir and ".terraform" not in tf_dir:
        tf_files = [
            x for x in uio.list_local_files(tf_dir, recursive=False)
            if x.endswith(".tf")
        ]
        extra_docs = [
            x for x in uio.list_local_files(tf_dir, recursive=False)
            if extra_docs_names and os.path.basename(x) in extra_docs_names
        ]
        if tf_files:
            module_title = _proper(os.path.basename(tf_dir),
                                   special_case_words=special_case_words)
            parent_dir_name = os.path.basename(Path(tf_dir).parent)
            if parent_dir_name != ".":
                module_title = _proper(
                    f"{parent_dir_name} {module_title}",
                    special_case_words=special_case_words,
                )
            module_path = tf_dir.replace(".", "").replace("//", "/").replace("\\", "/")
            _, markdown_output = runnow.run(
                f"terraform-docs md --no-providers --sort-by-required {tf_dir}",
                echo=False,
            )
            if header:
                markdown_text += DOCS_HEADER.format(module_title=module_title,
                                                    module_path=module_path)
            markdown_text += markdown_output
            for extra_file in extra_docs:
                markdown_text += uio.get_text_file_contents(extra_file) + "\n"
            if footer:
                markdown_text += DOCS_FOOTER.format(src="\n".join([
                    "* [{file}]({repo}/tree/master/{dir}/{file})".format(
                        repo=git_repo,
                        dir=module_path,
                        file=os.path.basename(tf_file),
                    ) for tf_file in tf_files
                ]))
            uio.create_text_file(f"{tf_dir}/{readme}", markdown_text)
    if recursive:
        for folder in uio.list_local_files(tf_dir, recursive=False):
            if os.path.isdir(folder):
                update_module_docs(folder, recursive=recursive, readme=readme)
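
A usage sketch for an assumed local checkout; the directory and casing overrides below are hypothetical:

# Rebuild README.md files for every module under ./components, recursively.
update_module_docs("./components", special_case_words=["AWS", "ECS", "VPC"])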
Example No. 9
def update_module_docs(
    tf_dir: str,
    *,
    recursive: bool = True,
    readme: str = "README.md",
    footer: bool = True,
    header: bool = True,
    special_case_words: List[str] = None,
    extra_docs_names: List[str] = ["USAGE.md", "NOTES.md"],
    git_repo: str = "https://github.com/slalom-ggp/dataops-infra",
) -> None:
    """
    Replace all README.md files with auto-generated documentation

    This is a wrapper around the `terraform-docs` tool.

    Parameters
    ----------
    tf_dir : str
        Directory of terraform scripts to document.
    recursive : bool, optional
        Run on all subdirectories, recursively. By default True.
    readme : str, optional
        The filename to create when generating docs, by default "README.md".
    footer : bool, optional
        Include the standard footnote, by default True.
    header : bool, optional
        Include the standard header, by default True.
    special_case_words : List[str], optional
        A list of words to override special casing rules, by default None.
    extra_docs_names : List[str], optional
        A list of filenames which, if found, will be appended to each
        module's README.md file, by default ["USAGE.md", "NOTES.md"].
    git_repo : str, optional
        The git repo path to use in rendering 'source' paths, by
        default "https://github.com/slalom-ggp/dataops-infra".
    """
    markdown_text = ""
    if ".git" not in tf_dir and ".terraform" not in tf_dir:
        tf_files = [
            x for x in uio.list_local_files(tf_dir, recursive=False)
            if x.endswith(".tf")
        ]
        extra_docs = [
            x for x in uio.list_local_files(tf_dir, recursive=False)
            if extra_docs_names and os.path.basename(x) in extra_docs_names
        ]
        if tf_files:
            module_title = _proper(os.path.basename(tf_dir),
                                   special_case_words=special_case_words)
            parent_dir_name = os.path.basename(Path(tf_dir).parent)
            # parent_title = _proper(
            #     parent_dir_name, special_case_words=special_case_words,
            # )
            module_title = _proper(
                f"{parent_dir_name} {module_title}",
                special_case_words=special_case_words,
            )
            module_path = tf_dir.replace(".", "").replace("//", "/").replace("\\", "/")
            _, markdown_output = runnow.run(
                f"terraform-docs markdown document --sort=false {tf_dir}",
                # " --no-requirements"
                echo=False,
            )
            if "components" in module_path.lower():
                module_type = "Components"
            elif "catalog" in module_path.lower():
                module_type = "Catalog"
            else:
                module_type = "Other"
            if header:
                markdown_text += DOCS_HEADER.format(
                    module_title=module_title,
                    module_path=module_path,
                    module_type=module_type,
                )
            markdown_text += markdown_output
            for extra_file in extra_docs:
                markdown_text += uio.get_text_file_contents(extra_file) + "\n"
            if footer:
                markdown_text += DOCS_FOOTER.format(src="\n".join([
                    "* [{file}]({repo}/tree/main/{dir}/{file})".format(
                        repo=git_repo,
                        dir=module_path,
                        file=os.path.basename(tf_file),
                    ) for tf_file in tf_files
                ]))
            uio.create_text_file(f"{tf_dir}/{readme}", markdown_text)
    if recursive:
        for folder in uio.list_local_files(tf_dir, recursive=False):
            if os.path.isdir(folder):
                update_module_docs(folder, recursive=recursive, readme=readme)
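
A sketch of a narrower invocation of this revised version, suppressing the standard header and footer blocks; the module path is an assumption:

# Regenerate docs for a single module only, without the header/footer sections.
update_module_docs("./catalog/aws/data-lake", recursive=False, header=False, footer=False)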
Example No. 10
def _infer_schema(
    tap_name: str,
    taps_dir: str,
    raw_catalog_file: str,
    selected_catalog_file: str,
    *,
    config_file: str,
    catalog_dir: str,
    dockerized: bool,
    tap_exe: str,
) -> str:
    custom_catalog = json.loads(uio.get_text_file_contents(raw_catalog_file))
    tmp_folder = f"{catalog_dir}/tmp"
    tmp_outfile = f"{catalog_dir}/tmp/sync-dryrun.jsonl"
    uio.create_folder(catalog_dir)
    uio.create_folder(tmp_folder)
    logging.info(f"Cleaning up old files in tmp folder '{tmp_folder}'...")
    for file in uio.list_files(tmp_folder):
        if any(
            [
                file.endswith(x)
                for x in ["-config.json", "-dryrun.jsonl", "-table.inferred.json"]
            ]
        ):
            uio.delete_file(file)
    img = f"{docker.BASE_DOCKER_REPO}:{tap_exe}"
    hide_cmd = False
    if dockerized:
        cwd = os.getcwd().replace("\\", "/")
        tap_config = json.loads(uio.get_text_file_contents(config_file))
        tap_docker_args = ""
        # TODO: Replace with logic to parse from AWS_SHARED_CREDENTIALS_FILE env var:
        for k in ["aws_access_key_id", "aws_secret_access_key", "aws_session_token"]:
            if k in tap_config:
                key = f"TAP_{tap_name}_{k}".replace("-", "_").upper()
                os.environ[key] = tap_config[k]
                tap_docker_args += f' -e {k.upper()}="{tap_config[k]}"'
                hide_cmd = True
        _, _ = runnow.run(f"docker pull {img}")
        _, jsonl_out = runnow.run(
            f"docker run --rm -i "
            f"-v {cdw}:/home/local {tap_docker_args} "
            f"{img} "
            f"--config {config.dockerize_cli_args(config_file)}"
            f"--catalog {selected_catalog_file}",
            hide=hide_cmd,
            echo=False,
            capture_stderr=False,
        )
    else:
        _, jsonl_out = runnow.run(
            f"{tap_exe} "
            f"--config {config_file} "
            f"--catalog {selected_catalog_file}",
            hide=hide_cmd,
            echo=False,
            capture_stderr=False,
        )
    uio.create_text_file(tmp_outfile, jsonl_out)
    _, jsonl_out = runnow.run(
        f"cat {tmp_outfile} | singer-infer-schema --out-dir {tmp_folder}",
    )
    for file in uio.list_files(tmp_folder):
        if file.endswith(".inferred.json"):
            logging.info(f"Parsing inferred schema from '{file}'...")
            inferred_schema = json.loads(uio.get_text_file_contents(file))
            stream_name = file.split("/")[-1].split(".")[0]
            stream = (
                [x for x in custom_catalog["streams"] if x["stream"] == stream_name]
                or [None]
            )[0]
            if not stream:
                raise ValueError(
                    f"Failed to append inferred schema for stream name '{stream_name}'."
                    f" Stream not present in catalog file {selected_catalog_file}."
                )
            stream["schema"] = inferred_schema
    custom_catalog_file = config.get_custom_catalog_file(taps_dir, tap_name)
    uio.create_text_file(custom_catalog_file, json.dumps(custom_catalog, indent=2))
    return custom_catalog_file
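
An illustrative, non-dockerized invocation; the tap name and every path below are assumptions:

# Hypothetical dry-run sync used to infer stream schemas for a locally installed tap.
custom_catalog_path = _infer_schema(
    "salesforce",
    "taps",
    raw_catalog_file="taps/.output/salesforce-catalog-raw.json",
    selected_catalog_file="taps/.output/salesforce-catalog-selected.json",
    config_file=".secrets/tap-salesforce-config.json",
    catalog_dir="taps/.output",
    dockerized=False,
    tap_exe="tap-salesforce",
)
print(f"Catalog with inferred schemas written to: {custom_catalog_path}")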
Example No. 11
def _check_rules(
    tap_name: str,
    catalog_file: str,
    rules_file: str,
    replication_strategy: str,
    plan_file_out: str,
    selected_catalog_file_out: str,
    log_dir: Optional[str],
) -> None:
    """
    Create plan file and selected catalog file from provided rules and raw catalog.

    Parameters
    ----------
    catalog_file : str
        Path to a catalog file.
    rules_file : str
        Path to a rules file.
    plan_file_out : str
        Path to save the plan file.
    selected_catalog_file_out : str
        Path to save the selected catalog file.
    """
    select_rules = [
        line.split("#")[0].rstrip()
        for line in uio.get_text_file_contents(rules_file).splitlines()
        if line.split("#")[0].rstrip()
    ]
    matches: Dict[str, dict] = {}
    excluded_table_stream_ids: Dict[str, List[str]] = {}
    matched_stream_ids: Dict[str, str] = {}
    for stream_id, table_object in _get_catalog_tables_dict(catalog_file).items():
        table_name = _get_stream_name(table_object)
        if _table_match_check(
            table_name=table_name, stream_id=stream_id, select_rules=select_rules,
        ):
            if table_name in matched_stream_ids:
                raise RuntimeError(
                    f"Table name '{table_name}' matched multiple stream IDs: "
                    f'"{matched_stream_ids[table_name]}" and "{stream_id}". '
                    "This is most often caused by tables with the same name under "
                    "different source database schemas. Please qualify or disqualify "
                    "specific stream name patterns by using double-quoted stream IDs "
                    "in your rules file instead of or in addition to bare table names."
                )
            matched_stream_ids[table_name] = stream_id
            matches[table_name] = {}
            for col_object in _get_catalog_table_columns(table_object):
                col_name = col_object
                matches[table_name][col_name] = _col_match_check(
                    table_name, stream_id, col_name, select_rules
                )
        else:
            if table_name in excluded_table_stream_ids:
                excluded_table_stream_ids[table_name].append(stream_id)
            else:
                excluded_table_stream_ids[table_name] = [stream_id]
    all_matches_lower = [m.lower() for m in matches.keys()] + [
        f'"{m.lower()}"' for m in matched_stream_ids.values()
    ]
    declared_tables = set(
        [
            rule.split(".")[0].rstrip().lstrip("!")
            for rule in select_rules
            if rule.split(".")[0].rstrip() and ("*" not in rule.split(".")[0])
        ]
    )
    for required_table in declared_tables:
        if required_table.lower() not in all_matches_lower:
            logging.warning(
                f"The table '{required_table}' was declared in the rules file "
                "but could not be found in the catalog."
            )
    for match, match_cols in matches.items():
        if not match_cols:
            logging.warning(
                f"The table '{match}' was declared in the rules file "
                "but did not match with any columns in the catalog."
            )
    primary_keys, replication_keys = _get_table_keys(
        matches, matched_stream_ids, catalog_file, rules_file
    )
    file_text = _make_plan_file_text(
        matches,
        primary_keys,
        replication_keys,
        matched_stream_ids,
        excluded_table_stream_ids,
    )
    logging.info(f"Updating plan file: {plan_file_out}")
    uio.create_text_file(plan_file_out, file_text)
    config.push_logs(log_dir, [rules_file, plan_file_out])
    _create_selected_catalog(
        tap_name,
        plan_file=plan_file_out,
        raw_catalog_file=catalog_file,
        output_file=selected_catalog_file_out,
        replication_strategy=replication_strategy,
        skip_senseless_validators=SKIP_SENSELESS_VALIDATORS,
    )
    config.push_logs(log_dir, [selected_catalog_file_out])
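
A sketch of how the planner might be driven; the tap name, file names, and replication strategy are assumptions:

# Hypothetical planning run: build a plan file and a selected catalog from a rules file.
_check_rules(
    tap_name="salesforce",
    catalog_file="taps/.output/salesforce-catalog-raw.json",
    rules_file="taps/data.select",
    replication_strategy="INCREMENTAL",
    plan_file_out="taps/data-plan.yml",
    selected_catalog_file_out="taps/.output/salesforce-catalog-selected.json",
    log_dir="logs",
)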