Example 1
def install(plugin_name: str, source: Optional[str] = None, alias: Optional[str] = None):
    """
    Install the requested plugin to the local machine.

    Arguments:
        plugin_name {str} -- The name of the plugin to install, including the tap- or
        target- prefix.

    Keyword Arguments:
        source {str} -- Optional. Overrides the pip installation source.
        alias {str} -- Optional. Overrides the name (alias) of the plugin.

    Raises:
        RuntimeError: If the install path already exists and the user declines to overwrite it.
    """
    source = source or plugin_name
    alias = alias or plugin_name

    venv_dir = os.path.join(config.VENV_ROOT, alias)
    install_path = os.path.join(config.INSTALL_ROOT, alias)
    if uio.file_exists(install_path):
        response = input(f"The file '{install_path}' already exists. "
                         f"Are you sure you want to replace this file? [y/n]")
        if not response.lower() in ["y", "yes"]:
            raise RuntimeError(f"File already exists '{install_path}'.")
        uio.delete_file(install_path)
    runnow.run(f"python3 -m venv {venv_dir}")
    runnow.run(f"{os.path.join(venv_dir, 'bin', 'pip3')} install {source}")
    runnow.run(f"ln -s {venv_dir}/bin/{plugin_name} {install_path}")
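A minimal usage sketch (the plugin names and pinned version below are hypothetical; the function shells out to `python3 -m venv`, `pip3`, and `ln`, so it assumes a POSIX environment):

# Install a tap under its default name:
install("tap-salesforce")
# Install a target from a pinned pip source, under a custom alias:
install("target-csv", source="target-csv==0.3.1", alias="target-csv-custom")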
Example 2
def anonymize_file(filepath: str, hash_key: str, hash_function: str = "MD5"):
    """
    Hashes the first column of the provided Excel file.

    The output will be saved as a new anonymized version of the file.

    Usage Guidelines:

    1. File should be in Excel format, with a single sheet.
    2. The first column in the Excel sheet should contain the ID to anonymize.
    3. Currently supported hashing functions are: MD5, SHA256, and SHA512.
    4. **NOTE:** Always open and review the file to confirm that the anonymization process
       was successful.

    Parameters
    ----------
    filepath : str
        The path to the file to be anonymized.
    hash_key : str
        A hash key to be used as a seed during anonymization.
    hash_function : str, optional
        The hashing function to use. Defaults to "MD5", the most ubiquitous.

    Raises
    ------
    ValueError
        If an unsupported hash function is requested, or if a hash algorithm
        name is passed as the hash key.
    FileExistsError
        If the file cannot be found.
    """
    if hash_function not in HASH_FUNCTIONS:
        raise ValueError(f"Unsupported hash function '{hash_function}'. "
                         f"Expected one of: {list(HASH_FUNCTIONS.keys())}")
    if hash_key.upper() in HASH_FUNCTIONS.keys():
        raise ValueError(
            "A hash algorithm name was passed as the hash key. "
            "Please check the syntax for a missing hash key and try again. "
            f"Details: the hash key cannot be one of: {list(HASH_FUNCTIONS.keys())}")
    if not uio.file_exists(filepath):
        raise FileExistsError(f"Could not find file {filepath}")

    def hash_fn(x):
        fn = HASH_FUNCTIONS[hash_function]
        return fn(f"{hash_key}{x}".encode("utf-8")).hexdigest()

    df = pandas.read_excel(filepath)
    df[df.columns[0]] = df[df.columns[0]].apply(hash_fn)

    new_filepath = (".".join(filepath.split(".")[:-1]) + "-anonymized." +
                    filepath.split(".")[-1])
    print(new_filepath)
    df.to_excel(new_filepath, index=False)
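The `HASH_FUNCTIONS` lookup is defined elsewhere in the module; a minimal sketch consistent with the docstring would map the three supported names to `hashlib` constructors (the mapping below is an assumption, as is the sample file path):

import hashlib

# Hypothetical definition consistent with the docstring above:
HASH_FUNCTIONS = {
    "MD5": hashlib.md5,
    "SHA256": hashlib.sha256,
    "SHA512": hashlib.sha512,
}

# Usage sketch: writes 'customers-anonymized.xlsx' alongside the input file.
anonymize_file("customers.xlsx", hash_key="my-secret-seed", hash_function="SHA256")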
Example 3
def save_spark_table(
    table_name,
    file_path,
    entity_type=None,
    force_single_file=False,
    compression="gzip",
    schema_only=True,
    overwrite=True,
):
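    """Save the named Spark table as CSV file(s) at the given path.

    If force_single_file is True, the dataframe is coalesced to a single
    partition before writing. The write is retried once after a short delay,
    since intermittent failures on S3 can be caused by eventual consistency.
    """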
    start_time = time.time()
    file_path = _verify_path(file_path)
    df = spark.sql(f"SELECT * FROM {table_name}")
    if uio.file_exists(os.path.join(file_path, "_SUCCESS")):
        if overwrite:
            logging.warning(
                "Saved table already exists and overwrite=True. Deleting older files."
            )
            for oldfile in uio.list_files(file_path):
                uio.delete_file(oldfile)
    if force_single_file:
        logging.debug(
            f"Saving spark table '{table_name}' to single file: '{file_path}'..."
        )
        df = df.coalesce(1)
    else:
        logging.debug(f"Saving spark table '{table_name}' to folder: '{file_path}'...")
    try:
        df.write.csv(  # SAFE
            file_path,
            mode="overwrite",
            header=True,
            compression=compression,
            quote='"',
            escape='"',
        )
    except Exception as ex:  # intermittent failures can be caused by eventual consistency
        logging.warning(
            f"Retrying S3 table save operation because the first attempt failed ({ex})"
        )
        time.sleep(20)  # Sleep to allow S3 to reach eventual consistency
        df.write.csv(  # SAFE
            file_path,
            mode="overwrite",
            header=True,
            compression=compression,
            quote='"',
            escape='"',
        )
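A minimal usage sketch (the table and bucket names below are hypothetical):

save_spark_table("analytics.orders", "s3://my-bucket/exports/orders", force_single_file=True)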
Example 4
def _get_plugins_list(
    plugins_index: Optional[str] = None,
) -> List[Tuple[str, str, str]]:
    plugins_index = plugins_index or SINGER_PLUGINS_INDEX
    if not uio.file_exists(plugins_index):
        raise RuntimeError(f"No file found at '{plugins_index}'. "
                           "Please set SINGER_PLUGINS_INDEX and try again.")
    yml_doc = yaml.safe_load(uio.get_text_file_contents(plugins_index))
    taps = yml_doc["singer-taps"]
    targets = yml_doc["singer-targets"]
    plugins = taps + targets
    list_of_tuples = []
    for plugin in plugins:
        list_of_tuples.append((
            plugin["name"],
            plugin.get("source", None),
            plugin.get("alias", None),
        ))
    return list_of_tuples
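The plugins index itself is not shown here; a minimal sketch of the YAML structure this parser expects (the plugin entries below are hypothetical) is:

import yaml

sample_index = '''
singer-taps:
  - name: tap-mysql
  - name: tap-salesforce
    alias: sfdc
singer-targets:
  - name: target-csv
    source: target-csv==0.3.1
'''
doc = yaml.safe_load(sample_index)
# For this input, _get_plugins_list would return:
# [("tap-mysql", None, None), ("tap-salesforce", None, "sfdc"),
#  ("target-csv", "target-csv==0.3.1", None)]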
Example 5
def build_image(
    tap_or_plugin_alias: str,
    target_alias: Optional[str] = None,
    push: bool = False,
    pre: bool = False,
    ignore_cache: bool = False,
) -> None:
    """Build a single image.

    If tap and target are both provided, any required upstream images will be built as well.

    Arguments:
        tap_or_plugin_alias {str} -- The name of the tap (without the `tap-` prefix).

    Keyword Arguments:
        target_alias {str} -- Optional. The name of the target (without the `target-` prefix).
        push {bool} -- True to push images to image repository after build. (default: {False})
        pre {bool} -- True to use and create prerelease versions. (default: {False})
        ignore_cache {bool} -- True to build images without cached image layers. (default: {False})
    """
    has_custom_tap = uio.file_exists(f"./tap-{tap_or_plugin_alias}.Dockerfile")
    has_custom_target = uio.file_exists(f"./target-{target_alias}.Dockerfile")
    if has_custom_tap and has_custom_target:
        raise NotImplementedError(
            f"Cannot combine a custom tap ('tap-{tap_or_plugin_alias}') "
            f"with a custom target ('target-{target_alias}').")
    if has_custom_tap:
        logging.info(f"Using custom Dockerfile for tap-{tap_or_plugin_alias}")
        _build_plugin_image(
            f"tap-{tap_or_plugin_alias}",
            source=f"./tap-{tap_or_plugin_alias}.Dockerfile",
            alias=f"tap-{tap_or_plugin_alias}",
            push=push,
            pre=pre,
            ignore_cache=ignore_cache,
        )
    else:
        name, source, alias = _get_plugin_info(f"tap-{tap_or_plugin_alias}")
        if source and "Dockerfile" in source:
            has_custom_tap = True
        _build_plugin_image(
            name,
            source=source,
            alias=alias,
            push=push,
            pre=pre,
            ignore_cache=ignore_cache,
        )
    if target_alias:
        if has_custom_target:
            logging.info(f"Using custom Dockerfile for target-{target_alias}")
            _build_plugin_image(
                f"target-{target_alias}",
                source=f"./target-{target_alias}.Dockerfile",
                alias=f"target-{target_alias}",
                push=push,
                pre=pre,
                ignore_cache=ignore_cache,
            )
        else:
            name, source, alias = _get_plugin_info(f"target-{target_alias}")
            if "Dockerfile" in source:
                has_custom_target = True
            _build_plugin_image(
                name,
                source=source,
                alias=alias,
                push=push,
                pre=pre,
                ignore_cache=ignore_cache,
            )
        _build_composite_image(
            tap_alias=tap_or_plugin_alias,
            target_alias=target_alias,
            push=push,
            pre=pre,
            ignore_cache=ignore_cache,
            has_custom_tap=has_custom_tap,
            has_custom_target=has_custom_target,
        )
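Usage sketches (the tap and target aliases below are hypothetical):

# Build a single tap image:
build_image("salesforce")
# Build the tap image, the target image, and the combined tap-to-target image,
# pushing each to the image repository:
build_image("salesforce", target_alias="snowflake", push=True)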
Example 6
def _sync_one_table(
    tap_name: str,
    table_name: str,
    taps_dir: str,
    config_file: str,
    target_name: str,
    target_config_file: str,
    table_catalog_file: str,
    table_state_file: str,
    log_dir: str,
    dockerized: bool,
    tap_exe: str,
    target_exe: str,
) -> None:
    if not tap_exe:
        tap_exe = f"tap-{tap_name}"
    pipeline_version_num = config.get_pipeline_version_number()
    table_state_file = config.replace_placeholders(
        {"table_state_file": table_state_file},
        tap_name,
        table_name,
        pipeline_version_num,
    )["table_state_file"]
    tap_args = f"--config {config_file} --catalog {table_catalog_file} "
    if uio.file_exists(table_state_file):
        local_state_file_in = os.path.join(
            config.get_tap_output_dir(tap_name, taps_dir),
            f"{tap_name}-{table_name}-state.json",
        )
        if not uio.get_text_file_contents(table_state_file):
            logging.warning(
                f"Ignoring blank state file from '{table_state_file}'.")
        else:
            states.make_aggregate_state_file(table_state_file,
                                             local_state_file_in)
            tap_args += f" --state {local_state_file_in}"
        local_state_file_out = (
            f"{'.'.join(local_state_file_in.split('.')[:-1])}-new.json")
    else:
        local_state_file_out = os.path.join(
            config.get_tap_output_dir(tap_name, taps_dir),
            f"{tap_name}-{table_name}-state-new.json",
        )

    tmp_target_config = config.get_single_table_target_config_file(
        target_name,
        target_config_file,
        tap_name=tap_name,
        table_name=table_name,
        pipeline_version_num=pipeline_version_num,
    )
    target_args = f"--config {tmp_target_config} "
    hide_cmd = False
    if dockerized:
        cwd = os.getcwd().replace("\\", "/")
        tap_image_name = docker._get_docker_tap_image(tap_exe)
        target_image_name = docker._get_docker_tap_image(target_exe=target_exe)
        _, _ = runnow.run(f"docker pull {tap_image_name}")
        _, _ = runnow.run(f"docker pull {target_image_name}")

        tap_config = json.loads(uio.get_text_file_contents(config_file))
        target_config = json.loads(
            uio.get_text_file_contents(target_config_file))
        tap_docker_args = ""
        target_docker_args = ""
        # TODO: Replace with logic to parse from AWS_SHARED_CREDENTIALS_FILE env var:
        for k in [
                "aws_access_key_id", "aws_secret_access_key",
                "aws_session_token"
        ]:
            if k in tap_config:
                key = f"TAP_{tap_name}_{k}".replace("-", "_").upper()
                os.environ[key] = tap_config[k]
                tap_docker_args += f' -e {k.upper()}="{tap_config[k]}"'
                hide_cmd = True
            if k in target_config:
                key = f"TARGET_{target_name}_{k}".replace("-", "_").upper()
                os.environ[key] = target_config[k]
                target_docker_args += f' -e {k.upper()}="{target_config[k]}"'
                hide_cmd = True
        sync_cmd = (
            f"docker run --rm -i -v {cwd}:/home/local {tap_docker_args} {tap_image_name} "
            f"{config.dockerize_cli_args(tap_args)} "
            "| "
            f"docker run --rm -i -v {cwd}:/home/local {target_docker_args} {target_image_name} "
            f"{config.dockerize_cli_args(target_args)} "
            ">> "
            f"{local_state_file_out}")
    else:
        sync_cmd = (f"{tap_exe} "
                    f"{tap_args} "
                    "| "
                    f"{target_exe} "
                    f"{target_args} "
                    "> "
                    f"{local_state_file_out}")
    runnow.run(sync_cmd, hide=hide_cmd)
    if not uio.file_exists(local_state_file_out):
        logging.warning(
            f"State file does not exist at path '{local_state_file_out}'. Skipping upload. "
            f"This can be caused by having no data, or no new data, in the source table."
        )
    else:
        uio.upload_file(local_state_file_out, table_state_file)
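In the non-dockerized branch, the composed `sync_cmd` pipes the tap's record stream into the target and redirects the target's emitted state to the new local state file. With hypothetical names and paths, the resulting command string looks roughly like:

tap-mysql --config .secrets/tap-mysql-config.json --catalog .output/tap-mysql/tap-mysql-orders-catalog.json | target-csv --config /tmp/target-csv-config.json > .output/tap-mysql/tap-mysql-orders-state-new.json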
Example 7
def sync(
    tap_name: str,
    target_name: str = "csv",
    table_name: Optional[str] = None,
    taps_dir: Optional[str] = None,
    *,
    dockerized: Optional[bool] = None,
    rescan: bool = False,
    tap_exe: Optional[str] = None,
    target_exe: Optional[str] = None,
    config_dir: Optional[str] = None,
    config_file: Optional[str] = None,
    catalog_dir: Optional[str] = None,
    target_config_file: Optional[str] = None,
    state_file: Optional[str] = None,
    log_dir: Optional[str] = None,
    exclude_tables: Optional[List[str]] = None,
    replication_strategy: Optional[str] = None,
) -> None:
    """
    Synchronize data from tap to target.

    Parameters
    ----------
    tap_name : {str}
        The name/alias of the source tap, without the `tap-` prefix.
    target_name : {str}
        The name/alias of the target, without the `target-` prefix.
        (Default="csv")
    table_name : {str}
        The name of the table to sync. To sync multiple tables, specify
        a comma-separated list of tables surrounded by square brackets (e.g. "[tb1,tbl2]"),
        or use "*" or None to sync all tables.
        (Default=None)
    dockerized : {bool}
        True or False to force whether the command runs in Docker. If omitted,
        the best option will be selected automatically.
    rescan : {bool}
        True to force a rescan and replace existing metadata.
    tap_exe : {str}
        Overrides the tap executable, if different from `tap-{tap_name}`.
    target_exe : {str}
        Overrides the target executable, if different from `target-{target_name}`.
    taps_dir : {str}
        The directory containing the rules file. (Default=cwd)
    config_dir : {str}
        The default location of config, catalog and other
        potentially sensitive information. (Recommended to be excluded from source control.)
        (Default="${taps_dir}/.secrets")
    config_file : {str}
        The location of the JSON config file which contains config for the
        specified tap, or 'False' to only pull settings from environment
        variables. Default path is "${config_dir}/${plugin_name}-config.json".
    catalog_dir : {str}
        The output directory to be used for saving catalog
        files. If not provided, a path will be generated automatically within `.output` or
        a path specified by the `TAP_SCRATCH_DIR` environment variable.
    target_config_file : {str}
        The location of the JSON config file which contains config for the
        specified target, or 'False' to only pull settings from environment
        variables. Default path is "${config_dir}/${plugin_name}-config.json".
    state_file : {str}
        The path to a state file. If not provided, a state
        file path will be generated automatically within `catalog_dir`.
    log_dir : {str}
        Optional. The location to publish logs and other artifacts. If omitted, no
        extra publishing will be performed.
    exclude_tables : {List[str]}
        A list of tables to exclude. Ignored if the table_name arg is not "*".
    replication_strategy : {str}
        One of "FULL_TABLE", "INCREMENTAL", or "LOG_BASED"; by default "INCREMENTAL" or
        a value is set in the TAP_{TAPNAME}_REPLICATION_STRATEGY environment variable.
    """
    config.print_version()

    taps_dir = config.get_taps_dir(taps_dir)
    config_file, tap_settings = config.get_or_create_config(
        f"tap-{tap_name}",
        taps_dir=taps_dir,
        config_dir=config_dir,
        config_file=config_file,
    )
    target_config_file, target_settings = config.get_or_create_config(
        f"target-{target_name}",
        taps_dir=taps_dir,
        config_dir=config_dir,
        config_file=target_config_file,
    )
    tap_exe = tap_exe or tap_settings.get("EXE", f"tap-{tap_name}")
    target_exe = target_exe or target_settings.get("EXE",
                                                   f"target-{target_name}")
    replication_strategy = replication_strategy or tap_settings.get(
        "REPLICATION_STRATEGY", "INCREMENTAL")
    config.validate_replication_strategy(replication_strategy)

    table_name = table_name or tap_settings.get("TABLE_NAME", None)
    exclude_tables = exclude_tables or tap_settings.get("EXCLUDE_TABLES", None)
    rules_file = config.get_rules_file(taps_dir, tap_name)

    # TODO: Resolve bug in Windows STDERR inclusion when emitting catalog json from
    #       docker run
    # if dockerized is None:
    #     if uio.is_windows() or uio.is_mac():
    #         dockerized = True
    #         logging.info(
    #             "The 'dockerized' argument is not set when running either Windows or OSX..."
    #             "Defaulting to dockerized=True"
    #         )

    catalog_dir = catalog_dir or config.get_tap_output_dir(tap_name, taps_dir)
    log_dir = config.get_log_dir(log_dir)
    full_catalog_file = f"{catalog_dir}/{tap_name}-catalog-selected.json"
    if rescan or rules_file or not uio.file_exists(full_catalog_file):
        plans.plan(
            tap_name,
            dockerized=dockerized,
            rescan=rescan,
            tap_exe=tap_exe,
            taps_dir=taps_dir,
            config_file=config_file,
            config_dir=catalog_dir,
            log_dir=log_dir,
        )
    list_of_tables = plans.get_table_list(
        table_filter=table_name,
        exclude_tables=exclude_tables,
        catalog_file=full_catalog_file,
    )
    logging.info(f"Table(s) to sync: {', '.join(list_of_tables)}")
    for table in list_of_tables:
        # Call each tap independently so that table state files are kept separate:
        tmp_catalog_file = f"{catalog_dir}/{tap_name}-{table}-catalog.json"
        table_state_file = (state_file or config.get_state_file_path()
                            or f"{catalog_dir}/{table}-state.json")
        plans._create_single_table_catalog(
            tap_name=tap_name,
            table_name=table,
            full_catalog_file=full_catalog_file,
            output_file=tmp_catalog_file,
        )
        _sync_one_table(
            tap_name=tap_name,
            target_name=target_name,
            table_name=table,
            taps_dir=taps_dir,
            config_file=config_file,
            target_config_file=target_config_file,
            table_catalog_file=tmp_catalog_file,
            table_state_file=table_state_file,
            log_dir=log_dir,
            dockerized=dockerized,
            tap_exe=tap_exe,
            target_exe=target_exe,
        )
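Usage sketches (the tap, target, and table names below are hypothetical):

# Sync all tables from a tap to the default CSV target:
sync("mysql")
# Sync two specific tables to a named target:
sync("mysql", target_name="snowflake", table_name="[orders,customers]")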
Example 8
def test_file_exists(self):
    assert uio.file_exists("slalom/__init__.py")
Example 9
def plan(
    tap_name: str,
    *,
    dockerized: Optional[bool] = None,
    rescan: Optional[bool] = None,
    infer_custom_schema: Optional[bool] = None,
    tap_exe: Optional[str] = None,
    taps_dir: Optional[str] = None,
    config_dir: Optional[str] = None,
    config_file: Optional[str] = None,
    log_dir: Optional[str] = None,
    replication_strategy: Optional[str] = None,
) -> None:
    """Perform all actions necessary to prepare (plan) for a tap execution.

    1. Capture raw catalog schema using discover (if needed or if --rescan).
    2. If it exists, use the 'custom' catalog file in place of the 'raw' catalog.
    3. Create the plan file and 'selected' version of the raw schema using
       `*.rules.txt`.
        - Add primary-key and replication-key to the catalog.json file if specified in the
          rules file.
    4. If infer_custom_schema=true:
        - Use the 'selected' catalog to execute a dry run for 'infer_custom_schema'.
        - Create or update the 'custom' catalog file using inferred schema.
        - Rebuild the plan file and rebuild the 'selected' catalog.

    Parameters
    ----------
    tap_name : {str}
        The name of the tap without the 'tap-' prefix.
    dockerized : {bool}
        If specified, will override the default behavior for the local platform.
    tap_exe : {str}
        Specifies the tap executable, if different from `tap-{tap_name}`.
    rescan : {bool}
        True to force a rescan and replace existing metadata.
    infer_custom_schema : {bool}
        True to infer schema by performing a dry run data sync.
    taps_dir : {str}
        The directory containing the rules file. (Default=cwd)
    config_dir : {str}
        The default location of config, catalog and other potentially sensitive
        information. (Recommended to be excluded from source control.)
        (Default="${taps_dir}/.secrets")
    config_file : {str}
        The location of the JSON config file which contains config for the
        specified plugin. (Default="${config_dir}/${plugin_name}-config.json")
    log_dir : {str}
        Optional. The location to publish logs and other artifacts. If omitted, no
        extra publishing will be performed.
    replication_strategy : {str}
        One of "FULL_TABLE", "INCREMENTAL", or "LOG_BASED"; by default "INCREMENTAL" or
        a value is set in the TAP_{TAPNAME}_REPLICATION_STRATEGY environment variable.

    Raises
    ------
    ValueError
        Raised if an argument value is not within expected domain.
    FileExistsError
        Raised if files do not exist in default locations, or if paths provided do not
        point to valid files.
    """
    config.print_version()

    taps_dir = config.get_taps_dir(taps_dir)
    config_file, tap_settings = config.get_or_create_config(
        f"tap-{tap_name}",
        taps_dir=taps_dir,
        config_dir=config_dir,
        config_file=config_file,
    )
    tap_exe = tap_exe or tap_settings.get("EXE", f"tap-{tap_name}")
    replication_strategy = replication_strategy or tap_settings.get(
        "REPLICATION_STRATEGY", "INCREMENTAL"
    )
    config.validate_replication_strategy(replication_strategy)
    catalog_dir = config.get_tap_output_dir(tap_name, taps_dir)
    log_dir = config.get_log_dir(log_dir)
    raw_catalog_file = config.get_raw_catalog_file(
        taps_dir, catalog_dir, tap_name, allow_custom=True
    )
    selected_catalog_file = f"{catalog_dir}/{tap_name}-catalog-selected.json"
    plan_file = config.get_plan_file(tap_name, taps_dir, required=False)
    if rescan or not uio.file_exists(raw_catalog_file):
        # Run discover, if needed, to get catalog.json (raw)
        _discover(
            tap_name,
            taps_dir,
            config_file=config_file,
            catalog_dir=catalog_dir,
            dockerized=dockerized,
            tap_exe=tap_exe,
        )
    config.push_logs(log_dir, [raw_catalog_file])
    logging.info(f"Using catalog file for initial plan: {raw_catalog_file}")
    rules_file = config.get_rules_file(taps_dir, tap_name)
    _check_rules(
        tap_name=tap_name,
        catalog_file=raw_catalog_file,
        rules_file=rules_file,
        plan_file_out=plan_file,
        selected_catalog_file_out=selected_catalog_file,
        replication_strategy=replication_strategy,
        log_dir=log_dir,
    )
    if infer_custom_schema:
        custom_catalog_file = _infer_schema(
            tap_name,
            taps_dir,
            raw_catalog_file=raw_catalog_file,
            selected_catalog_file=selected_catalog_file,
            config_file=config_file,
            catalog_dir=catalog_dir,
            dockerized=dockerized,
            tap_exe=tap_exe,
        )
        config.push_logs(log_dir, [custom_catalog_file])
        _check_rules(
            tap_name=tap_name,
            catalog_file=custom_catalog_file,
            rules_file=rules_file,
            plan_file_out=plan_file,
            selected_catalog_file_out=selected_catalog_file,
            replication_strategy=replication_strategy,
            log_dir=log_dir,
        )
    _validate_selected_catalog(tap_name, selected_catalog_file=selected_catalog_file)
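A minimal usage sketch (the tap name is hypothetical):

# Rescan the source catalog and infer a custom schema via a dry-run sync:
plan("mysql", rescan=True, infer_custom_schema=True)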