Example #1
def log_snowflake_table(
    table_name,  # type: str
    connection_string,  # type: str
    database,  # type: str
    schema,  # type: str
    with_preview=True,
    with_schema=True,
):
    try:
        if not is_plugin_enabled("dbnd-snowflake",
                                 module_import="dbnd_snowflake"):
            return
        from dbnd_snowflake import snowflake_values

        with log_duration("log_snowflake_table__time_seconds",
                          source="system"):
            conn_params = snowflake_values.conn_str_to_conn_params(
                connection_string)
            account = conn_params["account"]
            user = conn_params["user"]
            password = conn_params["password"]

            snowflake_table = snowflake_values.SnowflakeTable(
                account, user, password, database, schema, table_name)

        log_data(
            table_name,
            snowflake_table,
            with_preview=with_preview,
            with_schema=with_schema,
            with_histograms=False,
        )
    except Exception:
        logger.exception("Failed to log_snowflake_table")
Example #2
    def signal_handler(signum, frame):
        logger.info("Task runner received signal. Exiting...")
        if is_plugin_enabled("dbnd-docker") and is_plugin_enabled(
                "dbnd-airflow"):
            from dbnd_docker.kubernetes.kubernetes_engine_config import (
                ENV_DBND_POD_NAME,
            )

            if ENV_DBND_POD_NAME in os.environ:
                # Running inside the cluster: log all pod events on SIGTERM for debugging
                from dbnd_airflow_contrib.kubernetes_metrics_logger import (
                    log_pod_events_on_sigterm,
                )

                log_pod_events_on_sigterm(frame)

        raise DatabandSigTermError("Task received signal",
                                   help_msg="Probably the job was canceled")
Example #3
def should_log_pg_histogram(luigi_task):
    # type: (luigi.Task) -> bool
    if not is_plugin_enabled("dbnd-postgres", module_import="dbnd_postgres"):
        return False
    try:
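        # guard the luigi contrib import separately: it can be unavailable in this environment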
        from luigi.contrib.postgres import PostgresQuery
    except ImportError:
        return False
    return isinstance(luigi_task, PostgresQuery)
Example #4
def dbnd_setup_plugin():
    # register configs
    from dbnd_aws.env import AwsEnvConfig

    register_config_cls(AwsEnvConfig)

    if is_plugin_enabled("dbnd-spark"):
        from dbnd_aws.emr.emr_config import EmrConfig

        register_config_cls(EmrConfig)

    if is_plugin_enabled("dbnd-docker"):
        from dbnd_aws.batch.aws_batch_ctrl import AwsBatchConfig

        register_config_cls(AwsBatchConfig)

    from dbnd_aws.fs import build_s3_fs_client

    register_file_system(FileSystems.s3, build_s3_fs_client)
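
For context, dbnd discovers dbnd_setup_plugin through its pluggy-based plugin system; a minimal sketch of how a plugin module typically exposes the hook, assuming dbnd exports its pluggy hookimpl marker as dbnd.hookimpl:

import dbnd

@dbnd.hookimpl
def dbnd_setup_plugin():
    ...  # register configs and file systems, as in the example above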
Example #5
def dbnd_setup_plugin():
    from dbnd_gcp.dataflow.dataflow_config import DataflowConfig
    from dbnd_gcp.env import GcpEnvConfig

    register_config_cls(GcpEnvConfig)
    register_config_cls(DataflowConfig)

    if is_plugin_enabled("dbnd-spark"):
        from dbnd_gcp.dataproc.dataproc_config import DataprocConfig

        register_config_cls(DataprocConfig)

    register_file_system(FileSystems.gcs, build_gcs_client)
Example #6
def log_snowflake_table(
    table_name: str,
    connection_string: Union[str, SnowflakeConnection],
    database: str,
    schema: str,
    key: Optional[str] = None,
    with_preview: Optional[bool] = None,
    with_schema: Optional[bool] = None,
    raise_on_error: bool = False,
):
    """

    :param table_name: table name
    :param connection_string: either connection_string or actual connection
    :param database:
    :param schema:
    :param key:
    :param with_preview:
    :param with_schema:
    :param raise_on_error:
    :return:
    """
    if not is_plugin_enabled("dbnd-snowflake", module_import="dbnd_snowflake"):
        return
    from dbnd_snowflake import snowflake_values

    with log_duration("log_snowflake_table__time_seconds",
                      source="system"), SnowflakeController(
                          connection_string) as snowflake_ctrl:
        config = SnowflakeConfig()
        snowflake_table = snowflake_values.SnowflakeTable(
            snowflake_ctrl,
            database,
            schema,
            table_name,
            config.table_preview_rows,
        )
        log_data(
            key or table_name,
            snowflake_table,
            with_preview=with_preview,
            with_schema=with_schema,
            with_size=with_schema,
            with_histograms=False,
            raise_on_error=raise_on_error,
        )
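
A usage sketch for the connection-object variant above; snowflake.connector is the official Snowflake Python driver, and all credentials are placeholders:

import snowflake.connector

conn = snowflake.connector.connect(
    account="my_account", user="my_user", password="my_password"
)
log_snowflake_table(
    table_name="ORDERS",
    connection_string=conn,  # a live SnowflakeConnection is accepted here
    database="ANALYTICS",
    schema="PUBLIC",
    key="orders_table",
)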
Example #7
def log_snowflake_table(
    table_name: str,
    connection_string: str,
    database: str,
    schema: str,
    key: Optional[str] = None,
    with_preview: Optional[bool] = None,
    with_schema: Optional[bool] = None,
    raise_on_error: bool = False,
):
    if not is_plugin_enabled("dbnd-snowflake", module_import="dbnd_snowflake"):
        return
    from dbnd_snowflake import snowflake_values

    with log_duration("log_snowflake_table__time_seconds", source="system"):
        conn_params = snowflake_values.conn_str_to_conn_params(connection_string)
        account = conn_params["account"]
        user = conn_params["user"]
        password = conn_params["password"]

        config = SnowflakeConfig()
        snowflake_table = snowflake_values.SnowflakeTable(
            account,
            user,
            password,
            database,
            schema,
            table_name,
            config.table_preview_rows,
        )
        log_data(
            key or table_name,
            snowflake_table,
            with_preview=with_preview,
            with_schema=with_schema,
            with_size=with_schema,
            with_histograms=False,
            raise_on_error=raise_on_error,
        )
Example #8
def log_snowflake_table_targets(
    table_op: TableTargetOperation,
    connection_string: Union[str, SnowflakeConnection],
    with_preview: Optional[bool] = None,
    with_schema: Optional[bool] = None,
):
    if not is_plugin_enabled("dbnd-snowflake", module_import="dbnd_snowflake"):
        return

    from dbnd_snowflake.snowflake_values import SnowflakeTable

    with SnowflakeController(connection_string) as snowflake_ctrl:
        snowflake_table = SnowflakeTable.from_table(snowflake_ctrl, table_op.name)
        log_dataset_op(
            op_path=table_op.path,
            op_type=table_op.operation,
            success=table_op.success,
            data=snowflake_table,
            with_preview=with_preview,
            with_schema=with_schema,
        )
Example #9
def log_pg_table(
    table_name,
    connection_string,
    with_preview=None,  # type: Optional[bool]
    with_schema=None,  # type: Optional[bool]
    with_histograms=None,  # type: Union[LogDataRequest, bool, str, List[str]]
):
    try:
        if not is_plugin_enabled("dbnd-postgres", module_import="dbnd_postgres"):
            return
        from dbnd_postgres import postgres_values

        pg_table = postgres_values.PostgresTable(table_name, connection_string)
        log_data(
            table_name,
            pg_table,
            with_preview=with_preview,
            with_schema=with_schema,
            with_histograms=with_histograms,
        )
    except Exception:
        logger.exception("Failed to log_pg_table")
Example #10
def log_pg_table(
        table_name,
        connection_string,
        with_preview=None,  # type: Optional[bool]
        with_schema=None,  # type: Optional[bool]
        with_histograms=None,  # type: Union[LogDataRequest, bool, str, List[str]]
):
    """
    Log the data of a postgres table to dbnd.

    @param table_name: name of the table to log
    @param connection_string: connection string used to reach the table
    @param with_preview: True to log a preview of the table
    @param with_schema: True to log the schema of the table
    @param with_histograms: True to calculate and log histograms of the table data
    """
    try:
        if not is_plugin_enabled("dbnd-postgres",
                                 module_import="dbnd_postgres"):
            logger.warning(
                "Can't log postgres table: dbnd-postgres package is not installed\n"
                "Help: pip install dbnd-postgres")
            return

        from dbnd_postgres import postgres_values

        pg_table = postgres_values.PostgresTable(table_name, connection_string)
        log_data(
            table_name,
            pg_table,
            with_preview=with_preview,
            with_schema=with_schema,
            with_histograms=with_histograms,
        )
    except Exception:
        logger.exception("Failed to log_pg_table")
Example #11
def calculate_task_executor_type(submit_tasks, remote_engine, settings):
    run_config = settings.run
    parallel = run_config.parallel
    task_executor_type = run_config.task_executor_type

    if task_executor_type is None:
        if is_airflow_enabled():
            from dbnd_airflow.executors import AirflowTaskExecutorType

            task_executor_type = AirflowTaskExecutorType.airflow_inprocess
        else:
            task_executor_type = TaskExecutorType.local

    if is_airflow_enabled():
        from dbnd_airflow.executors import AirflowTaskExecutorType

        if parallel:
            if task_executor_type in (
                TaskExecutorType.local,
                AirflowTaskExecutorType.airflow_inprocess,
            ):
                logger.warning(
                    "Auto switching to engine type '%s' due to parallel mode.",
                    AirflowTaskExecutorType.airflow_multiprocess_local,
                )
                task_executor_type = AirflowTaskExecutorType.airflow_multiprocess_local

            if task_executor_type in (
                AirflowTaskExecutorType.airflow_multiprocess_local,
                AirflowTaskExecutorType.airflow_kubernetes,
            ):
                if "sqlite" in settings.core.sql_alchemy_conn:
                    if settings.run.enable_concurent_sqlite:
                        logger.warning(
                            "You are running parallel execution on top of a sqlite database! (see run.enable_concurent_sqlite)"
                        )
                    else:
                        # In theory sqlite can support a decent amount of parallelism,
                        # but in practice the way airflow works, each process holds the
                        # db exclusively locked, which leads to "sqlite DB is locked" exceptions
                        raise friendly_error.execute_engine.parallel_or_remote_sqlite(
                            "parallel"
                        )

        if is_plugin_enabled("dbnd-docker"):
            from dbnd_docker.kubernetes.kubernetes_engine_config import (
                KubernetesEngineConfig,
            )

            if (
                submit_tasks
                and isinstance(remote_engine, KubernetesEngineConfig)
                and run_config.enable_airflow_kubernetes
            ):
                if task_executor_type != AirflowTaskExecutorType.airflow_kubernetes:
                    logger.info("Using dedicated kubernetes executor for this run")
                    task_executor_type = AirflowTaskExecutorType.airflow_kubernetes
                    parallel = True
    elif parallel:
        logger.warning("Airflow is not installed, parallel mode is not supported")

    all_executor_types = [TaskExecutorType.local]
    if is_airflow_enabled():
        from dbnd_airflow.executors import AirflowTaskExecutorType

        all_executor_types.extend(AirflowTaskExecutorType.all())

    if task_executor_type not in all_executor_types:
        raise DatabandConfigError("Unsupported engine type %s" %
                                  task_executor_type)

    return task_executor_type, parallel
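
All of the examples above share the same guard-then-import idiom; a minimal distillation, with dbnd-example / dbnd_example as hypothetical placeholder names:

def use_optional_feature():
    # Guard first, so the optional package is never imported when the
    # plugin is disabled or not installed.
    if not is_plugin_enabled("dbnd-example", module_import="dbnd_example"):
        return
    from dbnd_example import feature  # safe: the guard guarantees importability

    feature.run()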