Example #1
def try_get_airflow_context():
    # type: () -> Optional[AirflowTaskContext]
    # first try to get from spark
    try:
        from_spark = try_get_airflow_context_from_spark_conf()
        if from_spark:
            return from_spark
        else:
            _debug_init_print("couldn't get airflow context from spark")

        # Those env vars are set by airflow before running the operator
        dag_id = os.environ.get("AIRFLOW_CTX_DAG_ID")
        execution_date = os.environ.get("AIRFLOW_CTX_EXECUTION_DATE")
        task_id = os.environ.get("AIRFLOW_CTX_TASK_ID")
        try:
            try_number = _get_try_number()
        except Exception:
            try_number = None

        if dag_id and task_id and execution_date:
            return AirflowTaskContext(
                dag_id=dag_id,
                execution_date=execution_date,
                task_id=task_id,
                try_number=try_number,
            )
        return None
    except Exception:
        return None
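A minimal usage sketch of the environment-variable path above, assuming try_get_airflow_context and AirflowTaskContext are already in scope (their import paths are not shown) and that AirflowTaskContext exposes its constructor arguments as attributes:

import os

# Airflow exports these variables before running an operator; setting them by
# hand here only simulates that environment for illustration.
os.environ["AIRFLOW_CTX_DAG_ID"] = "my_dag"
os.environ["AIRFLOW_CTX_TASK_ID"] = "my_task"
os.environ["AIRFLOW_CTX_EXECUTION_DATE"] = "2021-01-01T00:00:00+00:00"

context = try_get_airflow_context()
if context is not None:
    print(context.dag_id, context.task_id, context.execution_date)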
Example #2
def verify_spark_pre_conditions():
    if spark_tracking_enabled() and _SPARK_ENV_FLAG in os.environ:
        if _is_dbnd_spark_installed():
            return True
        else:
            _debug_init_print("failed to import pyspark or dbnd-spark")
    else:
        _debug_init_print(
            "DBND__ENABLE__SPARK_CONTEXT_ENV or SPARK_ENV_LOADED are not set")
    return False
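A small sketch of the two gates this function checks; the assumption, suggested only by the debug message, is that _SPARK_ENV_FLAG refers to SPARK_ENV_LOADED and that spark_tracking_enabled() reflects DBND__ENABLE__SPARK_CONTEXT_ENV:

import os

# Both flags must be present; pyspark and dbnd-spark must also be importable
# for _is_dbnd_spark_installed() to pass.
os.environ["SPARK_ENV_LOADED"] = "1"
os.environ["DBND__ENABLE__SPARK_CONTEXT_ENV"] = "True"

if verify_spark_pre_conditions():
    print("spark tracking pre-conditions are satisfied")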
Example #3
def read_spark_environ_config():
    logger.debug("running read_spark_environ_config")

    from pyspark import SparkContext

    _debug_init_print("creating spark context to read spark conf")
    spark_conf = SparkContext.getOrCreate().getConf()
    spark_conf = dict(spark_conf.getAll())
    # strip the literal "spark.env." prefix; str.lstrip("spark.env.") would remove
    # any leading run of those characters and could mangle key names
    prefix = "spark.env."
    spark_conf = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in spark_conf.items()
    }

    return get_environ_config_from_dict(spark_conf, "environ")
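A short sketch of the conf shape this function consumes, assuming a local SparkContext and that read_spark_environ_config is in scope; spark.env.DBND__SOME_SETTING is a hypothetical entry used only to show the prefix being stripped:

from pyspark import SparkConf, SparkContext

conf = SparkConf().set("spark.env.DBND__SOME_SETTING", "value")
sc = SparkContext.getOrCreate(conf)

# the "spark.env." prefix is stripped, so get_environ_config_from_dict receives
# "DBND__SOME_SETTING" alongside the rest of the Spark conf keys, which are left untouched
config_store = read_spark_environ_config()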
Example #4
def dbnd_setup_plugin():
    from dbnd_spark.local.local_spark_config import SparkLocalEngineConfig
    from dbnd_spark.spark_bootstrap import dbnd_spark_bootstrap

    register_config_cls(SparkLocalEngineConfig)
    register_config_cls(LivySparkConfig)

    dbnd_spark_bootstrap()

    if has_pyspark_imported() and spark_tracking_enabled():
        config_store = read_spark_environ_config()
        dbnd_config.set_values(config_store, "system")
    else:
        _debug_init_print(
            "spark conf is not loaded since pyspark is not imported or DBND__ENABLE__SPARK_CONTEXT_ENV is not set"
        )
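A minimal sketch of the branch that loads the Spark-provided configuration; the calls mirror the function body above, and the top-level dbnd_config import is an assumption (the plugin module's own imports are not shown):

from dbnd import dbnd_config

if has_pyspark_imported() and spark_tracking_enabled():
    # read spark.env.* values from the live SparkContext (Example #3) and layer
    # them into dbnd's configuration at "system" priority, as the plugin hook does
    dbnd_config.set_values(read_spark_environ_config(), "system")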
Example #5
def try_get_airflow_context_from_spark_conf():
    # type: () -> Optional[AirflowTaskContext]
    if not spark_tracking_enabled() or _SPARK_ENV_FLAG not in os.environ:
        _debug_init_print(
            "DBND__ENABLE__SPARK_CONTEXT_ENV or SPARK_ENV_LOADED are not set")
        return None

    if not _is_dbnd_spark_installed():
        _debug_init_print("failed to import pyspark or dbnd-spark")
        return None
    try:
        _debug_init_print("creating spark context to get spark conf")
        from pyspark import SparkContext

        conf = SparkContext.getOrCreate().getConf()

        dag_id = conf.get("spark.env.AIRFLOW_CTX_DAG_ID")
        execution_date = conf.get("spark.env.AIRFLOW_CTX_EXECUTION_DATE")
        task_id = conf.get("spark.env.AIRFLOW_CTX_TASK_ID")
        try_number = conf.get("spark.env.AIRFLOW_CTX_TRY_NUMBER")

        if dag_id and task_id and execution_date:
            return AirflowTaskContext(
                dag_id=dag_id,
                execution_date=execution_date,
                task_id=task_id,
                try_number=try_number,
            )
    except Exception as ex:
        logger.info("Failed to get airflow context info from spark job: %s",
                    ex)

    return None
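A sketch of how those spark.env.* keys usually end up in the job's SparkConf; the values are placeholders, and in practice the Airflow side fills them in rather than the job author:

from pyspark import SparkConf, SparkContext

conf = (
    SparkConf()
    .set("spark.env.AIRFLOW_CTX_DAG_ID", "my_dag")
    .set("spark.env.AIRFLOW_CTX_TASK_ID", "my_task")
    .set("spark.env.AIRFLOW_CTX_EXECUTION_DATE", "2021-01-01T00:00:00+00:00")
    .set("spark.env.AIRFLOW_CTX_TRY_NUMBER", "1")
)
sc = SparkContext.getOrCreate(conf)

# With the flags from Example #2 also set, try_get_airflow_context_from_spark_conf()
# returns an AirflowTaskContext built from these four values.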
Example #6
def dbnd_spark_bootstrap():
    global _DBND_REGISTER_SPARK_TYPES
    if _DBND_REGISTER_SPARK_TYPES:
        return
    # don't run it twice or in recursion
    _DBND_REGISTER_SPARK_TYPES = True

    _workaround_spark_namedtuple_serialization()

    try:
        _debug_init_print("importing pyspark")

        import pyspark
    except import_errors:  # tuple of import-related exceptions defined elsewhere in dbnd
        # pyspark is not installed, user will not be able to use pyspark types
        return
    # we register spark types only if we have spark installed
    try:
        from dbnd_spark.spark_targets import dbnd_register_spark_types

        dbnd_register_spark_types()
    except Exception:
        pass
    return
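A short sketch of the idempotency guard, assuming the import path shown in Example #4:

from dbnd_spark.spark_bootstrap import dbnd_spark_bootstrap

dbnd_spark_bootstrap()  # applies the namedtuple workaround and registers spark types
dbnd_spark_bootstrap()  # no-op: _DBND_REGISTER_SPARK_TYPES is already True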