Example #1
    def emr_operator(self, dag):
        spark_submit_command = [
            "spark-submit",
            "--master",
            "yarn",
            "--name",
            "{{task.task_id}}",
            "script.py",
            "input.csv",
            "output.csv",
        ]

        step_command = dict(
            Name="{{task.task_id}}",
            ActionOnFailure="CONTINUE",
            HadoopJarStep=dict(Jar="command-runner.jar",
                               Args=spark_submit_command),
        )

        operator = EmrAddStepsOperator(task_id="emr_task",
                                       job_flow_id=1,
                                       steps=[step_command],
                                       dag=dag)
        env = {
            "AIRFLOW_CTX_DAG_ID": "test_dag",
            "AIRFLOW_CTX_EXECUTION_DATE": "emr_task",
            "AIRFLOW_CTX_TASK_ID": "1970-01-01T0000.000",
            "AIRFLOW_CTX_TRY_NUMBER": "1",
            "AIRFLOW_CTX_UID": get_airflow_instance_uid(),
        }

        with wrap_operator_with_tracking_info(env, operator):
            return operator
Example #2
def extract_airflow_context(airflow_context):
    # type: (Dict[str, Any]) -> Optional[AirflowTaskContext]
    """Create AirflowTaskContext for airflow_context dict"""

    task_instance = airflow_context.get("task_instance")
    if task_instance is None:
        return None

    dag_id = task_instance.dag_id
    task_id = task_instance.task_id
    execution_date = str(task_instance.execution_date)
    try_number = task_instance.try_number

    if dag_id and task_id and execution_date:
        return AirflowTaskContext(
            dag_id=dag_id,
            execution_date=execution_date,
            task_id=task_id,
            try_number=try_number,
            context=airflow_context,
            airflow_instance_uid=get_airflow_instance_uid(),
        )

    logger.debug(
        "airflow context is incomplete, at least one of these params is missing: "
        "dag_id: {}, execution_date: {}, task_id: {}".format(
            dag_id, execution_date, task_id))
    return None
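
A minimal usage sketch, assuming the helper is called from inside a running operator; the execute method below is illustrative, not part of the source:

def execute(self, context):
    # "context" is the Airflow task context dict; outside a real task run
    # there is no "task_instance" key and the helper returns None
    task_context = extract_airflow_context(context)
    if task_context is not None:
        print(task_context.dag_id, task_context.task_id,
              task_context.try_number)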
Example #3
    def get_plugin_metadata(self) -> PluginMetadata:
        from airflow import version as airflow_version

        import dbnd_airflow

        from dbnd_airflow.export_plugin.compat import get_api_mode

        return PluginMetadata(
            airflow_version=airflow_version.version,
            plugin_version=dbnd_airflow.__version__ + " v2",
            airflow_instance_uid=get_airflow_instance_uid(),
            api_mode=get_api_mode(),
        )
Example #4
def get_meta(metrics):
    import flask

    meta = AirflowExportMeta(
        airflow_version=airflow_version,
        plugin_version=" ".join([dbnd_airflow.__version__, "v2"]),
        airflow_instance_uid=get_airflow_instance_uid(),
        api_mode=get_api_mode(),
        request_args=dict(flask.request.args)
        if flask.has_request_context() else {},
        metrics={
            "performance": metrics.get("perf_metrics", {}),
            "sizes": metrics.get("size_metrics", {}),
        },
    )
    return meta
Example #5
def context_to_airflow_vars(context, in_env_var_format=False):
    # _original_context_to_airflow_vars is saved when patch_models() overrides the original function
    params = airflow.utils.operator_helpers._original_context_to_airflow_vars(
        context=context, in_env_var_format=in_env_var_format)
    if in_env_var_format:
        task_run = try_get_current_task_run()  # type: TaskRun
        if task_run:
            params = extend_airflow_ctx_with_dbnd_tracking_info(
                task_run, params)

    try_number = str(context['task_instance'].try_number)
    params.update({
        "AIRFLOW_CTX_TRY_NUMBER": try_number,
        "AIRFLOW_CTX_UID": get_airflow_instance_uid()
    })
    return params
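
A hedged sketch of how the returned mapping is typically consumed; the os.environ.update call here stands in for the export step that Airflow's task runner performs before launching a task:

import os

# "context" is assumed to be the Airflow task context of a running task
params = context_to_airflow_vars(context, in_env_var_format=True)
# expose AIRFLOW_CTX_* (including the dbnd additions) to child processes
os.environ.update(params)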
Example #6
    def spark_submit_operator(self, dag):
        operator = SparkSubmitOperator(
            task_id="spark_submit_task",
            application="script.py",
            application_args=["input.csv", "output.csv"],
            dag=dag,
        )

        env = {
            "AIRFLOW_CTX_DAG_ID": "test_dag",
            "AIRFLOW_CTX_EXECUTION_DATE": "spark_submit_task",
            "AIRFLOW_CTX_TASK_ID": "1970-01-01T0000.000",
            "AIRFLOW_CTX_TRY_NUMBER": "1",
            "AIRFLOW_CTX_UID": get_airflow_instance_uid(),
        }

        with wrap_operator_with_tracking_info(env, operator):
            return operator
Example #7
def extract_airflow_conf(context):
    task_instance = context.get("task_instance")
    if task_instance is None:
        return {}

    dag_id = task_instance.dag_id
    task_id = task_instance.task_id
    execution_date = str(task_instance.execution_date)
    try_number = str(task_instance.try_number)

    if dag_id and task_id and execution_date:
        return {
            "AIRFLOW_CTX_DAG_ID": dag_id,
            "AIRFLOW_CTX_EXECUTION_DATE": execution_date,
            "AIRFLOW_CTX_TASK_ID": task_id,
            "AIRFLOW_CTX_TRY_NUMBER": try_number,
            "AIRFLOW_CTX_UID": get_airflow_instance_uid(),
        }
    return {}
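
A usage sketch under the assumption that the caller wants to forward the tracking variables to a child process; subprocess and the script name are illustrative:

import os
import subprocess

env = dict(os.environ)
env.update(extract_airflow_conf(context))  # empty dict outside a task run
# the child process sees AIRFLOW_CTX_* and can attach itself to this run
subprocess.run(["python", "script.py"], env=env, check=True)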
Example #8
def get_airflow_conf(
    dag_id="{{dag.dag_id}}",
    task_id="{{task.task_id}}",
    execution_date="{{ts}}",
    try_number="{{task_instance._try_number}}",
):
    """
    These properties are:
        AIRFLOW_CTX_DAG_ID - name of the Airflow DAG to associate a run with
        AIRFLOW_CTX_EXECUTION_DATE - execution_date to associate a run with
        AIRFLOW_CTX_TASK_ID - name of the Airflow Task to associate a run with
        AIRFLOW_CTX_TRY_NUMBER - try number of the Airflow Task to associate a run with
    """
    airflow_conf = {
        "AIRFLOW_CTX_DAG_ID": dag_id,
        "AIRFLOW_CTX_EXECUTION_DATE": execution_date,
        "AIRFLOW_CTX_TASK_ID": task_id,
        "AIRFLOW_CTX_TRY_NUMBER": try_number,
        "AIRFLOW_CTX_UID": get_airflow_instance_uid(),
    }
    airflow_conf.update(get_databand_url_conf())
    return airflow_conf
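
Because the default arguments are Jinja templates, the returned dict only becomes concrete when Airflow renders a templated field. A sketch assuming a BashOperator, whose env field is templated (the import path is the Airflow 2 one, and dag is assumed to exist):

from airflow.operators.bash import BashOperator

# "env" is a templated field, so the {{...}} values are rendered per task run
track_task = BashOperator(
    task_id="tracked_script",
    bash_command="python script.py",
    env=get_airflow_conf(),
    dag=dag,
)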
Example #9
    def execute(self, context):
        logger.debug("Running dbnd task from airflow operator %s",
                     self.task_id)

        dag = context["dag"]
        execution_date = context["execution_date"]
        dag_id = dag.dag_id
        run_uid = get_job_run_uid(
            airflow_instance_uid=get_airflow_instance_uid(),
            dag_id=dag_id,
            execution_date=execution_date,
        )

        # Airflow has updated all relevant fields in the operator definition
        # with XCom values; now we can create a real dbnd task with real
        # references to its inputs
        new_kwargs = {}
        for p_name in self.dbnd_task_params_fields:
            new_kwargs[p_name] = getattr(self, p_name, None)
            # this is the real input value after XCom resolution
            if p_name in self.dbnd_xcom_inputs:
                new_kwargs[p_name] = target(new_kwargs[p_name])

        new_kwargs["_dbnd_disable_airflow_inplace"] = True
        dag_ctrl = self.get_dbnd_dag_ctrl()
        with DatabandContext.context(_context=dag_ctrl.dbnd_context) as dc:
            logger.debug("Running %s with kwargs=%s ", self.task_id,
                         new_kwargs)
            dbnd_task = dc.task_instance_cache.get_task_by_id(
                self.dbnd_task_id)
            # rebuild task with new values coming from xcom->operator
            with dbnd_task.ctrl.task_context(phase=TaskContextPhase.BUILD):
                dbnd_task = dbnd_task.clone(
                    output_params_to_clone=self.dbnd_overridden_output_params,
                    **new_kwargs)

            logger.debug("Creating inplace databand run for driver dump")
            dag_task = Task(task_name=dag.dag_id,
                            task_target_date=execution_date)
            dag_task.set_upstream(dbnd_task)

            # create databand run
            with new_databand_run(
                    context=dc,
                    job_name=dag.dag_id,
                    run_uid=run_uid,
                    existing_run=False,
            ) as dr:  # type: DatabandRun
                dr.run_executor = run_executor = RunExecutor(
                    run=dr,
                    root_task_or_task_name=dag_task,
                    send_heartbeat=False)
                run_executor._init_task_runs_for_execution(
                    task_engine=run_executor.local_engine)

                # dr.driver_task_run.set_task_run_state(state=TaskRunState.RUNNING)
                # "make dag run"
                # dr.root_task_run.set_task_run_state(state=TaskRunState.RUNNING)
                dbnd_task_run = dr.get_task_run_by_id(dbnd_task.task_id)

                needs_databand_run_save = dbnd_task._conf__require_run_dump_file
                if needs_databand_run_save:
                    run_executor.save_run_pickle()

                logger.info(
                    dbnd_task.ctrl.banner("Running task '%s'." %
                                          dbnd_task.task_name,
                                          color="cyan"))
                # should be replaced with a tr._execute call
                dbnd_task_run.runner.execute(airflow_context=context,
                                             handle_sigterm=False)

            logger.debug("Finished to run %s", self)
            result = {
                output_name:
                convert_to_safe_types(getattr(dbnd_task, output_name))
                for output_name in self.dbnd_xcom_outputs
            }
        return result