Example #1
    def test_exec_failure(self, db_mock_class):
        """
        Test the execute function in the case where the run fails.
        """
        run = {
            'new_cluster': NEW_CLUSTER,
            'notebook_task': NOTEBOOK_TASK,
        }
        op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=run)
        db_mock = db_mock_class.return_value
        db_mock.submit_run.return_value = 1
        db_mock.get_run_state.return_value = RunState('TERMINATED', 'FAILED',
                                                      '')

        with self.assertRaises(AirflowException):
            op.execute(None)

        expected = databricks_operator._deep_string_coerce({
            'new_cluster': NEW_CLUSTER,
            'notebook_task': NOTEBOOK_TASK,
            'run_name': TASK_ID,
        })
        db_mock_class.assert_called_once_with(
            DEFAULT_CONN_ID,
            retry_limit=op.databricks_retry_limit,
            retry_delay=op.databricks_retry_delay)
        db_mock.submit_run.assert_called_once_with(expected)
        db_mock.get_run_page_url.assert_called_once_with(RUN_ID)
        db_mock.get_run_state.assert_called_once_with(RUN_ID)
        self.assertEqual(RUN_ID, op.run_id)
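The hook-mocking tests on this page depend on module-level fixtures and a patched hook that the excerpts omit. A minimal sketch of plausible definitions follows; the patch target and all constant values are assumptions, not the original module's code.

# Hypothetical fixtures assumed by the hook-mocking tests above (all values illustrative).
TASK_ID = 'databricks-submit-run'
DEFAULT_CONN_ID = 'databricks_default'
RUN_ID = 1                           # matches db_mock.submit_run.return_value = 1
NEW_CLUSTER = {'spark_version': '2.1.0-db3-scala2.11', 'node_type_id': 'r3.xlarge', 'num_workers': 1}
NOTEBOOK_TASK = {'notebook_path': '/test/notebook'}

# The db_mock_class argument is injected by patching the hook on each test, e.g.:
# @mock.patch('airflow.providers.databricks.operators.databricks.DatabricksHook')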
Example #2
    def test_exec_success(self, db_mock_class):
        """
        Test the execute function in the case where the run is successful.
        """
        run = {
            'new_cluster': NEW_CLUSTER,
            'notebook_task': NOTEBOOK_TASK,
        }
        op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=run)
        db_mock = db_mock_class.return_value
        db_mock.submit_run.return_value = 1
        db_mock.get_run_state.return_value = RunState('TERMINATED', 'SUCCESS',
                                                      '')

        op.execute(None)

        expected = databricks_operator._deep_string_coerce({
            'new_cluster': NEW_CLUSTER,
            'notebook_task': NOTEBOOK_TASK,
            'run_name': TASK_ID
        })
        db_mock_class.assert_called_once_with(
            DEFAULT_CONN_ID,
            retry_limit=op.databricks_retry_limit,
            retry_delay=op.databricks_retry_delay)

        db_mock.submit_run.assert_called_once_with(expected)
        db_mock.get_run_page_url.assert_called_once_with(RUN_ID)
        db_mock.get_run_state.assert_called_once_with(RUN_ID)
        assert RUN_ID == op.run_id
Example #3
    def test_init_with_bad_type(self):
        json = {'test': datetime.now()}
        # Looks a bit weird since we have to escape regex reserved symbols.
        exception_message = r'Type \<(type|class) \'datetime.datetime\'\> used ' + \
                            r'for parameter json\[test\] is not a number or a string'
        with self.assertRaisesRegex(AirflowException, exception_message):
            DatabricksSubmitRunOperator(task_id=TASK_ID, json=json)
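Example #3 pins down the contract of databricks_operator._deep_string_coerce: strings pass through, numbers are stringified, lists and dicts are walked recursively, and any other type (here datetime) raises AirflowException with the message the test matches. A rough sketch of such a helper, for orientation only; the provider's actual implementation may differ in detail:

from airflow.exceptions import AirflowException


def _deep_string_coerce(content, json_path='json'):
    # Strings are kept, numbers become strings, containers recurse with an
    # updated path, and anything else triggers the error Example #3 expects.
    if isinstance(content, str):
        return content
    elif isinstance(content, (int, float)):
        return str(content)
    elif isinstance(content, (list, tuple)):
        return [_deep_string_coerce(e, f'{json_path}[{i}]') for i, e in enumerate(content)]
    elif isinstance(content, dict):
        return {k: _deep_string_coerce(v, f'{json_path}[{k}]') for k, v in content.items()}
    raise AirflowException(
        f'Type {type(content)} used for parameter {json_path} is not a number or a string'
    )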
Example #4
    def test_init_with_specified_run_name(self):
        """
        Test the initializer with a specified run_name.
        """
        json = {'new_cluster': NEW_CLUSTER, 'notebook_task': NOTEBOOK_TASK, 'run_name': RUN_NAME}
        op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=json)
        expected = databricks_operator._deep_string_coerce(
            {'new_cluster': NEW_CLUSTER, 'notebook_task': NOTEBOOK_TASK, 'run_name': RUN_NAME}
        )
        self.assertDictEqual(expected, op.json)
Example #5
    def test_on_kill(self, db_mock_class):
        run = {
            'new_cluster': NEW_CLUSTER,
            'notebook_task': NOTEBOOK_TASK,
        }
        op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=run)
        db_mock = db_mock_class.return_value
        op.run_id = RUN_ID

        op.on_kill()

        db_mock.cancel_run.assert_called_once_with(RUN_ID)
Example #6
    def test_init_with_spark_python_task_named_parameters(self):
        """
        Test the initializer with named parameters, using a Spark Python task.
        """
        op = DatabricksSubmitRunOperator(
            task_id=TASK_ID, new_cluster=NEW_CLUSTER, spark_python_task=SPARK_PYTHON_TASK
        )
        expected = databricks_operator._deep_string_coerce(
            {'new_cluster': NEW_CLUSTER, 'spark_python_task': SPARK_PYTHON_TASK, 'run_name': TASK_ID}
        )

        self.assertDictEqual(expected, op.json)
Example #7
    def test_init_with_json(self):
        """
        Test the initializer with json data.
        """
        json = {'new_cluster': NEW_CLUSTER, 'notebook_task': NOTEBOOK_TASK}
        op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=json)
        expected = databricks_operator._deep_string_coerce({
            'new_cluster': NEW_CLUSTER,
            'notebook_task': NOTEBOOK_TASK,
            'run_name': TASK_ID
        })
        assert expected == op.json
Example #8
    def test_init_with_named_parameters(self):
        """
        Test the initializer with the named parameters.
        """
        op = DatabricksSubmitRunOperator(task_id=TASK_ID,
                                         new_cluster=NEW_CLUSTER,
                                         notebook_task=NOTEBOOK_TASK)
        expected = databricks_operator._deep_string_coerce({
            'new_cluster': NEW_CLUSTER,
            'notebook_task': NOTEBOOK_TASK,
            'run_name': TASK_ID
        })

        self.assertDictEqual(expected, op.json)
Example #9
    def test_init_with_templating(self):
        json = {
            'new_cluster': NEW_CLUSTER,
            'notebook_task': TEMPLATED_NOTEBOOK_TASK,
        }
        dag = DAG('test', start_date=datetime.now())
        op = DatabricksSubmitRunOperator(dag=dag, task_id=TASK_ID, json=json)
        op.render_template_fields(context={'ds': DATE})
        expected = databricks_operator._deep_string_coerce({
            'new_cluster': NEW_CLUSTER,
            'notebook_task': RENDERED_TEMPLATED_NOTEBOOK_TASK,
            'run_name': TASK_ID,
        })
        self.assertDictEqual(expected, op.json)
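Example #9 only reads sensibly together with the templated constants it references. Plausible values are sketched below, assuming 'ds' is the only template variable involved; the exact paths and the date are assumptions:

# Illustrative definitions for the templating test (values assumed).
DATE = '2017-01-01'
TEMPLATED_NOTEBOOK_TASK = {'notebook_path': '/test-{{ ds }}'}
RENDERED_TEMPLATED_NOTEBOOK_TASK = {'notebook_path': '/test-2017-01-01'}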
Example #10
    def test_init_with_merging(self):
        """
        Test the initializer when json and other named parameters are both
        provided. The named parameters should override top level keys in the
        json dict.
        """
        override_new_cluster = {'workers': 999}
        json = {
            'new_cluster': NEW_CLUSTER,
            'notebook_task': NOTEBOOK_TASK,
        }
        op = DatabricksSubmitRunOperator(task_id=TASK_ID,
                                         json=json,
                                         new_cluster=override_new_cluster)
        expected = databricks_operator._deep_string_coerce({
            'new_cluster': override_new_cluster,
            'notebook_task': NOTEBOOK_TASK,
            'run_name': TASK_ID,
        })
        self.assertDictEqual(expected, op.json)
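The docstring states the merge rule: keyword arguments replace matching top-level keys of the json dict rather than being deep-merged. In plain dict terms, the asserted behaviour is roughly the following sketch (not the operator's actual constructor code):

# Top-level merge asserted by Example #10: the named parameter replaces the
# whole 'new_cluster' entry; nested keys from NEW_CLUSTER are not preserved.
merged = {**json, 'new_cluster': override_new_cluster}
assert merged['new_cluster'] == {'workers': 999}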
Example #11
    new_cluster = {
        'spark_version': '2.1.0-db3-scala2.11',
        'node_type_id': 'r3.xlarge',
        'aws_attributes': {
            'availability': 'ON_DEMAND'
        },
        'num_workers': 8
    }

    notebook_task_params = {
        'new_cluster': new_cluster,
        'notebook_task': {
            'notebook_path': '/Users/[email protected]/PrepareData',
        },
    }
    # Example of using the JSON parameter to initialize the operator.
    notebook_task = DatabricksSubmitRunOperator(task_id='notebook_task',
                                                json=notebook_task_params)

    # Example of using the named parameters of DatabricksSubmitRunOperator
    # to initialize the operator.
    spark_jar_task = DatabricksSubmitRunOperator(
        task_id='spark_jar_task',
        new_cluster=new_cluster,
        spark_jar_task={'main_class_name': 'com.example.ProcessData'},
        libraries=[{
            'jar': 'dbfs:/lib/etl-0.1.jar'
        }])

    notebook_task >> spark_jar_task
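For comparison, the named-parameter form of spark_jar_task above corresponds to a single json payload; as the unit tests earlier on this page show, run_name falls back to the task_id when it is not supplied. The dict below is an illustrative equivalent, not taken from the original DAG:

# json equivalent of the named-parameter spark_jar_task above (illustrative).
spark_jar_task_params = {
    'new_cluster': new_cluster,
    'spark_jar_task': {'main_class_name': 'com.example.ProcessData'},
    'libraries': [{'jar': 'dbfs:/lib/etl-0.1.jar'}],
    'run_name': 'spark_jar_task',  # the default would be the task_id anyway
}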
Example #12
) as dag:
    new_cluster = {
        'spark_version': '6.4.x-scala2.11',
        'node_type_id': 'r3.xlarge',
        'aws_attributes': {'availability': 'ON_DEMAND'},
        'num_workers': 2,
    }

    notebook_task_params1 = {
        'new_cluster': new_cluster,
        'notebook_task': {
            'notebook_path': '/Users/[email protected]/temp/nb1',
        },
    }
    # Example of using the JSON parameter to initialize the operator.
    notebook_task1 = DatabricksSubmitRunOperator(task_id='notebook_task1', json=notebook_task_params1)

    notebook_task_params2 = {
        'new_cluster': new_cluster,
        'notebook_task': {
            'notebook_path': '/Users/[email protected]/temp/nb2',
        },
    }
    # Example of using the JSON parameter to initialize the operator.
    notebook_task2 = DatabricksSubmitRunOperator(task_id='notebook_task2', json=notebook_task_params2)

    notebook_task_params3 = {
        'new_cluster': new_cluster,
        'notebook_task': {
            'notebook_path': '/Users/[email protected]/temp/nb3',
        },
    }
    # Example of using the JSON parameter to initialize the operator.
    notebook_task3 = DatabricksSubmitRunOperator(task_id='notebook_task3', json=notebook_task_params3)
Example #13
with DAG(
        'orchestration_good_practices',
        start_date=datetime(2021, 1, 1),
        schedule_interval='@daily',
        catchup=False,
        default_args={
            'owner': 'airflow',
            'email_on_failure': False,
            'retries': 1,
            'retry_delay': timedelta(minutes=1)
        },
) as dag:

    opr_refresh_mat_view = PostgresOperator(
        task_id='refresh_mat_view',
        postgres_conn_id='postgres_default',
        sql='REFRESH MATERIALIZED VIEW example_view;',
    )

    opr_submit_run = DatabricksSubmitRunOperator(
        task_id='submit_run',
        databricks_conn_id='databricks',
        new_cluster=new_cluster,
        notebook_task=notebook_task)
    opr_run_now = DatabricksRunNowOperator(task_id='run_now',
                                           databricks_conn_id='databricks',
                                           job_id=5,
                                           notebook_params=notebook_params)

    opr_refresh_mat_view >> opr_submit_run >> opr_run_now
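Example #13 leaves new_cluster, notebook_task and notebook_params undefined; in the full DAG file they would be declared before the with DAG(...) block. A plausible set of definitions, with the names kept and the values assumed:

# Assumed definitions for the names used above (the final databricks_dag example reuses them).
new_cluster = {
    'spark_version': '6.4.x-scala2.11',
    'node_type_id': 'r3.xlarge',
    'num_workers': 2,
}
notebook_task = {'notebook_path': '/Users/[email protected]/quickstart_notebook'}
notebook_params = {'Variable': 5}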
Example #14
    update_repo = DatabricksReposUpdateOperator(task_id='update_repo',
                                                repo_path=repo_path,
                                                branch="releases")
    # [END howto_operator_databricks_repo_update]

    notebook_task_params = {
        'new_cluster': {
            'spark_version': '9.1.x-scala2.12',
            'node_type_id': 'r3.xlarge',
            'aws_attributes': {
                'availability': 'ON_DEMAND'
            },
            'num_workers': 8,
        },
        'notebook_task': {
            'notebook_path': f'{repo_path}/PrepareData',
        },
    }

    notebook_task = DatabricksSubmitRunOperator(task_id='notebook_task',
                                                json=notebook_task_params)

    # [START howto_operator_databricks_repo_delete]
    # Example of deleting a Databricks Repo
    repo_path = "/Repos/[email protected]/demo-repo"
    delete_repo = DatabricksReposDeleteOperator(task_id='delete_repo',
                                                repo_path=repo_path)
    # [END howto_operator_databricks_repo_delete]

    (create_repo >> update_repo >> notebook_task >> delete_repo)
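The dependency chain above also references create_repo, whose definition sits in the '[START howto_operator_databricks_repo_create]' block that this excerpt trims off. A plausible sketch, assuming DatabricksReposCreateOperator from the same provider; the git_url value is illustrative:

# Hypothetical reconstruction of the trimmed create_repo task (git_url assumed).
create_repo = DatabricksReposCreateOperator(
    task_id='create_repo',
    repo_path=repo_path,
    git_url='https://github.com/example/demo-repo.git',
)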
    "depends_on_past": False,
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=2),
}

with DAG(
        "databricks_dag",
        start_date=datetime(2021, 1, 1),
        schedule_interval="@daily",
        catchup=False,
        default_args=default_args,
) as dag:

    opr_submit_run = DatabricksSubmitRunOperator(
        task_id="submit_run",
        databricks_conn_id="databricks",
        new_cluster=new_cluster,
        notebook_task=notebook_task,
    )

    opr_run_now = DatabricksRunNowOperator(
        task_id="run_now",
        databricks_conn_id="databricks",
        job_id=5,
        notebook_params=notebook_params,
    )

    opr_submit_run >> opr_run_now