@mock.patch('airflow.contrib.operators.databricks_operator.DatabricksHook')  # assumed patch target
def test_exec_failure(self, db_mock_class):
    """
    Test the execute function in case where the run failed.
    """
    run = {
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
    }
    op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=run)
    db_mock = db_mock_class.return_value
    db_mock.submit_run.return_value = 1
    db_mock.get_run_state.return_value = RunState('TERMINATED', 'FAILED', '')

    with self.assertRaises(AirflowException):
        op.execute(None)

    expected = databricks_operator._deep_string_coerce({
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
        'run_name': TASK_ID,
    })
    db_mock_class.assert_called_once_with(
        DEFAULT_CONN_ID,
        retry_limit=op.databricks_retry_limit,
        retry_delay=op.databricks_retry_delay)
    db_mock.submit_run.assert_called_once_with(expected)
    db_mock.get_run_page_url.assert_called_once_with(RUN_ID)
    db_mock.get_run_state.assert_called_once_with(RUN_ID)
    self.assertEqual(RUN_ID, op.run_id)
@mock.patch('airflow.contrib.operators.databricks_operator.DatabricksHook')  # assumed patch target
def test_exec_success(self, db_mock_class):
    """
    Test the execute function in case where the run is successful.
    """
    run = {
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
    }
    op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=run)
    db_mock = db_mock_class.return_value
    db_mock.submit_run.return_value = 1
    db_mock.get_run_state.return_value = RunState('TERMINATED', 'SUCCESS', '')

    op.execute(None)

    expected = databricks_operator._deep_string_coerce({
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
        'run_name': TASK_ID,
    })
    db_mock_class.assert_called_once_with(
        DEFAULT_CONN_ID,
        retry_limit=op.databricks_retry_limit,
        retry_delay=op.databricks_retry_delay)
    db_mock.submit_run.assert_called_once_with(expected)
    db_mock.get_run_page_url.assert_called_once_with(RUN_ID)
    db_mock.get_run_state.assert_called_once_with(RUN_ID)
    self.assertEqual(RUN_ID, op.run_id)
def test_init_with_bad_type(self):
    json = {'test': datetime.now()}
    # Looks a bit weird since we have to escape regex reserved symbols.
    exception_message = r'Type \<(type|class) \'datetime.datetime\'\> used ' + \
                        r'for parameter json\[test\] is not a number or a string'
    with self.assertRaisesRegex(AirflowException, exception_message):
        DatabricksSubmitRunOperator(task_id=TASK_ID, json=json)
def test_init_with_specified_run_name(self):
    """
    Test the initializer with a specified run_name.
    """
    json = {'new_cluster': NEW_CLUSTER, 'notebook_task': NOTEBOOK_TASK, 'run_name': RUN_NAME}
    op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=json)
    expected = databricks_operator._deep_string_coerce(
        {'new_cluster': NEW_CLUSTER, 'notebook_task': NOTEBOOK_TASK, 'run_name': RUN_NAME}
    )
    self.assertDictEqual(expected, op.json)
@mock.patch('airflow.contrib.operators.databricks_operator.DatabricksHook')  # assumed patch target
def test_on_kill(self, db_mock_class):
    run = {
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
    }
    op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=run)
    db_mock = db_mock_class.return_value
    op.run_id = RUN_ID

    op.on_kill()

    db_mock.cancel_run.assert_called_once_with(RUN_ID)
def test_init_with_spark_python_task_named_parameters(self):
    """
    Test the initializer with the named parameters.
    """
    op = DatabricksSubmitRunOperator(
        task_id=TASK_ID, new_cluster=NEW_CLUSTER, spark_python_task=SPARK_PYTHON_TASK
    )
    expected = databricks_operator._deep_string_coerce(
        {'new_cluster': NEW_CLUSTER, 'spark_python_task': SPARK_PYTHON_TASK, 'run_name': TASK_ID}
    )
    self.assertDictEqual(expected, op.json)
def test_init_with_json(self):
    """
    Test the initializer with json data.
    """
    json = {'new_cluster': NEW_CLUSTER, 'notebook_task': NOTEBOOK_TASK}
    op = DatabricksSubmitRunOperator(task_id=TASK_ID, json=json)
    expected = databricks_operator._deep_string_coerce({
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
        'run_name': TASK_ID,
    })
    self.assertDictEqual(expected, op.json)
def test_init_with_named_parameters(self):
    """
    Test the initializer with the named parameters.
    """
    op = DatabricksSubmitRunOperator(task_id=TASK_ID,
                                     new_cluster=NEW_CLUSTER,
                                     notebook_task=NOTEBOOK_TASK)
    expected = databricks_operator._deep_string_coerce({
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
        'run_name': TASK_ID,
    })
    self.assertDictEqual(expected, op.json)
def test_init_with_templating(self):
    json = {
        'new_cluster': NEW_CLUSTER,
        'notebook_task': TEMPLATED_NOTEBOOK_TASK,
    }
    dag = DAG('test', start_date=datetime.now())
    op = DatabricksSubmitRunOperator(dag=dag, task_id=TASK_ID, json=json)
    op.render_template_fields(context={'ds': DATE})
    expected = databricks_operator._deep_string_coerce({
        'new_cluster': NEW_CLUSTER,
        'notebook_task': RENDERED_TEMPLATED_NOTEBOOK_TASK,
        'run_name': TASK_ID,
    })
    self.assertDictEqual(expected, op.json)
def test_init_with_merging(self):
    """
    Test the initializer when json and other named parameters are both provided.
    The named parameters should override top level keys in the json dict.
    """
    override_new_cluster = {'workers': 999}
    json = {
        'new_cluster': NEW_CLUSTER,
        'notebook_task': NOTEBOOK_TASK,
    }
    op = DatabricksSubmitRunOperator(task_id=TASK_ID,
                                     json=json,
                                     new_cluster=override_new_cluster)
    expected = databricks_operator._deep_string_coerce({
        'new_cluster': override_new_cluster,
        'notebook_task': NOTEBOOK_TASK,
        'run_name': TASK_ID,
    })
    self.assertDictEqual(expected, op.json)
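The tests above reference module-level fixtures (TASK_ID, RUN_ID, NEW_CLUSTER, and so on) and receive db_mock_class from a mock.patch decorator. A minimal sketch of definitions consistent with the assertions, assuming contrib-era import paths; the names come from the tests, but these particular values are illustrative rather than the suite's actual ones. Note that RUN_ID must equal 1, the value the mocked submit_run returns.

from unittest import mock

from airflow.contrib.operators import databricks_operator  # assumed import path
from airflow.contrib.operators.databricks_operator import DatabricksSubmitRunOperator

# Illustrative fixture values; only RUN_ID's value is pinned by the mocks above.
TASK_ID = 'databricks-operator'
DEFAULT_CONN_ID = 'databricks_default'
RUN_ID = 1          # must match db_mock.submit_run.return_value
RUN_NAME = 'run-name'
DATE = '2017-04-20'
NEW_CLUSTER = {'spark_version': '2.0.x-scala2.10', 'node_type_id': 'r3.xlarge', 'num_workers': 1}
NOTEBOOK_TASK = {'notebook_path': '/test'}
SPARK_PYTHON_TASK = {'python_file': 'test.py', 'parameters': ['--param', '123']}
TEMPLATED_NOTEBOOK_TASK = {'notebook_path': '/test-{{ ds }}'}
RENDERED_TEMPLATED_NOTEBOOK_TASK = {'notebook_path': '/test-' + DATE}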
new_cluster = {
    'spark_version': '2.1.0-db3-scala2.11',
    'node_type_id': 'r3.xlarge',
    'aws_attributes': {
        'availability': 'ON_DEMAND'
    },
    'num_workers': 8,
}

notebook_task_params = {
    'new_cluster': new_cluster,
    'notebook_task': {
        'notebook_path': '/Users/[email protected]/PrepareData',
    },
}

# Example of using the JSON parameter to initialize the operator.
notebook_task = DatabricksSubmitRunOperator(task_id='notebook_task', json=notebook_task_params)

# Example of using the named parameters of DatabricksSubmitRunOperator
# to initialize the operator.
spark_jar_task = DatabricksSubmitRunOperator(
    task_id='spark_jar_task',
    new_cluster=new_cluster,
    spark_jar_task={'main_class_name': 'com.example.ProcessData'},
    libraries=[{
        'jar': 'dbfs:/lib/etl-0.1.jar'
    }],
)

notebook_task >> spark_jar_task
) as dag:
    new_cluster = {
        'spark_version': '6.4.x-scala2.11',
        'node_type_id': 'r3.xlarge',
        'aws_attributes': {'availability': 'ON_DEMAND'},
        'num_workers': 2,
    }

    notebook_task_params1 = {
        'new_cluster': new_cluster,
        'notebook_task': {
            'notebook_path': '/Users/[email protected]/temp/nb1',
        },
    }
    # Example of using the JSON parameter to initialize the operator.
    notebook_task1 = DatabricksSubmitRunOperator(task_id='notebook_task1', json=notebook_task_params1)

    notebook_task_params2 = {
        'new_cluster': new_cluster,
        'notebook_task': {
            'notebook_path': '/Users/[email protected]/temp/nb2',
        },
    }
    # Example of using the JSON parameter to initialize the operator.
    notebook_task2 = DatabricksSubmitRunOperator(task_id='notebook_task2', json=notebook_task_params2)

    notebook_task_params3 = {
        'new_cluster': new_cluster,
        'notebook_task': {
            'notebook_path': '/Users/[email protected]/temp/nb3',
        },
    }
    # Example of using the JSON parameter to initialize the operator.
    notebook_task3 = DatabricksSubmitRunOperator(task_id='notebook_task3', json=notebook_task_params3)
with DAG(
    'orchestration_good_practices',
    start_date=datetime(2021, 1, 1),
    schedule_interval='@daily',
    catchup=False,
    default_args={
        'owner': 'airflow',
        'email_on_failure': False,
        'retries': 1,
        'retry_delay': timedelta(minutes=1),
    },
) as dag:
    opr_refresh_mat_view = PostgresOperator(
        task_id='refresh_mat_view',
        postgres_conn_id='postgres_default',
        sql='REFRESH MATERIALIZED VIEW example_view;',
    )
    opr_submit_run = DatabricksSubmitRunOperator(
        task_id='submit_run',
        databricks_conn_id='databricks',
        new_cluster=new_cluster,
        notebook_task=notebook_task,
    )
    opr_run_now = DatabricksRunNowOperator(
        task_id='run_now',
        databricks_conn_id='databricks',
        job_id=5,
        notebook_params=notebook_params,
    )

    opr_refresh_mat_view >> opr_submit_run >> opr_run_now
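This DAG (and the databricks_dag at the end of this section) assumes new_cluster, notebook_task, and notebook_params were defined earlier in the file. A minimal sketch of what those definitions could look like; the cluster spec, notebook path, and parameter values below are placeholders, not the originals.

# Placeholder definitions assumed by the DAG snippets; adjust to your workspace.
new_cluster = {
    'spark_version': '7.3.x-scala2.12',  # assumed Databricks runtime
    'node_type_id': 'i3.xlarge',         # assumed instance type
    'num_workers': 2,
}
notebook_task = {
    'notebook_path': '/Users/[email protected]/quickstart_notebook',  # assumed path
}
notebook_params = {
    'Variable': 5,  # assumed notebook widget parameter
}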
update_repo = DatabricksReposUpdateOperator(task_id='update_repo', repo_path=repo_path, branch="releases")
# [END howto_operator_databricks_repo_update]

notebook_task_params = {
    'new_cluster': {
        'spark_version': '9.1.x-scala2.12',
        'node_type_id': 'r3.xlarge',
        'aws_attributes': {
            'availability': 'ON_DEMAND'
        },
        'num_workers': 8,
    },
    'notebook_task': {
        'notebook_path': f'{repo_path}/PrepareData',
    },
}

notebook_task = DatabricksSubmitRunOperator(task_id='notebook_task', json=notebook_task_params)

# [START howto_operator_databricks_repo_delete]
# Example of deleting a Databricks Repo
repo_path = "/Repos/[email protected]/demo-repo"
delete_repo = DatabricksReposDeleteOperator(task_id='delete_repo', repo_path=repo_path)
# [END howto_operator_databricks_repo_delete]

(create_repo >> update_repo >> notebook_task >> delete_repo)
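The repos snippet above references repo_path and create_repo, which the excerpted file defines before the update step. A minimal sketch of what that earlier section might contain, assuming DatabricksReposCreateOperator from the Databricks provider; the repo path and git URL are placeholders.

# Hypothetical earlier section assumed by the snippet above.
repo_path = "/Repos/[email protected]/demo-repo"  # placeholder repo path
git_url = "https://github.com/example/demo-repo"  # placeholder git URL
create_repo = DatabricksReposCreateOperator(task_id='create_repo', repo_path=repo_path, git_url=git_url)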
"depends_on_past": False, "email_on_failure": False, "email_on_retry": False, "retries": 1, "retry_delay": timedelta(minutes=2), } with DAG( "databricks_dag", start_date=datetime(2021, 1, 1), schedule_interval="@daily", catchup=False, default_args=default_args, ) as dag: opr_submit_run = DatabricksSubmitRunOperator( task_id="submit_run", databricks_conn_id="databricks", new_cluster=new_cluster, notebook_task=notebook_task, ) opr_run_now = DatabricksRunNowOperator( task_id="run_now", databricks_conn_id="databricks", job_id=5, notebook_params=notebook_params, ) opr_submit_run >> opr_run_now