def test_returns_job(env):
    with DAG(dag_id='schedule_dataflow_test',
             start_date=datetime.now(),
             schedule_interval=None) as dag:
        task = ScheduleDataflowJobOperator(
            project=env['project'],
            template_name='load_vibe_to_lake',
            job_name='schedule-dataflow-test-{}'.format(int(time.time())),
            job_parameters={
                'client': 'bluesun',
                'table': 'pyr_bluesun_local.tree_user_types',
                'dest': '{}:lake.tree_user_types'.format(env['project'])
            },
            dag=dag,
            task_id='test_task')
    ti = TaskInstance(task=task, execution_date=datetime.now())
    job = task.execute(ti.get_template_context())
    assert job['projectId'] == env['project']
def execute_tasks_in_dag(dag, tasks, run_id, execution_date):
    assert isinstance(dag, DAG)

    dag_run = dag.create_dagrun(run_id=run_id,
                                state='success',
                                execution_date=execution_date)

    results = {}
    for task in tasks:
        ti = TaskInstance(task=task, execution_date=execution_date)
        context = ti.get_template_context()
        context['dag_run'] = dag_run
        try:
            results[ti] = task.execute(context)
        except AirflowSkipException as exc:
            results[ti] = exc
    return results
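# Usage sketch for execute_tasks_in_dag above (a minimal, hypothetical example:
# the DAG id, task ids, and run_id are made up, and DummyOperator is used only
# as a stand-in; the import path shown is the Airflow 1.x one).
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator


def example_execute_tasks_in_dag():
    execution_date = datetime(2021, 1, 1)
    dag = DAG(dag_id='example_dag', start_date=execution_date, schedule_interval=None)
    first = DummyOperator(task_id='first', dag=dag)
    second = DummyOperator(task_id='second', dag=dag)
    first >> second

    # Tasks are passed in dependency order; the result maps each TaskInstance
    # to its execute() return value (or the AirflowSkipException it raised).
    return execute_tasks_in_dag(dag, [first, second],
                                run_id='manual_example_run',
                                execution_date=execution_date)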
def test_render_log_filename(self):
    try_number = 1
    dag_id = 'test_render_log_filename_dag'
    task_id = 'test_render_log_filename_task'
    execution_date = datetime(2016, 1, 1)

    dag = DAG(dag_id, start_date=execution_date)
    task = DummyOperator(task_id=task_id, dag=dag)
    ti = TaskInstance(task=task, execution_date=execution_date)

    filename_template = "{{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log"

    ts = ti.get_template_context()['ts']
    expected_filename = f"{dag_id}/{task_id}/{ts}/{try_number}.log"

    rendered_filename = helpers.render_log_filename(ti, try_number, filename_template)

    assert rendered_filename == expected_filename
def _solid(context):  # pylint: disable=unused-argument
    if AIRFLOW_EXECUTION_DATE_STR not in context.pipeline_run.tags:
        raise DagsterInvariantViolationError(
            'Could not find "{AIRFLOW_EXECUTION_DATE_STR}" in {target} tags "{tags}". Please '
            'add "{AIRFLOW_EXECUTION_DATE_STR}" to {target} tags before executing'.format(
                target="job" if context.pipeline_def.is_graph_job_op_target else "pipeline",
                AIRFLOW_EXECUTION_DATE_STR=AIRFLOW_EXECUTION_DATE_STR,
                tags=context.pipeline_run.tags,
            ))

    execution_date_str = context.pipeline_run.tags.get(AIRFLOW_EXECUTION_DATE_STR)

    check.str_param(execution_date_str, "execution_date_str")
    try:
        execution_date = dateutil.parser.parse(execution_date_str)
    except ValueError:
        raise DagsterInvariantViolationError(
            'Could not parse execution_date "{execution_date_str}". Please use datetime format '
            "compatible with dateutil.parser.parse.".format(
                execution_date_str=execution_date_str,
            ))
    except OverflowError:
        raise DagsterInvariantViolationError(
            'Date "{execution_date_str}" exceeds the largest valid C integer on the system.'.format(
                execution_date_str=execution_date_str,
            ))

    check.inst_param(execution_date, "execution_date", datetime.datetime)

    with replace_airflow_logger_handlers():
        task_instance = TaskInstance(task=task, execution_date=execution_date)

        ti_context = (
            dagster_get_template_context(task_instance, task, execution_date)
            if not use_airflow_template_context
            else task_instance.get_template_context())

        task.render_template_fields(ti_context)

        task.execute(ti_context)

        return None
def test_execute_delete_previous_entry(self, test_input, expected, dag):
    data = list(map(lambda x: (json.dumps(x), self._start_date), test_input))
    self._pg_hook.insert_rows('covid19', data, target_fields=['data', 'day'])
    self._pg_hook.insert_rows("covid19_stats",
                              [(self._start_date.date(), "Portugal")],
                              target_fields=['day', 'country'])

    task = Covid19ToAnalytics(
        dag=dag,
        task_id="test_task",
        connection_id='dbt_postgres_instance_raw_data_test')
    ti = TaskInstance(task=task, execution_date=self._start_date)

    task.execute(ti.get_template_context())

    data = self._pg_hook.get_records(self._sql_select)

    assert len(data) == expected[0]
    assert data == expected[1]
def test_poke_context(self, mock_session_send):
    response = requests.Response()
    response.status_code = 200
    mock_session_send.return_value = response

    def resp_check(_, execution_date):
        if execution_date == DEFAULT_DATE:
            return True
        raise AirflowException('AirflowException raised here!')

    task = HttpSensor(task_id='http_sensor_poke_exception',
                      http_conn_id='http_default',
                      endpoint='',
                      request_params={},
                      response_check=resp_check,
                      timeout=5,
                      poke_interval=1,
                      dag=self.dag)

    task_instance = TaskInstance(task=task, execution_date=DEFAULT_DATE)
    task.execute(task_instance.get_template_context())
def test_render_log_filename(self):
    try_number = 1
    dag_id = 'test_render_log_filename_dag'
    task_id = 'test_render_log_filename_task'
    execution_date = datetime(2016, 1, 1)

    dag = DAG(dag_id, start_date=execution_date)
    task = DummyOperator(task_id=task_id, dag=dag)
    ti = TaskInstance(task=task, execution_date=execution_date)

    filename_template = "{{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log"

    ts = ti.get_template_context()['ts']
    expected_filename = "{dag_id}/{task_id}/{ts}/{try_number}.log".format(
        dag_id=dag_id, task_id=task_id, ts=ts, try_number=try_number)

    rendered_filename = helpers.render_log_filename(ti, try_number, filename_template)

    self.assertEqual(rendered_filename, expected_filename)
def test_capitalize_letters(test_dag, caplog):
    """
    Tests the CapitalizeLetters task.

    To test operators, we need three pieces (with examples used here):
      1. The dag (test_dag fixture)
      2. The task (instance of CapitalizeLetters)
      3. The task instance (ti)

    We execute the task instance and any returned values get stored in
    `result` for testing.
    """
    task = CapitalizeLetters(
        task_id="capitalize",
        letters="hey everyone",
        dag=test_dag,
    )
    ti = TaskInstance(task=task, execution_date=datetime.now())
    result = task.execute(ti.get_template_context())
    assert result == "HEY EVERYONE"
def test_my_operator():
    """
    To run this test you need to have at least configured airflow in local
    mode and run: `airflow initdb`
    """
    airflow = pytest.importorskip("airflow")
    from airflow import DAG
    from airflow.models import TaskInstance

    dag = DAG(dag_id="foo", start_date=datetime.now())
    task = DominoOperator(
        dag=dag,
        task_id="foo",
        project=TEST_PROJECT,
        isDirect=True,
        command=["python -V"],
    )
    ti = TaskInstance(task=task, execution_date=datetime.now())
    task.execute(ti.get_template_context())
def _run_airflow_op(Op, *op_args, **op_kwargs):
    from airflow.utils import db
    db.initdb()

    from datetime import datetime
    from airflow import DAG, settings
    from airflow.models import TaskInstance, Variable, XCom

    dag = DAG(dag_id='anydag', start_date=datetime.now())
    task = Op(*op_args, **op_kwargs, dag=dag, task_id='anytask')
    ti = TaskInstance(task=task, execution_date=datetime.now())
    result = task.execute(ti.get_template_context())

    variables = {var.id: var.val for var in settings.Session().query(Variable).all()}
    xcoms = {msg.key: msg.value for msg in settings.Session().query(XCom).all()}

    return (result, variables, xcoms)
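# Usage sketch for _run_airflow_op above (hypothetical call; BashOperator is just
# an example operator class, using the Airflow 1.x import path). The helper runs
# the operator once in a throwaway DAG and returns its execute() result together
# with the Variable and XCom state left behind.
from airflow.operators.bash_operator import BashOperator

result, variables, xcoms = _run_airflow_op(BashOperator, bash_command='echo hello')
# 'result' is whatever BashOperator.execute() returns (None unless xcom_push is set),
# 'variables' maps Variable ids to values, 'xcoms' maps XCom keys to values.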
def test_tasks(task_to_run, capfd):
    "Tests that dbt tasks in scope operate as expected"
    expected_result_dict = {
        "t1": [
            "\n\tA connection with `conn_id`=my_gcp_connection is newly created\n\n",
            "\n\tA connection with `conn_id`=my_gcp_connection already exists\n\n",
        ],
        "t2": [
            "\n\tA connection with `conn_id`=gcr_docker_connection is newly created\n\n",
            "\n\tA connection with `conn_id`=gcr_docker_connection already exists\n\n",
        ],
    }
    try:
        task = getattr(test_dag, task_to_run)  # dynamically look up the task attribute on test_dag
        ti = TaskInstance(task=task, execution_date=datetime.now())
        result = task.execute(ti.get_template_context())
        out, err = capfd.readouterr()
        assert out == expected_result_dict.get(task_to_run)[0]  # "newly created" message expected
    except AssertionError:
        assert out == expected_result_dict.get(task_to_run)[1]  # otherwise the connection already exists
def test_run_airflow_dag(scaffold_dag):
    '''This test runs the sample Airflow dag using the TaskInstance API, directly from Python'''
    _n, _p, _d, static_path, editable_path = scaffold_dag
    execution_date = datetime.datetime.utcnow()

    import_module_from_path('demo_pipeline_static__scaffold', static_path)
    demo_pipeline = import_module_from_path('demo_pipeline', editable_path)

    _dag, tasks = demo_pipeline.make_dag(
        dag_id=demo_pipeline.DAG_ID,
        dag_description=demo_pipeline.DAG_DESCRIPTION,
        dag_kwargs=dict(default_args=demo_pipeline.DEFAULT_ARGS, **demo_pipeline.DAG_KWARGS),
        s3_conn_id=demo_pipeline.S3_CONN_ID,
        modified_docker_operator_kwargs=demo_pipeline.MODIFIED_DOCKER_OPERATOR_KWARGS,
        host_tmp_dir=demo_pipeline.HOST_TMP_DIR,
    )

    # These are in topo order already
    for task in tasks:
        ti = TaskInstance(task=task, execution_date=execution_date)
        context = ti.get_template_context()
        task.execute(context)
def test_parse_bucket_key_from_jinja(self, mock_hook):
    mock_hook.return_value.check_for_key.return_value = False

    Variable.set("test_bucket_key", "s3://bucket/key")

    execution_date = datetime(2020, 1, 1)

    dag = DAG("test_s3_key", start_date=execution_date)
    op = S3KeySensor(
        task_id='s3_key_sensor',
        bucket_key='{{ var.value.test_bucket_key }}',
        bucket_name=None,
        dag=dag,
    )

    ti = TaskInstance(task=op, execution_date=execution_date)
    context = ti.get_template_context()
    ti.render_templates(context)
    op.poke(None)

    self.assertEqual(op.bucket_key, "key")
    self.assertEqual(op.bucket_name, "bucket")
def test_execute_data_operator_csv_read_and_plasma_write(self):
    # given
    plasma_connector = PlasmaConnector(socket_name)

    dag = DAG(dag_id='test', start_date=datetime.now())
    input_csv_unit = DataInputFileUnit('data/X.csv', sep=';')
    output_plasma_unit = DataOutputPlasmaUnit(plasma_connector, object_id)
    task = DataOperator(operation_function=drop_na_dataframe,
                        params={'columns': ['ANNEEREALISATIONDIAGNOSTIC']},
                        input_unit=input_csv_unit,
                        output_unit=output_plasma_unit,
                        dag=dag,
                        task_id='data_operator_csv_to_parquet')

    task_instance = TaskInstance(task=task, execution_date=datetime.now())

    # when
    task.execute(task_instance.get_template_context())

    # then
    other_plasma_connector = PlasmaConnector(socket_name)
    df_transformed = other_plasma_connector.get_dataframe(object_id)
    self.assertEqual((10245, 27), df_transformed.shape)
def test_set_checkpoint_current_checkpoint_prefetch_has_data_true(env, bigquery_helper, seed):
    dag_id = 'set_checkpoint_test'
    table = 'lake.tree_users'
    seeds = [('system', [('checkpoint', [{
        'dag_id': dag_id,
        'table': table,
        'checkpoint': '1970-05-03 11:23:00+00:00'
    }])])]
    seed(seeds)

    task_id = 'set_checkpoint_no_current_record'
    with DAG(dag_id=dag_id, start_date=datetime.now()) as dag:
        task = SetCheckpointOperator(env=env['env'],
                                     table=table,
                                     dag=dag,
                                     task_id=task_id)
        ti = TaskInstance(task=task, execution_date=datetime.now())
        ti.xcom_push(key=table,
                     value={
                         'first_ingestion_timestamp': '1970-01-01 00:00:00+00:00',
                         'last_ingestion_timestamp': '2020-05-03 11:23:00+00:00',
                         'has_data': True
                     })
        task.execute(ti.get_template_context())

    rs = bigquery_helper.query(
        f"SELECT * FROM {env['project']}.system.checkpoint WHERE table = '{table}'")
    assert str(rs[0]['checkpoint']) == '2020-05-03 11:23:00+00:00'
def execute_tasks_in_dag(dag, tasks, run_id, execution_date):
    assert isinstance(dag, DAG)

    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(logging.Formatter(LOG_FORMAT))
    root = logging.getLogger("airflow.task.operators")
    root.setLevel(logging.DEBUG)
    root.addHandler(handler)

    dag_run = dag.create_dagrun(run_id=run_id,
                                state="success",
                                execution_date=execution_date)

    results = {}
    for task in tasks:
        ti = TaskInstance(task=task, execution_date=execution_date)
        context = ti.get_template_context()
        context["dag_run"] = dag_run

        try:
            results[ti] = task.execute(context)
        except AirflowSkipException as exc:
            results[ti] = exc

    return results
def test_execute(dag):
    task = SomethingToSomethingOperator(dag=dag, task_id="test_task")
    ti = TaskInstance(task=task, execution_date=task.start_date)
    ret = task.execute(ti.get_template_context())
    assert ret == "p_a_s_s"
def test_execute(self):
    dag = DAG(dag_id='any_dag', start_date=datetime.now())
    task = CubeOperator(my_input_val=3, dag=dag, task_id='any_task')
    task_inst = TaskInstance(task=task, execution_date=datetime.now())
    result = task.execute(task_inst.get_template_context())
    self.assertEqual(result, 27)
def get_link(self, operator, dttm):
    ti = TaskInstance(task=operator, execution_date=dttm)
    operator.render_template_fields(ti.get_template_context())
    query = {"dag_id": operator.external_dag_id, "execution_date": dttm.isoformat()}
    return build_airflow_url_with_query(query)
def test_infer_predictions():
    ti = TaskInstance(task=infer_predictions, execution_date=datetime.now())
    result = infer_predictions.execute(ti.get_template_context())
    assert result == "succeeded"
def test_skip_task(setup_output_path):
    out_dir = get_out_dir(cfg_name=cfg_name)
    dag = DAG('test_dyn',
              default_args=default_args,
              schedule_interval=timedelta(days=1))
    Variable.set('create_file' + '_hash', 0)

    def callable1_create_file(log, in_files, out_files, **op_kwargs):
        with open(out_files['test_file_dyn_location'].path, 'w') as file:
            file.write("testing dynamic paths")
            return 'succeeded'
        return 'failed'

    # Creating the output file manually
    with open(osp.join(out_dir, 'test_data.txt'), 'w') as file:
        file.write("test_data file content")

    t1_output_files = {
        'test_file_dyn_location':
        ResourcePathDynamic(path=[('var', cfg_name + 'out_dir'),
                                  ('const', 'test_data.txt')])
    }

    create_file_forced = PythonPersistentOperator(
        task_id='create_file',
        force_execution=True,
        python_callable=callable1_create_file,
        output_files=t1_output_files,
        dag=dag,
        cfg_name=cfg_name)
    ti1 = TaskInstance(task=create_file_forced, execution_date=datetime.now())
    result1 = create_file_forced.execute(ti1.get_template_context())
    assert result1 == 'succeeded'

    create_file_not_forced = PythonPersistentOperator(
        task_id='create_file',
        force_execution=False,
        python_callable=callable1_create_file,
        output_files=t1_output_files,
        dag=dag,
        cfg_name=cfg_name)
    ti1 = TaskInstance(task=create_file_not_forced, execution_date=datetime.now())
    result1 = create_file_not_forced.execute(ti1.get_template_context())
    assert result1 == 'skipped'

    # Should run: new params
    some_task_params = {
        'start': (parser.gettimestamp(cfg_name, 'train_start'), HASH_IT),
        'end': (parser.gettimestamp(cfg_name, 'train_end'), NO_HASH),
        'interval_width': (parser.gettimedelta(cfg_name, 'train_interval_width'), HASH_IT),
        'interval_overlap': (parser.gettimedelta(cfg_name, 'train_interval_overlap'), HASH_IT)
    }
    create_file_not_forced = PythonPersistentOperator(
        task_id='create_file',
        force_execution=False,
        python_callable=callable1_create_file,
        ppo_kwargs=some_task_params,
        output_files=t1_output_files,
        dag=dag,
        cfg_name=cfg_name)
    ti1 = TaskInstance(task=create_file_not_forced, execution_date=datetime.now())
    result1 = create_file_not_forced.execute(ti1.get_template_context())
    assert result1 == 'succeeded'

    # should skip: no hashed params changed
    some_other_task_params = {
        'start': (parser.gettimestamp(cfg_name, 'train_start'), HASH_IT),
        'end': (parser.gettimestamp(cfg_name, 'train_start'), NO_HASH),
        'interval_width': (parser.gettimedelta(cfg_name, 'train_interval_width'), HASH_IT),
        'interval_overlap': (parser.gettimedelta(cfg_name, 'train_interval_overlap'), HASH_IT)
    }
    create_file_not_forced = PythonPersistentOperator(
        task_id='create_file',
        force_execution=False,
        python_callable=callable1_create_file,
        ppo_kwargs=some_other_task_params,
        output_files=t1_output_files,
        dag=dag,
        cfg_name=cfg_name)
    ti1 = TaskInstance(task=create_file_not_forced, execution_date=datetime.now())
    result1 = create_file_not_forced.execute(ti1.get_template_context())
    assert result1 == "skipped"

    # should run: task params changed
    some_other_task_params = {
        'start': (parser.gettimestamp(cfg_name, 'train_end'), HASH_IT),
        'end': (parser.gettimestamp(cfg_name, 'train_end'), NO_HASH),
        'interval_width': (parser.gettimedelta(cfg_name, 'train_interval_width'), HASH_IT),
        'interval_overlap': (parser.gettimedelta(cfg_name, 'train_interval_overlap'), HASH_IT)
    }
    create_file_not_forced = PythonPersistentOperator(
        task_id='create_file',
        force_execution=False,
        python_callable=callable1_create_file,
        ppo_kwargs=some_other_task_params,
        output_files=t1_output_files,
        dag=dag,
        cfg_name=cfg_name)
    ti1 = TaskInstance(task=create_file_not_forced, execution_date=datetime.now())
    result1 = create_file_not_forced.execute(ti1.get_template_context())
    assert result1 == 'succeeded'
def test_resource_dynamic_path(setup_output_path):
    dag = DAG('test_dyn',
              default_args=default_args,
              schedule_interval=timedelta(days=1))

    def callable1_create_file(tp, in_files, out_files, *op_args, **op_kwargs):
        pathlib.Path(osp.dirname(out_files['test_file_dyn_location'].path)).mkdir(
            parents=True, exist_ok=True)
        with open(out_files['test_file_dyn_location'].path, 'w') as file:
            file.write("testing dynamic paths")
            return 'succeeded'
        return 'failed'

    def callable2_read_file(tp, in_files, out_files, *op_args, **op_kwargs):
        with open(in_files['test_file_dyn_location'].path, 'r') as file:
            assert file.readline() == 'testing dynamic paths', 'Invalid file content!'
            return 'succeeded'
        return 'failed'

    t1_output_files = {
        'test_file_dyn_location':
        ResourcePathDynamic(path=[('var', cfg_name + 'out_dir'),
                                  ('var', cfg_name + 'create_file_hash'),
                                  ('const', 'training'),
                                  ('const', 'test_data.txt')])
    }
    create_file = PythonPersistentOperator(
        task_id='create_file',
        force_execution=True,
        python_callable=callable1_create_file,
        output_files=t1_output_files,
        dag=dag,
        cfg_name=cfg_name)

    t2_input_files = {
        'test_file_dyn_location':
        ResourcePathDynamic(path=[('var', cfg_name + 'out_dir'),
                                  ('var', cfg_name + 'create_file_hash'),
                                  ('const', 'training'),
                                  ('const', 'test_data.txt')])
    }
    read_file = PythonPersistentOperator(task_id='read_file',
                                         force_execution=True,
                                         python_callable=callable2_read_file,
                                         input_files=t2_input_files,
                                         dag=dag,
                                         cfg_name=cfg_name)

    ti1 = TaskInstance(task=create_file, execution_date=datetime.now())
    ti2 = TaskInstance(task=read_file, execution_date=datetime.now())
    result1 = create_file.execute(ti1.get_template_context())
    result2 = read_file.execute(ti2.get_template_context())

    assert result1 == 'succeeded'
    assert result2 == 'succeeded'
def test_create_inference_dataset_task():
    ti = TaskInstance(task=create_inference_dataset, execution_date=datetime.now())
    result = create_inference_dataset.execute(ti.get_template_context())
    assert result == "succeeded"
def _airflow(op_class, kwargs_str=''):
    result_output_name = 'Result'
    variables_dict_output_name = 'Variables'
    xcoms_dict_output_name = 'XComs'
    variables_to_output = None
    xcoms_to_output = None
    variables_output_names = variables_to_output or []
    xcoms_output_names = xcoms_to_output or []

    from airflow.utils import db
    db.initdb()

    import json
    kwargs = json.loads(kwargs_str)

    from datetime import datetime
    from airflow import DAG, settings
    from airflow.models import TaskInstance, Variable, XCom
    import logging
    import importlib
    import sys

    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    execution_date = datetime.now()
    dag = DAG(dag_id='anydag', start_date=execution_date)

    Op = getattr(importlib.import_module("airflow.operators"), op_class)
    if 'python_callable' in kwargs:
        exec(kwargs['python_callable'])
        kwargs['python_callable'] = python_callable

    task = Op(dag=dag, task_id='anytask', **kwargs)
    ti = TaskInstance(task=task, execution_date=execution_date)
    result = task.execute(ti.get_template_context())

    variables = {var.id: var.val for var in settings.Session().query(Variable).all()}
    xcoms = {msg.key: msg.value for msg in settings.Session().query(XCom).all()}

    output_values = {}
    if result_output_name is not None:
        output_values[result_output_name] = str(result)
    if variables_dict_output_name is not None:
        output_values[variables_dict_output_name] = json.dumps(variables)
    if xcoms_dict_output_name is not None:
        output_values[xcoms_dict_output_name] = json.dumps(xcoms)
    for name in variables_output_names:
        output_values[name] = variables[name]
    for name in xcoms_output_names:
        output_values[name] = xcoms[name]

    logging.info('Output: %s' % output_values)

    return output_values
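# Usage sketch for _airflow above (hypothetical values). The operator class is
# looked up by name on the airflow.operators module, and its constructor kwargs
# are supplied as a JSON string; the helper returns a dict of stringified outputs.
import json

outputs = _airflow('BashOperator', kwargs_str=json.dumps({'bash_command': 'echo hello'}))
# outputs['Result']    -> str() of the operator's execute() return value
# outputs['Variables'] -> JSON dump of all Airflow Variables after the run
# outputs['XComs']     -> JSON dump of all XComs after the run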
def test_train_graph_model_task():
    ti = TaskInstance(task=train_graph_model, execution_date=datetime.now())
    result = train_graph_model.execute(ti.get_template_context())
    assert result == "succeeded"
def test_create_interval_metrics():
    ti = TaskInstance(task=create_interval_metrics, execution_date=datetime.now())
    result = create_interval_metrics.execute(ti.get_template_context())
    print("---")
    assert result == "succeeded"
def test_execute(self):
    dag = DAG(dag_id='anydag', start_date=datetime.now())
    task = MultiplyBy5Operator(my_operator_param=10, dag=dag, task_id='anytask')
    ti = TaskInstance(task=task, execution_date=datetime.now())
    result = task.execute(ti.get_template_context())
    self.assertEqual(result, 50)
def test_infer_graph_model():
    ti = TaskInstance(task=create_graph_model_node_embeddings,
                      execution_date=datetime.now())
    result = create_graph_model_node_embeddings.execute(ti.get_template_context())
    assert result == "succeeded"