def setUp(self):
    dummy_dag = models.DAG(
        dag_id='my_component', start_date=datetime.datetime(2019, 1, 1))
    self.checkcache_op = dummy_operator.DummyOperator(
        task_id='my_component.checkcache', dag=dummy_dag)
    self.tfx_python_op = dummy_operator.DummyOperator(
        task_id='my_component.pythonexec', dag=dummy_dag)
    self.tfx_docker_op = dummy_operator.DummyOperator(
        task_id='my_component.dockerexec', dag=dummy_dag)
    self.publishcache_op = dummy_operator.DummyOperator(
        task_id='my_component.publishcache', dag=dummy_dag)
    self.publishexec_op = dummy_operator.DummyOperator(
        task_id='my_component.publishexec', dag=dummy_dag)
    self.parent_dag = airflow_pipeline.AirflowPipeline(
        pipeline_name='pipeline_name',
        start_date=datetime.datetime(2018, 1, 1),
        schedule_interval=None,
        pipeline_root='pipeline_root',
        metadata_db_root='metadata_db_root',
        metadata_connection_config=None,
        additional_pipeline_args=None,
        docker_operator_cfg=None,
        enable_cache=True,
        log_root='log_root')
    self.input_dict = {'i': [TfxType('i')]}
    self.output_dict = {'o': [TfxType('o')]}
    self.exec_properties = {'e': 'e'}
    self.driver_options = {'d': 'd'}
def setUp(self):
    self._temp_dir = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                    self.get_temp_dir())
    dummy_dag = models.DAG(
        dag_id='my_component', start_date=datetime.datetime(2019, 1, 1))
    self.checkcache_op = dummy_operator.DummyOperator(
        task_id='my_component.checkcache', dag=dummy_dag)
    self.tfx_python_op = dummy_operator.DummyOperator(
        task_id='my_component.pythonexec', dag=dummy_dag)
    self.noop_sink_op = dummy_operator.DummyOperator(
        task_id='my_component.noop_sink', dag=dummy_dag)
    self.publishexec_op = dummy_operator.DummyOperator(
        task_id='my_component.publishexec', dag=dummy_dag)
    self._logger_config = logging_utils.LoggerConfig()
    self.parent_dag = airflow_pipeline.AirflowPipeline(
        pipeline_name='pipeline_name',
        start_date=datetime.datetime(2018, 1, 1),
        schedule_interval=None,
        pipeline_root='pipeline_root',
        metadata_db_root=self._temp_dir,
        metadata_connection_config=None,
        additional_pipeline_args=None,
        enable_cache=True)
    self.input_dict = {'i': [TfxArtifact('i')]}
    self.output_dict = {'o': [TfxArtifact('o')]}
    self.exec_properties = {'e': 'e'}
    self.driver_options = {'d': 'd'}
def __init__(self, component_name, task_id, parent_dag, input_dict,
             output_dict, exec_properties, driver_options, driver_class,
             executor_class, additional_pipeline_args,
             metadata_connection_config, logger_config):
    super(_TfxWorker, self).__init__(
        dag_id=task_id,
        schedule_interval=None,
        start_date=parent_dag.start_date,
        user_defined_filters={'b64encode': base64.b64encode})
    adaptor = airflow_adapter.AirflowAdapter(
        component_name=component_name,
        input_dict=input_dict,
        output_dict=output_dict,
        exec_properties=exec_properties,
        driver_options=driver_options,
        driver_class=driver_class,
        executor_class=executor_class,
        additional_pipeline_args=additional_pipeline_args,
        metadata_connection_config=metadata_connection_config,
        logger_config=logger_config)
    # Before the executor runs, check if the artifact already exists.
    checkcache_op = python_operator.BranchPythonOperator(
        task_id=task_id + '.checkcache',
        provide_context=True,
        python_callable=adaptor.check_cache_and_maybe_prepare_execution,
        op_kwargs={
            'uncached_branch': task_id + '.exec',
            'cached_branch': task_id + '.noop_sink',
        },
        dag=self)
    tfx_op = python_operator.PythonOperator(
        task_id=task_id + '.exec',
        provide_context=True,
        python_callable=adaptor.python_exec,
        op_kwargs={
            'cache_task_name': task_id + '.checkcache',
        },
        dag=self)
    noop_sink_op = dummy_operator.DummyOperator(
        task_id=task_id + '.noop_sink', dag=self)
    publishexec_op = python_operator.PythonOperator(
        task_id=task_id + '.publishexec',
        provide_context=True,
        python_callable=adaptor.publish_exec,
        op_kwargs={
            'cache_task_name': task_id + '.checkcache',
            'exec_task_name': task_id + '.exec',
        },
        dag=self)
    tfx_op.set_upstream(checkcache_op)
    publishexec_op.set_upstream(tfx_op)
    noop_sink_op.set_upstream(checkcache_op)
def test_dag_has_correct_tasks(self, unused_gcs_mock, unused_bq_mock,
                               mock_configuration):
    """Tests that the module's DAG contains the expected tasks."""
    mock_configuration.get.return_value = 'test_path'
    # Create dummy tasks for the expected DAG.
    expected_dag = models.DAG(
        dag_id='expected_dag',
        schedule_interval='0 12 * * *',
        start_date=datetime.datetime(2018, 1, 8))
    expected_task_ids = [
        'bq-to-tfrecord', 'make-predictions', 'gcs-to-bigquery',
        'gcs-delete-blob'
    ]
    for task_id in expected_task_ids:
        dummy_operator.DummyOperator(task_id=task_id, dag=expected_dag)
    actual_dag = dag_module.create_dag(self.test_env_variables)
    self.assertEqual(actual_dag.task_count, expected_dag.task_count)
    self.assertListEqual(
        sorted(actual_dag.task_ids), sorted(expected_task_ids))
except IOError as e:
    logger.error('Error opening table_list_file %s: %s', table_list_file, e)


# --------------------------------------------------------------------------------
# Main DAG
# --------------------------------------------------------------------------------

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG('composer_sample_bq_copy_across_locations',
                default_args=default_args,
                schedule_interval=None) as dag:

    start = dummy_operator.DummyOperator(
        task_id='start', trigger_rule='all_success')

    end = dummy_operator.DummyOperator(
        task_id='end', trigger_rule='all_success')

    # Get the table list from the master file.
    all_records = read_table_list(table_list_file_path)

    # Loop over each record in the 'all_records' Python list to build up
    # Airflow tasks.
    for record in all_records:
        logger.info('Generating tasks to transfer table: {}'.format(record))

        table_source = record['table_source']
        table_dest = record['table_dest']
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""An example DAG demonstrating use of variables and how to test it."""

import datetime

from airflow import models
from airflow.operators import bash_operator
from airflow.operators import dummy_operator

yesterday = datetime.datetime.now() - datetime.timedelta(days=1)

default_dag_args = {
    'start_date': yesterday,
}

with models.DAG('composer_sample_cycle',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:
    start = dummy_operator.DummyOperator(task_id='start')
    end = dummy_operator.DummyOperator(task_id='end')

    variable_example = bash_operator.BashOperator(
        task_id='variable_example',
        bash_command='echo project_id=' + models.Variable.get('gcp_project'))
import airflow
import datetime

from airflow import DAG
from airflow.operators import bash_operator, dummy_operator

default_args = {
    'owner': 'Nitin Ware',
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'start_date': airflow.utils.dates.days_ago(1),
}

dag = DAG(
    'bash_dag',
    catchup=False,
    default_args=default_args,
    schedule_interval="@once",
)

start_dag = dummy_operator.DummyOperator(
    task_id='start',
    dag=dag,
)

bash_dag = bash_operator.BashOperator(
    task_id='bash_command',
    bash_command='echo Hello Bash.',
    dag=dag)

start_dag >> bash_dag
remove_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    project_id=PROJECT,
    task_id="delete_cluster",
    cluster_name='vf-polimi-demo',
    region='europe-west1')


def check_batch_kpi_scheduled_cluster_running(**kwargs):
    ti = kwargs['ti']
    xcom_value = ti.xcom_pull(task_ids='batch_kpi_scheduled_cluster')
    if xcom_value == "vf-polimi-demo":
        return 'delete_cluster'
    else:
        return 'end'


branch_batch_kpi_scheduled_active_cluster = BranchPythonOperator(
    task_id='check_batch_kpi_scheduled_cluster',
    provide_context=True,
    python_callable=check_batch_kpi_scheduled_cluster_running)

batch_kpi_scheduled_cluster_running = bash_operator.BashOperator(
    task_id='batch_kpi_scheduled_cluster',
    bash_command="gcloud dataproc clusters list --region europe-west1 "
                 "| grep 'vf-polimi-demo' | awk '{print $1; exit}'",
    xcom_push=True,
    trigger_rule="all_done")

end_pipeline = dummy_operator.DummyOperator(task_id='end')

(create_dataproc_cluster >> run_batch_kpi_scheduled >>
 batch_kpi_scheduled_cluster_running >>
 branch_batch_kpi_scheduled_active_cluster >> [remove_cluster, end_pipeline])
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'start_date': TOMORROW,
}

dag = DAG(
    'Airflow_Bigquery',
    default_args=default_args,
    description=('Load and transform data from Google Cloud Storage to '
                 'Google BigQuery with Airflow'),
)

start_operator = dummy_operator.DummyOperator(
    task_id='Begin_execution', dag=dag)

create_dataset = bash_operator.BashOperator(
    task_id='create_airflow_iot_dataset',
    bash_command='bq mk iot',
    dag=dag)

load_csv = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq',
    bucket='bucket1_hazem',
    source_objects=['heartRate-final.csv'],
    destination_project_dataset_table='iot.heartRateTable',
    trigger_rule='all_done',
    skip_leading_rows=1,
    schema_fields=[
        {
            'name': 'sensorID',
            'type': 'STRING',
    print('Greetings from SpikeySales! Happy shopping.')
    return 'Greeting successfully printed.'


def makeBranchChoice():
    """Randomly chooses between the 'hello_spikey' and 'dummy' branches.

    Either one will run, but not both.
    """
    x = random.randint(1, 5)
    if x <= 2:
        return 'hello_spikey'
    else:
        return 'dummy'


run_this_first = dummy_operator.DummyOperator(task_id='run_this_first')

# BranchPythonOperator takes a callable which returns the task id of the
# next task to run.
branching = python_operator.BranchPythonOperator(
    task_id='branching', python_callable=makeBranchChoice)

run_this_first >> branching

spikeysales_greeting = python_operator.PythonOperator(
    task_id='hello_spikey', python_callable=greeting)

dummy_followed_python = dummy_operator.DummyOperator(task_id='follow_python')

dummy = dummy_operator.DummyOperator(task_id='dummy')
def call(self, dag):
    # Instantiate all child operator factories, then fan them in to a single
    # dummy sink task so downstream tasks can depend on just one operator.
    tasks = [fop(dag) for fop in self.fops]
    t = dummy_operator.DummyOperator(task_id=self.id, dag=dag)
    t.set_upstream(tasks)
    return t
def test_build_graph(self): r"""Tests building airflow DAG graph using add_node_to_graph(). The dependency graph beside is as below: component_one / \ / \ component_two component_three \ / \ / component_four """ component_one = dummy_operator.DummyOperator( task_id='one', dag=self.pipeline) component_two = dummy_operator.DummyOperator( task_id='two', dag=self.pipeline) component_three = dummy_operator.DummyOperator( task_id='three', dag=self.pipeline) component_four = dummy_operator.DummyOperator( task_id='four', dag=self.pipeline) component_one_input_a = TfxType('i1a') component_one_input_b = TfxType('i1b') component_one_output_a = TfxType('o1a') component_one_output_b = TfxType('o1b') component_two_output = TfxType('o2') component_three_output = TfxType('o3') component_four_output = TfxType('o4') component_one_input_dict = { 'i1a': [component_one_input_a], 'i1b': [component_one_input_b] } component_one_output_dict = { 'o1a': [component_one_output_a], 'o1b': [component_one_output_b] } component_two_input_dict = { 'i2a': [component_one_output_a], 'i2b': [component_one_output_b] } component_two_output_dict = {'o2': [component_two_output]} component_three_input_dict = { 'i3a': [component_one_output_a], 'i3b': [component_one_output_b] } component_three_output_dict = {'o3': [component_two_output]} component_four_input_dict = { 'i4a': [component_two_output], 'i4b': [component_three_output] } component_four_output_dict = {'o4': [component_four_output]} self.pipeline.add_node_to_graph( component_one, consumes=component_one_input_dict.values(), produces=component_one_output_dict.values()) self.pipeline.add_node_to_graph( component_two, consumes=component_two_input_dict.values(), produces=component_two_output_dict.values()) self.pipeline.add_node_to_graph( component_three, consumes=component_three_input_dict.values(), produces=component_three_output_dict.values()) self.pipeline.add_node_to_graph( component_four, consumes=component_four_input_dict.values(), produces=component_four_output_dict.values()) self.assertItemsEqual(component_one.upstream_list, []) self.assertItemsEqual(component_two.upstream_list, [component_one]) self.assertItemsEqual(component_three.upstream_list, [component_one]) self.assertItemsEqual(component_four.upstream_list, [component_two, component_three])
def convert_to_airflow_op(self):
    return dummy_operator.DummyOperator(
        task_id=self.task_id, trigger_rule=self.trigger_rule)
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""An example DAG demonstrating a cycle in the task IDs."""

import datetime

from airflow import models
from airflow.operators import dummy_operator

# If you are running Airflow in more than one time zone, see
# https://airflow.apache.org/docs/apache-airflow/stable/timezone.html
# for best practices.
yesterday = datetime.datetime.now() - datetime.timedelta(days=1)

default_dag_args = {
    'start_date': yesterday,
}

with models.DAG('composer_sample_cycle',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:
    # Both tasks deliberately reuse the same task ID to demonstrate the error.
    start = dummy_operator.DummyOperator(task_id='oops_a_cycle')
    end = dummy_operator.DummyOperator(task_id='oops_a_cycle')

    start >> end
    'project_id': gcp_project
}

with models.DAG('product_table',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    bq_make_raw_dataset = bash_operator.BashOperator(
        task_id='make_bq_raw_dataset',
        bash_command=('bq --location=asia-southeast1 ls {} || '
                      'bq --location=asia-southeast1 mk {}').format(
                          bq_raw_dataset_name, bq_raw_dataset_name))

    raw_sql_files = read_sql_from_gcs(bq_raw_dataset_name, gcs_bucket)

    bq_start_making_raw_tables = dummy_operator.DummyOperator(
        task_id='start_making_raw_tables')

    bq_end_making_raw_tables = dummy_operator.DummyOperator(
        task_id='end_making_raw_tables')

    for filename in raw_sql_files:
        sql_statement = raw_sql_files[filename].decode()
        table_name = filename.replace('.sql', '')
        table_name = table_name.replace('raw/', '')

        bq_make_raw_tables = bigquery_operator.BigQueryOperator(
            task_id='make_raw_table_{}'.format(table_name),
            sql=sql_statement,
            use_legacy_sql=False,
            location='asia-southeast1')

        bq_start_making_raw_tables >> bq_make_raw_tables
        bq_make_raw_tables >> bq_end_making_raw_tables
with models.DAG(
        'running_python_bash_and_dummy_operator',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    def hello_world():
        print('Hello World!')
        return 1

    def greeting():
        print('Greetings from SpikeySales! Happy shopping.')
        return 'Greeting successfully printed.'

    hello_world_greeting = python_operator.PythonOperator(
        task_id='python_1', python_callable=hello_world)

    spikeysales_greeting = python_operator.PythonOperator(
        task_id='python_2', python_callable=greeting)

    bash_greeting = bash_operator.BashOperator(
        task_id='bye_bash',
        bash_command='echo Goodbye! Hope to see you soon.')

    end = dummy_operator.DummyOperator(task_id='dummy')

    hello_world_greeting >> spikeysales_greeting >> bash_greeting >> end
schedule_interval="@once", ) task_default = bigquery_operator.BigQueryOperator( task_id='task_default_connection', bql='SELECT 1', use_legacy_sql=False, dag=dag) task_explicit = bigquery_operator.BigQueryOperator( task_id='task_explicit_connection', bql='SELECT 1', use_legacy_sql=False, bigquery_conn_id='google_cloud_default', dag=dag) task_custom = bigquery_operator.BigQueryOperator( task_id='task_custom_connection', bql='SELECT 1', use_legacy_sql=False, bigquery_conn_id='my_gcp_connection', dag=dag) start_task = dummy_operator.DummyOperator( task_id='start', default_args=default_args, dag=dag, ) start_task >> [task_default, task_explicit, task_custom]
def get_job_from_xcom(**kwargs):
    job_id = json.loads(
        kwargs['ti'].xcom_pull(task_ids='start_dataprep'))['id']
    return job_id


# --------------------------------------------------------------------------------
# Main DAG
# --------------------------------------------------------------------------------
dag = models.DAG(
    dag_id='demo_etl', default_args=default_args, schedule_interval=None)

start = dummy_operator.DummyOperator(
    task_id='start', trigger_rule='all_success', dag=dag)

tables_deleted = dummy_operator.DummyOperator(
    task_id='tables_deleted', trigger_rule='all_success', dag=dag)

data_collected = dummy_operator.DummyOperator(
    task_id='data_collected', trigger_rule='all_success', dag=dag)

end = dummy_operator.DummyOperator(
    task_id='end', trigger_rule='all_success', dag=dag)

delete_jobs = []