Example #1
 def setUp(self):
     dummy_dag = models.DAG(dag_id='my_component',
                            start_date=datetime.datetime(2019, 1, 1))
     self.checkcache_op = dummy_operator.DummyOperator(
         task_id='my_component.checkcache', dag=dummy_dag)
     self.tfx_python_op = dummy_operator.DummyOperator(
         task_id='my_component.pythonexec', dag=dummy_dag)
     self.tfx_docker_op = dummy_operator.DummyOperator(
         task_id='my_component.dockerexec', dag=dummy_dag)
     self.publishcache_op = dummy_operator.DummyOperator(
         task_id='my_component.publishcache', dag=dummy_dag)
     self.publishexec_op = dummy_operator.DummyOperator(
         task_id='my_component.publishexec', dag=dummy_dag)
     self.parent_dag = airflow_pipeline.AirflowPipeline(
         pipeline_name='pipeline_name',
         start_date=datetime.datetime(2018, 1, 1),
         schedule_interval=None,
         pipeline_root='pipeline_root',
         metadata_db_root='metadata_db_root',
         metadata_connection_config=None,
         additional_pipeline_args=None,
         docker_operator_cfg=None,
         enable_cache=True,
         log_root='log_root')
     self.input_dict = {'i': [TfxType('i')]}
     self.output_dict = {'o': [TfxType('o')]}
     self.exec_properties = {'e': 'e'}
     self.driver_options = {'d': 'd'}
Example #2
 def setUp(self):
     self._temp_dir = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                     self.get_temp_dir())
     dummy_dag = models.DAG(dag_id='my_component',
                            start_date=datetime.datetime(2019, 1, 1))
     self.checkcache_op = dummy_operator.DummyOperator(
         task_id='my_component.checkcache', dag=dummy_dag)
     self.tfx_python_op = dummy_operator.DummyOperator(
         task_id='my_component.pythonexec', dag=dummy_dag)
     self.noop_sink_op = dummy_operator.DummyOperator(
         task_id='my_component.noop_sink', dag=dummy_dag)
     self.publishexec_op = dummy_operator.DummyOperator(
         task_id='my_component.publishexec', dag=dummy_dag)
     self._logger_config = logging_utils.LoggerConfig()
     self.parent_dag = airflow_pipeline.AirflowPipeline(
         pipeline_name='pipeline_name',
         start_date=datetime.datetime(2018, 1, 1),
         schedule_interval=None,
         pipeline_root='pipeline_root',
         metadata_db_root=self._temp_dir,
         metadata_connection_config=None,
         additional_pipeline_args=None,
         enable_cache=True)
     self.input_dict = {'i': [TfxArtifact('i')]}
     self.output_dict = {'o': [TfxArtifact('o')]}
     self.exec_properties = {'e': 'e'}
     self.driver_options = {'d': 'd'}
Example #3
    def __init__(self, component_name, task_id, parent_dag, input_dict,
                 output_dict, exec_properties, driver_options, driver_class,
                 executor_class, additional_pipeline_args,
                 metadata_connection_config, logger_config):
        super(_TfxWorker, self).__init__(
            dag_id=task_id,
            schedule_interval=None,
            start_date=parent_dag.start_date,
            user_defined_filters={'b64encode': base64.b64encode})
        adaptor = airflow_adapter.AirflowAdapter(
            component_name=component_name,
            input_dict=input_dict,
            output_dict=output_dict,
            exec_properties=exec_properties,
            driver_options=driver_options,
            driver_class=driver_class,
            executor_class=executor_class,
            additional_pipeline_args=additional_pipeline_args,
            metadata_connection_config=metadata_connection_config,
            logger_config=logger_config)
        # Before the executor runs, check if the artifact already exists
        checkcache_op = python_operator.BranchPythonOperator(
            task_id=task_id + '.checkcache',
            provide_context=True,
            python_callable=adaptor.check_cache_and_maybe_prepare_execution,
            op_kwargs={
                'uncached_branch': task_id + '.exec',
                'cached_branch': task_id + '.noop_sink',
            },
            dag=self)
        tfx_op = python_operator.PythonOperator(
            task_id=task_id + '.exec',
            provide_context=True,
            python_callable=adaptor.python_exec,
            op_kwargs={
                'cache_task_name': task_id + '.checkcache',
            },
            dag=self)
        noop_sink_op = dummy_operator.DummyOperator(
            task_id=task_id + '.noop_sink', dag=self)
        publishexec_op = python_operator.PythonOperator(
            task_id=task_id + '.publishexec',
            provide_context=True,
            python_callable=adaptor.publish_exec,
            op_kwargs={
                'cache_task_name': task_id + '.checkcache',
                'exec_task_name': task_id + '.exec',
            },
            dag=self)

        tfx_op.set_upstream(checkcache_op)
        publishexec_op.set_upstream(tfx_op)
        noop_sink_op.set_upstream(checkcache_op)
Example #4
  def test_dag_has_correct_tasks(self, unused_gcs_mock, unused_bq_mock,
                                 mock_configuration):
    """Test if module has tasks."""
    mock_configuration.get.return_value = 'test_path'
    # Create dummy tasks for expected DAG
    expected_dag = models.DAG(
        dag_id='expected_dag',
        schedule_interval='0 12 * * *',
        start_date=datetime.datetime(2018, 1, 8))
    expected_task_ids = [
        'bq-to-tfrecord', 'make-predictions', 'gcs-to-bigquery',
        'gcs-delete-blob'
    ]
    for task_id in expected_task_ids:
      dummy_operator.DummyOperator(task_id=task_id, dag=expected_dag)

    actual_dag = dag_module.create_dag(self.test_env_variables)

    self.assertEqual(actual_dag.task_count, expected_dag.task_count)
    self.assertListEqual(sorted(actual_dag.task_ids), sorted(expected_task_ids))
Example #5
    except IOError as e:
        logger.error('Error opening table_list_file %s: %s',
                     table_list_file, e)


# --------------------------------------------------------------------------------
# Main DAG
# --------------------------------------------------------------------------------

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG('composer_sample_bq_copy_across_locations',
                default_args=default_args,
                schedule_interval=None) as dag:
    start = dummy_operator.DummyOperator(task_id='start',
                                         trigger_rule='all_success')

    end = dummy_operator.DummyOperator(task_id='end',
                                       trigger_rule='all_success')

    # Get the table list from master file
    all_records = read_table_list(table_list_file_path)

    # Loop over each record in the 'all_records' python list to build up
    # Airflow tasks
    for record in all_records:
        logger.info('Generating tasks to transfer table: {}'.format(record))

        table_source = record['table_source']
        table_dest = record['table_dest']
Example #6
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""An example DAG demonstrating use of variables and how to test it."""

import datetime

from airflow import models
from airflow.operators import bash_operator
from airflow.operators import dummy_operator

yesterday = datetime.datetime.now() - datetime.timedelta(days=1)

default_dag_args = {
    'start_date': yesterday,
}

with models.DAG('composer_sample_cycle',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:
    start = dummy_operator.DummyOperator(task_id='start')
    end = dummy_operator.DummyOperator(task_id='end')
    variable_example = bash_operator.BashOperator(
        task_id='variable_example',
        bash_command='echo project_id=' + models.Variable.get('gcp_project'))
Example #7
import airflow
import datetime
from airflow import DAG
from airflow.operators import bash_operator, dummy_operator

default_args = {
    'owner': 'Nitin Ware',
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'start_date': airflow.utils.dates.days_ago(1),
}

dag = DAG(
    'bash_dag',
    catchup=False,
    default_args=default_args,
    schedule_interval="@once",
)

start_dag = dummy_operator.DummyOperator(
    task_id='start',
    dag=dag,
)

bash_dag = bash_operator.BashOperator(task_id='bash_command',
                                      bash_command='echo Hello Bash.',
                                      dag=dag)

start_dag >> bash_dag
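
For reference, the >> operator in the last line is shorthand for setting the dependency explicitly; the following one-liner (an equivalent sketch, not part of the original snippet) produces the same ordering:

# Equivalent to "start_dag >> bash_dag": bash_dag runs after start_dag.
start_dag.set_downstream(bash_dag)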
Example #8
    remove_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        project_id=PROJECT,
        task_id="delete_cluster",
        cluster_name='vf-polimi-demo',
        region='europe-west1')

    def check_batch_kpi_scheduled_cluster_running(**kwargs):
        ti = kwargs['ti']
        xcom_value = ti.xcom_pull(task_ids='batch_kpi_scheduled_cluster')
        if xcom_value == "vf-polimi-demo":
            return 'delete_cluster'
        else:
            return 'end'

    branch_batch_kpi_scheduled_active_cluster = BranchPythonOperator(
        task_id='check_batch_kpi_scheduled_cluster',
        provide_context=True,
        python_callable=check_batch_kpi_scheduled_cluster_running)

    batch_kpi_scheduled_cluster_running = bash_operator.BashOperator(
        task_id='batch_kpi_scheduled_cluster',
        bash_command=("gcloud dataproc clusters list --region europe-west1 "
                      "| grep 'vf-polimi-demo' | awk '{print $1; exit}'"),
        xcom_push=True,
        trigger_rule="all_done")

    end_pipeline = dummy_operator.DummyOperator(task_id='end')

    create_dataproc_cluster >> run_batch_kpi_scheduled >> batch_kpi_scheduled_cluster_running >> branch_batch_kpi_scheduled_active_cluster >> [
        remove_cluster, end_pipeline
    ]
Example #9
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'start_date': TOMORROW,
}

dag = DAG(
    'Airflow_Bigquery',
    default_args=default_args,
    description=('Load and transform data from Google Cloud Storage '
                 'to Google BigQuery with Airflow'),
)

start_operator = dummy_operator.DummyOperator(task_id='Begin_execution',
                                              dag=dag)

create_dataset = bash_operator.BashOperator(
    task_id='create_airflow_iot_dataset', bash_command='bq mk iot', dag=dag)

load_csv = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq',
    bucket='bucket1_hazem',
    source_objects=['heartRate-final.csv'],
    destination_project_dataset_table='iot.heartRateTable',
    trigger_rule='all_done',
    skip_leading_rows=1,
    schema_fields=[
        {
            'name': 'sensorID',
            'type': 'STRING',
Example #10
        print('Greetings from SpikeySales! Happy shopping.')
        return 'Greeting successfully printed.'

    def makeBranchChoice():
        """
        Randomly choose between 'hello_spikey' & 'dummy' branches. Either one will run but not both.
        """
        x = random.randint(1, 5)

        if x <= 2:
            return 'hello_spikey'
        else:
            return 'dummy'

    run_this_first = dummy_operator.DummyOperator(task_id='run_this_first')

    # BranchPythonOperator takes in a callable which returns the task id of the next task.
    branching = python_operator.BranchPythonOperator(
        task_id='branching', python_callable=makeBranchChoice)

    run_this_first >> branching

    spikeysales_greeting = python_operator.PythonOperator(
        task_id='hello_spikey', python_callable=greeting)

    dummy_followed_python = dummy_operator.DummyOperator(
        task_id='follow_python')

    dummy = dummy_operator.DummyOperator(task_id='dummy')
Example #11
 def call(self, dag):
     tasks = [fop(dag) for fop in self.fops]
     t = dummy_operator.DummyOperator(task_id=self.id, dag=dag)
     t.set_upstream(tasks)
     return t
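
A rough, self-contained sketch of how a fan-in join like the one built above is typically used; all names here (fan_in_example, task_0..task_2, join) are hypothetical:

# Hypothetical illustration: several upstream tasks converge on a single
# DummyOperator acting as a join/fan-in point, mirroring call() above.
import datetime

from airflow import models
from airflow.operators import dummy_operator

dag = models.DAG(dag_id='fan_in_example',
                 start_date=datetime.datetime(2019, 1, 1),
                 schedule_interval=None)
upstream_tasks = [
    dummy_operator.DummyOperator(task_id='task_%d' % i, dag=dag)
    for i in range(3)
]
join = dummy_operator.DummyOperator(task_id='join', dag=dag)
join.set_upstream(upstream_tasks)  # same call pattern as in the snippet above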
Example #12
  def test_build_graph(self):
    r"""Tests building airflow DAG graph using add_node_to_graph().

    The dependency graph is as follows:
                     component_one
                     /           \
                    /             \
          component_two         component_three
                    \             /
                     \           /
                     component_four
    """

    component_one = dummy_operator.DummyOperator(
        task_id='one', dag=self.pipeline)
    component_two = dummy_operator.DummyOperator(
        task_id='two', dag=self.pipeline)
    component_three = dummy_operator.DummyOperator(
        task_id='three', dag=self.pipeline)
    component_four = dummy_operator.DummyOperator(
        task_id='four', dag=self.pipeline)

    component_one_input_a = TfxType('i1a')
    component_one_input_b = TfxType('i1b')
    component_one_output_a = TfxType('o1a')
    component_one_output_b = TfxType('o1b')
    component_two_output = TfxType('o2')
    component_three_output = TfxType('o3')
    component_four_output = TfxType('o4')

    component_one_input_dict = {
        'i1a': [component_one_input_a],
        'i1b': [component_one_input_b]
    }
    component_one_output_dict = {
        'o1a': [component_one_output_a],
        'o1b': [component_one_output_b]
    }
    component_two_input_dict = {
        'i2a': [component_one_output_a],
        'i2b': [component_one_output_b]
    }
    component_two_output_dict = {'o2': [component_two_output]}
    component_three_input_dict = {
        'i3a': [component_one_output_a],
        'i3b': [component_one_output_b]
    }
    component_three_output_dict = {'o3': [component_three_output]}
    component_four_input_dict = {
        'i4a': [component_two_output],
        'i4b': [component_three_output]
    }
    component_four_output_dict = {'o4': [component_four_output]}

    self.pipeline.add_node_to_graph(
        component_one,
        consumes=component_one_input_dict.values(),
        produces=component_one_output_dict.values())
    self.pipeline.add_node_to_graph(
        component_two,
        consumes=component_two_input_dict.values(),
        produces=component_two_output_dict.values())
    self.pipeline.add_node_to_graph(
        component_three,
        consumes=component_three_input_dict.values(),
        produces=component_three_output_dict.values())
    self.pipeline.add_node_to_graph(
        component_four,
        consumes=component_four_input_dict.values(),
        produces=component_four_output_dict.values())

    self.assertItemsEqual(component_one.upstream_list, [])
    self.assertItemsEqual(component_two.upstream_list, [component_one])
    self.assertItemsEqual(component_three.upstream_list, [component_one])
    self.assertItemsEqual(component_four.upstream_list,
                          [component_two, component_three])
Example #13
 def convert_to_airflow_op(self):
     return dummy_operator.DummyOperator(task_id=self.task_id,
                                         trigger_rule=self.trigger_rule)
Example #14
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""An example DAG demonstrating a cyle in the task IDs."""

import datetime

from airflow import models
from airflow.operators import dummy_operator

# If you are running Airflow in more than one time zone
# see https://airflow.apache.org/docs/apache-airflow/stable/timezone.html
# for best practices
yesterday = datetime.datetime.now() - datetime.timedelta(days=1)

default_dag_args = {
    'start_date': yesterday,
}

with models.DAG('composer_sample_cycle',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:
    start = dummy_operator.DummyOperator(task_id='oops_a_cycle')
    end = dummy_operator.DummyOperator(task_id='oops_a_cycle')
    start >> end
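
The snippet above is intentionally broken: both operators use the task_id 'oops_a_cycle', so start >> end ends up pointing the task at itself and the DAG fails validation (newer Airflow versions reject the duplicate task_id outright). For contrast, a minimal fixed sketch reusing the imports and default_dag_args from above; the DAG name here is hypothetical:

with models.DAG('composer_sample_no_cycle',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as fixed_dag:
    # Unique task ids, so the start -> end dependency is acyclic.
    start = dummy_operator.DummyOperator(task_id='start')
    end = dummy_operator.DummyOperator(task_id='end')
    start >> end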
Example #15
    'project_id': gcp_project
}

with models.DAG('product_table',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    bq_make_raw_dataset = bash_operator.BashOperator(
        task_id='make_bq_raw_dataset',
        bash_command=('bq --location=asia-southeast1 ls {} || '
                      'bq --location=asia-southeast1 mk {}').format(
                          bq_raw_dataset_name, bq_raw_dataset_name))

    raw_sql_files = read_sql_from_gcs(bq_raw_dataset_name, gcs_bucket)

    bq_start_making_raw_tables = dummy_operator.DummyOperator(
        task_id='start_making_raw_tables')

    bq_end_making_raw_tables = dummy_operator.DummyOperator(
        task_id='end_making_raw_tables')

    for filename in raw_sql_files:
        sql_statement = raw_sql_files[filename].decode()
        table_name = filename.replace('.sql', '')
        table_name = table_name.replace('raw/', '')
        bq_make_raw_tables = bigquery_operator.BigQueryOperator(
            task_id='make_raw_table_{}'.format(table_name),
            sql=sql_statement,
            use_legacy_sql=False,
            location='asia-southeast1')
        bq_start_making_raw_tables >> bq_make_raw_tables
        bq_make_raw_tables >> bq_end_making_raw_tables
Example #16
with models.DAG(
        'running_python_bash_and_dummy_operator',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    def hello_world():
        print('Hello World!')
        return 1

    def greeting():
        print('Greetings from SpikeySales! Happy shopping.')
        return 'Greeting successfully printed.'

    hello_world_greeting = python_operator.PythonOperator(
        task_id='python_1',
        python_callable=hello_world)
    

    spikeysales_greeting = python_operator.PythonOperator(
        task_id='python_2',
        python_callable=greeting)

    bash_greeting = bash_operator.BashOperator(
        task_id='bye_bash',
        bash_command='echo Goodbye! Hope to see you soon.')

    end = dummy_operator.DummyOperator(
        task_id='dummy')

    hello_world_greeting >> spikeysales_greeting >> bash_greeting >> end
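
The linear dependency in the last line can also be expressed with Airflow's chain() helper; a minimal equivalent sketch (the helper lives in airflow.utils.helpers in the 1.10 line and was moved in later releases):

from airflow.utils.helpers import chain

# Equivalent to: hello_world_greeting >> spikeysales_greeting >> bash_greeting >> end
chain(hello_world_greeting, spikeysales_greeting, bash_greeting, end)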
Example #17
    schedule_interval="@once",
)

task_default = bigquery_operator.BigQueryOperator(
    task_id='task_default_connection',
    bql='SELECT 1',
    use_legacy_sql=False,
    dag=dag)

task_explicit = bigquery_operator.BigQueryOperator(
    task_id='task_explicit_connection',
    bql='SELECT 1',
    use_legacy_sql=False,
    bigquery_conn_id='google_cloud_default',
    dag=dag)

task_custom = bigquery_operator.BigQueryOperator(
    task_id='task_custom_connection',
    bql='SELECT 1',
    use_legacy_sql=False,
    bigquery_conn_id='my_gcp_connection',
    dag=dag)

start_task = dummy_operator.DummyOperator(
    task_id='start',
    default_args=default_args,
    dag=dag,
)

start_task >> [task_default, task_explicit, task_custom]
Example #18
def get_job_from_xcom(**kwargs):
    job_id = json.loads(
        kwargs['ti'].xcom_pull(task_ids='start_dataprep'))['id']
    return job_id


# --------------------------------------------------------------------------------
# Main DAG
# --------------------------------------------------------------------------------

dag = models.DAG(dag_id='demo_etl',
                 default_args=default_args,
                 schedule_interval=None)

start = dummy_operator.DummyOperator(task_id='start',
                                     trigger_rule='all_success',
                                     dag=dag)

tables_deleted = dummy_operator.DummyOperator(task_id='tables_deleted',
                                              trigger_rule='all_success',
                                              dag=dag)

data_collected = dummy_operator.DummyOperator(task_id='data_collected',
                                              trigger_rule='all_success',
                                              dag=dag)

end = dummy_operator.DummyOperator(task_id='end',
                                   trigger_rule='all_success',
                                   dag=dag)

delete_jobs = []