    def test_should_pass_argument_to_hook(self, mock_hook):
        task = GoogleCloudStorageObjectSensor(
            task_id="task-id",
            bucket=TEST_BUCKET,
            object=TEST_OBJECT,
            google_cloud_conn_id=TEST_GCP_CONN_ID,
            delegate_to=TEST_DELEGATE_TO,
        )
        mock_hook.return_value.exists.return_value = True

        result = task.poke(mock.MagicMock())

        self.assertEqual(True, result)
        mock_hook.assert_called_once_with(
            delegate_to=TEST_DELEGATE_TO,
            google_cloud_storage_conn_id=TEST_GCP_CONN_ID
        )
        mock_hook.return_value.exists.assert_called_once_with(TEST_BUCKET, TEST_OBJECT)
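The test above relies on a mock_hook argument and module-level constants that the snippet leaves out; a minimal sketch of that scaffolding, assuming the hook is patched where the sensor module imports it (the patch target and constant values are guesses, not from the original):

# Hypothetical scaffolding for the test above; the patch target and the
# constant values are assumptions, not part of the original snippet.
from unittest import TestCase, mock

from airflow.contrib.sensors.gcs_sensor import GoogleCloudStorageObjectSensor

TEST_BUCKET = "test-bucket"
TEST_OBJECT = "test-object"
TEST_GCP_CONN_ID = "test-gcp-conn-id"
TEST_DELEGATE_TO = "test-delegate-to"


@mock.patch("airflow.contrib.sensors.gcs_sensor.GoogleCloudStorageHook")
class TestGoogleCloudStorageObjectSensor(TestCase):
    # The test_should_pass_argument_to_hook method shown above goes here.
    ...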
Example #2
    def add_load_tasks(task, file_format, allow_quoted_newlines=False):

        wait_sensor = GoogleCloudStorageObjectSensor(
            task_id='wait_latest_{task}'.format(task=task),
            timeout=60 * 60,
            poke_interval=60,
            bucket=output_bucket,
            object='export/{task}/block_date={datestamp}/{task}.{file_format}'.
            format(task=task, datestamp='{{ds}}', file_format=file_format),
            dag=dag)

        def load_task(ds, **kwargs):
            client = Client()
            job_config = LoadJobConfig()
            schema_path = os.path.join(
                dags_folder,
                'resources/stages/raw/schemas/{task}.json'.format(task=task))
            job_config.schema = read_bigquery_schema_from_file(schema_path)
            job_config.source_format = SourceFormat.CSV if file_format == 'csv' else SourceFormat.NEWLINE_DELIMITED_JSON
            if file_format == 'csv':
                job_config.skip_leading_rows = 1
            job_config.write_disposition = 'WRITE_TRUNCATE'
            job_config.allow_quoted_newlines = allow_quoted_newlines
            job_config.ignore_unknown_values = True

            # Load from
            export_location_uri = 'gs://{bucket}/export'.format(
                bucket=output_bucket)
            date_glob = '*'
            uri = '{export_location_uri}/{task}/{date_glob}.{file_format}'.format(
                export_location_uri=export_location_uri,
                task=task,
                date_glob=date_glob,
                file_format=file_format)
            logging.info('Load from uri: ' + uri)

            # Table name
            table_name = task
            logging.info('Table name: ' + table_name)
            table_ref = client.dataset(dataset_name_raw).table(table_name)

            # Load job
            load_job = client.load_table_from_uri(uri,
                                                  table_ref,
                                                  job_config=job_config)
            submit_bigquery_job(load_job, job_config)
            assert load_job.state == 'DONE'

        load_operator = PythonOperator(task_id='load_{task}'.format(task=task),
                                       python_callable=load_task,
                                       provide_context=True,
                                       execution_timeout=timedelta(minutes=30),
                                       dag=dag)

        wait_sensor >> load_operator
        return load_operator
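add_load_tasks above is a nested helper inside a larger DAG-building function; a short, assumed usage sketch (the task names and file formats are placeholders, not taken from the original) shows how it would typically be called once per exported table:

    # Hypothetical call sites; each call wires wait_sensor >> load_operator for
    # one exported table and returns the load operator for further chaining.
    load_blocks_task = add_load_tasks('blocks', 'csv')
    load_transactions_task = add_load_tasks('transactions', 'csv')
    load_logs_task = add_load_tasks('logs', 'json', allow_quoted_newlines=True)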
Example #3
def add_load_tasks(task, file_format, allow_quoted_newlines=False):
    output_bucket = os.environ.get('OUTPUT_BUCKET')
    if output_bucket is None:
        raise ValueError('You must set OUTPUT_BUCKET environment variable')

    wait_sensor = GoogleCloudStorageObjectSensor(
        task_id='wait_latest_{task}'.format(task=task),
        timeout=60 * 60,
        poke_interval=60,
        bucket=output_bucket,
        object='export/{task}/block_date={datestamp}/{task}.{file_format}'.
        format(task=task, datestamp='{{ds}}', file_format=file_format),
        dag=dag)

    def load_task():
        client = bigquery.Client()
        job_config = bigquery.LoadJobConfig()
        schema_path = os.path.join(
            dags_folder,
            'resources/stages/raw/schemas/{task}.json'.format(task=task))
        job_config.schema = read_bigquery_schema_from_file(schema_path)
        job_config.source_format = bigquery.SourceFormat.CSV if file_format == 'csv' else bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
        if file_format == 'csv':
            job_config.skip_leading_rows = 1
        job_config.write_disposition = 'WRITE_TRUNCATE'
        job_config.allow_quoted_newlines = allow_quoted_newlines

        export_location_uri = 'gs://{bucket}/export'.format(
            bucket=output_bucket)
        uri = '{export_location_uri}/{task}/*.{file_format}'.format(
            export_location_uri=export_location_uri,
            task=task,
            file_format=file_format)
        table_ref = client.dataset(dataset_name_raw).table(task)
        load_job = client.load_table_from_uri(uri,
                                              table_ref,
                                              job_config=job_config)
        submit_bigquery_job(load_job, job_config)
        assert load_job.state == 'DONE'

    load_operator = PythonOperator(task_id='load_{task}'.format(task=task),
                                   python_callable=load_task,
                                   execution_timeout=timedelta(minutes=30),
                                   dag=dag)

    wait_sensor >> load_operator
    return load_operator
Example #4
    def add_load_tasks(task, time_partitioning_field='timestamp'):
        wait_sensor = GoogleCloudStorageObjectSensor(
            task_id='wait_latest_{task}'.format(task=task),
            timeout=60 * 60,
            poke_interval=60,
            bucket=output_bucket,
            object='export/{task}/block_date={datestamp}/{task}.json'.format(
                task=task, datestamp='{{ds}}'),
            dag=dag)

        def load_task():
            client = bigquery.Client()
            job_config = bigquery.LoadJobConfig()
            schema_path = os.path.join(
                dags_folder,
                'resources/stages/load/schemas/{task}.json'.format(task=task))
            job_config.schema = read_bigquery_schema_from_file(schema_path)
            job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
            job_config.write_disposition = 'WRITE_TRUNCATE'
            job_config.ignore_unknown_values = True
            job_config.time_partitioning = TimePartitioning(
                field=time_partitioning_field)

            export_location_uri = 'gs://{bucket}/export'.format(
                bucket=output_bucket)
            uri = '{export_location_uri}/{task}/*.json'.format(
                export_location_uri=export_location_uri, task=task)
            table_ref = create_dataset(
                client, dataset_name,
                destination_dataset_project_id).table(task)
            load_job = client.load_table_from_uri(uri,
                                                  table_ref,
                                                  job_config=job_config)
            submit_bigquery_job(load_job, job_config)
            assert load_job.state == 'DONE'

        load_operator = PythonOperator(task_id='load_{task}'.format(task=task),
                                       python_callable=load_task,
                                       execution_timeout=timedelta(minutes=30),
                                       dag=dag)

        wait_sensor >> load_operator
        return load_operator
Example #5
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

# write dag
with DAG(dag_id='blog',
         default_args=default_args,
         schedule_interval=schedule_interval,
         catchup=False) as dag:

    t1 = BigQueryCheckOperator(task_id='check_bq_data_exists',
                               sql=queries.check_bq_data_exists,
                               use_legacy_sql=False)

    t2 = GoogleCloudStorageObjectSensor(task_id='check_gcs_file_exists',
                                        bucket=cfg.BUCKET,
                                        object=cfg.SOURCE_OBJECT)

    t3 = BigQueryOperator(task_id='write_weight_data_to_bq',
                          sql=queries.write_weight_data_to_bq,
                          destination_dataset_table=cfg.BQ_TABLE_WEIGHT,
                          create_disposition='CREATE_IF_NEEDED',
                          write_disposition='WRITE_TRUNCATE',
                          use_legacy_sql=False)

    t4 = GoogleCloudStorageToBigQueryOperator(
        task_id='write_route_data_to_bq',
        bucket=cfg.BUCKET,
        source_objects=[cfg.SOURCE_OBJECT],
        field_delimiter=';',
        destination_project_dataset_table=cfg.BQ_TABLE_ROUTE,
Example #6
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=3),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG(
        'bdpc_lab8_orchestration',
        schedule_interval='@hourly',
        default_args=default_dag_args,
        catchup=False) as dag:

    # Sensor that checks for the file gs://${bucket_name}/flights/${yyyy}/${MM}/${dd}/${HH}/_SUCCESS in GCS.
    gcs_file_sensor = GoogleCloudStorageObjectSensor(
        task_id='check_gcs_file_sensor',
        timeout=120,
        bucket='barinov_bdpc',
        soft_fail=True,  # Skip the task if the file is not found; set to False to fail it instead.
        object='flights/{{ execution_date.format("%Y/%m/%d/%H") }}/_SUCCESS')

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='lab8-work-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run Spark job - Popular airports
    run_dataproc_spark_t1 = dataproc_operator.DataProcSparkOperator(
        task_id='run_dataproc_spark_task1',
Example #7
    'owner': 'Airflow',
    #'start_date': airflow.utils.dates.days_ago(2),
    'start_date': datetime.datetime(2020, 1, 27),
}

dag = DAG(
    dag_id='exercise_gcp_dag',
    default_args=args,
    schedule_interval="0 0 * * *",
    dagrun_timeout=datetime.timedelta(minutes=60),
)

sensor = GoogleCloudStorageObjectSensor(
    task_id='sensor',
    bucket='test_bucket312312',
    timeout=3600,
    object='test/{{ ds_nodash }}.csv',
    dag=dag,
)

copy_to_bq = GoogleCloudStorageToBigQueryOperator(
    task_id='copy_to_bq',
    dag=dag,
    bucket='test_bucket312312',
    source_objects=['test/{{ ds_nodash }}.csv'],
    destination_project_dataset_table=
    'airflowbolcom-jan2829-2ad52563.test_dataset.test_table',
    skip_leading_rows=1,
    write_disposition='WRITE_TRUNCATE',
    autodetect=True,
)
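The snippet ends before any dependency between the two tasks is declared; a typical continuation (hypothetical, not shown in the original) gates the load on the sensor:

# Hypothetical wiring, not part of the original snippet:
sensor >> copy_to_bq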
Example #8
    task_trigger_report_email = EmailOperator(
        task_id='send_report_email_' + rpt_folder_name,
        to=rpt_rcpnt,
        cc=rpt_pic,
        subject='Report ' + rpt_folder_name + "_" + "{{ ds }}",
        html_content=EmailTemplate,
        dag=dag)

    rpt_cnt = 0
    for itr_rpt in report_config[rpt_folder_cnt]['reports']:
        rpt_name = report_config[rpt_folder_cnt]['reports'][rpt_cnt][
            'report_name']

        task_gcs_sensor = GoogleCloudStorageObjectSensor(
            task_id='gcs_sensor_' + rpt_name,
            bucket=rpt_bucket_name,
            object='%s/%s_%s.txt' % (rpt_folder_name, "{{ ds }}", rpt_name),
            google_cloud_conn_id='gcp_project_deng',
            dag=dag)

        task_gcs_give_acl = GoogleCloudStorageObjectCreateAclEntryOperator(
            task_id='giveaclaccess_gcs_' + rpt_name,
            object_name='%s/%s_%s.txt' %
            (rpt_folder_name, "{{ ds }}", rpt_name),
            entity='allUsers',
            role='READER',
            bucket=rpt_bucket_name,
            google_cloud_storage_conn_id='gcp_project_deng',
            dag=dag)

        task_start >> task_start_report >> task_gcs_sensor >> task_gcs_give_acl
        rpt_cnt += 1
Example #9
def build_sessions_dag(dag_id,
                       output_bucket,
                       sql_dir,
                       source_project_id,
                       source_dataset_name,
                       destination_project_id,
                       destination_dataset_name,
                       temp_dataset_name,
                       notification_emails=None,
                       schedule_interval='0 14 * * *',
                       start_date=datetime(2015, 7, 30),
                       environment='prod'):

    default_dag_args = {
        'depends_on_past': False,
        'start_date': start_date,
        'email_on_failure': True,
        'email_on_retry': False,
        'retries': 5,
        'retry_delay': timedelta(minutes=5)
    }

    if notification_emails and len(notification_emails) > 0:
        default_dag_args['email'] = [
            email.strip() for email in notification_emails.split(',')
        ]

    # Define a DAG (directed acyclic graph) of tasks.
    dag = models.DAG(dag_id,
                     catchup=True,
                     schedule_interval=schedule_interval,
                     max_active_runs=1,
                     default_args=default_dag_args)

    def read_file(filepath):
        with open(filepath) as file_handle:
            content = file_handle.read()
            return content

    def add_sessions_task(task, dependencies=None):
        def sessions_task(ds, **kwargs):

            client = bigquery.Client()
            sql_path = os.path.join(sql_dir, '{task}.sql'.format(task=task))
            sql_template = read_file(sql_path)
            ds_no_dashes = ds.replace('-', '')
            sql = sql_template.format(
                ds=ds,
                ds_no_dashes=ds_no_dashes,
                source_project_id=source_project_id,
                source_dataset_name=source_dataset_name,
                destination_project_id=destination_project_id,
                destination_dataset_name=destination_dataset_name,
                temp_dataset_name=temp_dataset_name,
            )
            print(sql)
            query_job = client.query(sql)
            result = query_job.result()
            logging.info(result)

        sessions_operator = PythonOperator(
            task_id=f'{task}',
            # Necessary because we use traces overlapping the previous execution date.
            depends_on_past=True,
            wait_for_downstream=True,
            python_callable=sessions_task,
            provide_context=True,
            execution_timeout=timedelta(minutes=60),
            dag=dag)

        if dependencies is not None and len(dependencies) > 0:
            for dependency in dependencies:
                dependency >> sessions_operator
        return sessions_operator

    stage_root_call_traces_task = add_sessions_task('root_call_traces')
    upsert_sessions_task = add_sessions_task('sessions')

    # Dummy task indicating successful DAG completion.
    done_task = BashOperator(task_id='done', bash_command='echo done', dag=dag)

    #
    # Task sensor is enabled only in production for now because our load DAG is
    # not running in the lower environments.
    #
    if environment == 'prod':
        wait_for_ethereum_load_dag_task = GoogleCloudStorageObjectSensor(
            task_id='wait_for_ethereum_load_dag',
            timeout=60 * 60 * 12,
            poke_interval=5 * 60,
            bucket=output_bucket,
            object=
            "checkpoint/block_date={block_date}/load_complete_checkpoint.txt".
            format(block_date='{{ds}}'),
            dag=dag)
        wait_for_ethereum_load_dag_task >> stage_root_call_traces_task

    stage_root_call_traces_task >> upsert_sessions_task
    upsert_sessions_task >> done_task

    return dag
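build_sessions_dag is a DAG factory, so a DAG file has to call it and expose the result as a module-level variable for Airflow to discover it; a minimal, assumed sketch (the import path, bucket, and project/dataset names are placeholders):

# Hypothetical DAG file that registers the factory output; every value below is a placeholder.
from sessions_dag_builder import build_sessions_dag  # assumed import path

dag = build_sessions_dag(
    dag_id='ethereum_sessions_dag',
    output_bucket='my-output-bucket',
    sql_dir='/home/airflow/gcs/dags/resources/sessions/sqls',
    source_project_id='my-source-project',
    source_dataset_name='crypto_ethereum',
    destination_project_id='my-destination-project',
    destination_dataset_name='crypto_ethereum_sessions',
    temp_dataset_name='crypto_ethereum_temp',
    environment='prod',
)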
Example #10
    region='us-west1',
    job_name=cleaned_dag_id + 'calc_retention_day1',
    cluster_name=
    '{{ ti.xcom_pull(key="unique_cluster_name", task_ids="generate_unique_cluster_name") }}'
    + '2',
    execution_timeout=timedelta(hours=2),
    arguments=args,
    dag=dag)

# ==================
# == gcs sensors ===
# ==================

sensor_task = GoogleCloudStorageObjectSensor(task_id='sensor_task',
                                             bucket='exampleBucket',
                                             object='output/user/_SUCCESS',
                                             poke_interval=30,
                                             timeout=2700,
                                             dag=dag)

# =======================
# == load to bigquery ===
# =======================

bq_load_user = GoogleCloudStorageToBigQueryOperator(
    task_id='bq_load_user',
    bucket='exampleBucket',
    source_objects=["obj_folder/*"],
    source_format='NEWLINE_DELIMITED_JSON',
    destination_project_dataset_table='exampleSchema.exampleTable0',
    schema_fields=[
        {
Example #11
    'project_id': var_project_id
}

# [START composer_hadoop_schedule]
with models.DAG(
        'airflow',
        # Run the DAG once per hour
        schedule_interval=datetime.timedelta(hours=1),
        default_args=default_dag_args,
        catchup=False) as dag:
    # [END composer_hadoop_schedule]

    # Sensor
    gcs_file_sensor = GoogleCloudStorageObjectSensor(
        task_id='waiting_file_sensor',
        timeout=120,
        bucket=var_bucket_name,
        soft_fail=True,
        object='flights/{{ execution_date.format("%Y/%m/%d/%H") }}/_SUCCESS')

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name=naming_cluster + '-{{ ds_nodash }}',
        num_workers=2,
        zone=var_zone,
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
Example #12
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG(
        'composer_spark_job',
        # Run the DAG at the top of every hour
        schedule_interval='0 * * * *',
        default_args=default_dag_args,
        catchup=False) as dag:

  
    gcs_file_sensor = GoogleCloudStorageObjectSensor(
        task_id='gcs_file_sensor_task',
        bucket=bucket,
        object=full_path,
        timeout=3600)

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

Example #13
DEFAULT_ARGS = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 8, 18),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

DAG_ID = DAG(dag_id=DAG_NAME,
             default_args=DEFAULT_ARGS,
             schedule_interval='@daily')

SENSE = GoogleCloudStorageObjectSensor(
    task_id='get_file_sample',
    dag=DAG_ID,
    bucket=BUCKET,
    object=OBJECT,
    google_cloud_conn_id=GCS_CONN,
)

PRINT = BashOperator(task_id='print',
                     bash_command='echo "file arrived"',
                     params={'my_param': 'dummy param'},
                     dag=DAG_ID)

SENSE >> PRINT
Example #14
def build_load_worldpop_dag(
    dag_id,
    output_bucket,
    countries,
    large_countries,
    destination_dataset_project_id,
    destination_dataset_name,
    destination_table_name,
    staging_dataset_project_id,
    staging_dataset_name,
    dataflow_template_path,
    dataflow_environment,
    notification_emails=None,
    load_start_date=datetime(2000, 1, 1),
    load_schedule_interval="0 0 * * *",
    load_max_active_runs=None,
    load_concurrency=None,
    load_retries=5,
    load_retry_delay=300,
    output_path_prefix="export",
    **kwargs,
):
    if not output_bucket:
        raise ValueError("output_bucket is required")
    if not destination_dataset_project_id:
        raise ValueError("destination_dataset_project_id is required")
    if not destination_dataset_name:
        raise ValueError("destination_dataset_name is required")
    if not destination_table_name:
        raise ValueError("destination_table_name is required")

    default_dag_args = {
        "depends_on_past": False,
        "start_date": load_start_date,
        "end_date": None,
        "email_on_failure": True,
        "email_on_retry": False,
        "retries": load_retries,
        "retry_delay": timedelta(seconds=load_retry_delay),
    }

    if notification_emails and len(notification_emails) > 0:
        default_dag_args["email"] = [
            email.strip() for email in notification_emails.split(",")
        ]

    if load_max_active_runs is None:
        load_max_active_runs = configuration.conf.getint(
            "core", "max_active_runs_per_dag")

    dag = models.DAG(
        dag_id,
        schedule_interval=load_schedule_interval,
        max_active_runs=load_max_active_runs,
        concurrency=load_concurrency,
        default_args=default_dag_args,
        is_paused_upon_creation=True,
    )

    dags_folder = os.environ.get("DAGS_FOLDER", "/home/airflow/gcs/dags")

    def read_bigquery_schema(schema):
        schema_path = os.path.join(
            dags_folder,
            "resources/stages/load/schemas/{schema}.json".format(
                schema=schema),
        )
        return read_bigquery_schema_from_file(schema_path)

    def load_task(country, **context):
        client = bigquery.Client()
        job_config = bigquery.LoadJobConfig()
        job_config.schema = read_bigquery_schema("world_pop")
        job_config.source_format = bigquery.SourceFormat.PARQUET
        job_config.write_disposition = "WRITE_TRUNCATE"
        job_config.ignore_unknown_values = True
        job_config.range_partitioning = RangePartitioning(
            field="year",
            range_=PartitionRange(start=1900, end=2100, interval=1),
        )

        execution_date = context["execution_date"]
        load_table_name = "{table}_{country}_{year}".format(
            table=destination_table_name,
            country=country,
            year=execution_date.strftime("%Y"),
        )
        table_ref = create_dataset(
            client,
            staging_dataset_name,
            project=staging_dataset_project_id,
        ).table(load_table_name)

        load_uri = "gs://{bucket}/{prefix}/world_pop/year={year}/parquet/{country}_{year}.parquet".format(
            bucket=output_bucket,
            prefix=output_path_prefix,
            country=country,
            year=execution_date.strftime("%Y"),
        )
        load_job = client.load_table_from_uri(
            load_uri,
            table_ref,
            job_config=job_config,
        )
        submit_bigquery_job(load_job, job_config)
        assert load_job.state == "DONE"

    def merge_task(country, **context):
        client = bigquery.Client()

        table_ref = create_dataset(
            client,
            destination_dataset_name,
            project=destination_dataset_project_id,
        ).table(destination_table_name)
        if not does_table_exist(client, table_ref):
            table = bigquery.Table(table_ref,
                                   schema=read_bigquery_schema("world_pop"))
            table.range_partitioning = RangePartitioning(
                field="year",
                range_=PartitionRange(start=1900, end=2100, interval=1),
            )
            table.clustering_fields = [
                "geography",
                "geography_polygon",
                "country",
            ]
            client.create_table(table)

        job_config = bigquery.QueryJobConfig()
        job_config.priority = bigquery.QueryPriority.INTERACTIVE

        sql_path = os.path.join(
            dags_folder, "resources/stages/load/sqls/merge_worldpop.sql")
        sql_template = read_file(sql_path)

        execution_date = context["execution_date"]
        year = execution_date.strftime("%Y")
        staging_table_name = "{table}_{country}_{year}".format(
            table=destination_table_name, country=country, year=year)

        template_context = {
            "year": year,
            "country": country,
            "source_table": staging_table_name,
            "source_project_id": staging_dataset_project_id,
            "source_dataset_name": staging_dataset_name,
            "destination_table": destination_table_name,
            "destination_dataset_project_id": destination_dataset_project_id,
            "destination_dataset_name": destination_dataset_name,
        }

        sql = context["task"].render_template(sql_template, template_context)
        job = client.query(sql, location="US", job_config=job_config)
        submit_bigquery_job(job, job_config)
        assert job.state == "DONE"

    priority = len(countries)
    for country in countries.split(","):
        c = country.lower()
        wait_uri = (
            "{prefix}/world_pop/year={year}/parquet/{country}_{year}.parquet".
            format(
                prefix=output_path_prefix,
                country=country,
                year='{{execution_date.strftime("%Y")}}',
            ))
        wait_gcs = GoogleCloudStorageObjectSensor(
            task_id=f"wait_{c}",
            timeout=60 * 60,
            poke_interval=60,
            bucket=output_bucket,
            object=wait_uri,
            weight_rule="upstream",
            priority_weight=priority,
            dag=dag,
        )
        if country in large_countries:
            input_file = "gs://{bucket}/{prefix}/world_pop/year={year}/parquet/{country}_{year}.parquet".format(
                bucket=output_bucket,
                prefix=output_path_prefix,
                country=country,
                year='{{ execution_date.strftime("%Y") }}',
            )
            output_table = "{table}_{country}_{year}".format(
                table=destination_table_name,
                country=country,
                year='{{ execution_date.strftime("%Y") }}',
            )
            load_operator = DataflowStartFlexTemplateOperator(
                task_id=f"run_dataflow_load_{c}",
                body={
                    "launchParameter": {
                        "containerSpecGcsPath":
                        f"{dataflow_template_path}/load-parquet-0.1.0.json",
                        "jobName": "load-parquet-{country}".format(country=c) +
                        '-{{ execution_date.strftime("%Y%m%d-%H%M%S") }}',
                        "parameters": {
                            "input-file":
                            input_file,
                            "output-table":
                            f"{staging_dataset_project_id}:{staging_dataset_name}.{output_table}",
                            "output-schema":
                            "/dataflow/template/schemas/world_pop.json",
                        },
                        "environment": dataflow_environment,
                    }
                },
                location="us-central1",
                wait_until_finished=True,
                dag=dag,
            )
        else:
            load_operator = PythonOperator(
                task_id=f"load_{c}",
                python_callable=load_task,
                execution_timeout=timedelta(minutes=600),
                provide_context=True,
                op_kwargs={"country": country},
                retries=1,
                retry_delay=timedelta(seconds=300),
                weight_rule="upstream",
                priority_weight=priority,
                dag=dag,
            )
        merge_operator = PythonOperator(
            task_id=f"merge_{c}",
            python_callable=merge_task,
            execution_timeout=timedelta(minutes=600),
            provide_context=True,
            op_kwargs={"country": country},
            retries=1,
            retry_delay=timedelta(seconds=300),
            weight_rule="upstream",
            priority_weight=priority,
            dag=dag,
        )
        priority -= 1
        wait_gcs >> load_operator >> merge_operator

    return dag
Example #15
  bike_share_retention_d7 = DataProcSparkOperator(
      task_id='bike_share_retention_d7',
      dataproc_spark_jars=[
          'gs://jiuzhangsuanfa/jar/CohortProject-assembly-0.1.jar'
      ],
      main_class='com.cohort.process.RetentionProcess',
      region='us-west1',
      job_name=dag_name + 'bike_share_retention_d7',
      cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}'
      + '4',
      execution_timeout=timedelta(minutes=180),
      arguments=args)

  unique_user_sensor = GoogleCloudStorageObjectSensor(
      task_id='unique_user_sensor',
      bucket='jiuzhangsuanfa',
      object='bike/unique-user/_SUCCESS',
      poke_interval=30,
      timeout=2700)

  unique_user.set_upstream(dataproc_create_cluster_1)

  unique_user.set_downstream(bike_share_aggregator)

  bike_share_aggregator.set_downstream(dataproc_destroy_cluster_1)

  bike_share_retention_d1.set_upstream(dataproc_create_cluster_2)

  bike_share_retention_d1.set_downstream(dataproc_destroy_cluster_2)

  bike_share_retention_d3.set_upstream(dataproc_create_cluster_3)
    # print("initest : " + str(context.get("execution_date")))
    # print("initest : " + type(context.get("execution_date")))
    # print("initest : " + type(context.get("dag").schedule_interval))

    test = pendulum.datetime(2019, 2, 16, 10, 0, 0)
    print("initest Execution_date: " + str(context['execution_date']) +
          " Schedule_interval: " + str(context['dag'].schedule_interval))
    return context.get(
        "execution_date") + test  #context['dag'].schedule_interval


# def execute(self, context):
#     execution_date = context.get("execution_date")
t1 = GoogleCloudStorageObjectSensor(
    task_id='gcs_sensor_trigger',
    bucket='dataflow-results-ini',
    object='airflow-testing/test.csv',  # object path within the bucket, not a full gs:// URI
    google_cloud_conn_id='gcs_conn',
    dag=dag)

t2 = GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq',
    bucket='dataflow-results-ini',
    source_objects=['airflow-testing/test.csv'],
    schema_object='airflow-testing/test.json',
    destination_project_dataset_table="dataflowtesting-218212.testing.newone",
    write_disposition='WRITE_APPEND',
    autodetect=True,
    bigquery_conn_id=BQ_CONN_ID,
    dag=dag)

t1 >> t2
Example #17
# --------------------------------------------------------------------------------

dag = DAG('DAG_3_GCS_To_BigQuery',
          default_args=default_args,
          schedule_interval='@daily')

start = DummyOperator(
    task_id='start',
    trigger_rule='all_success',
    dag=dag
)

GCS_landing_sensor = GoogleCloudStorageObjectSensor(
    task_id='GCS_landing_sensor',
    bucket=gcs_bucket,
    object=os.path.join(gcs_dir, success_file),
    google_cloud_conn_id=google_cloud_conn_id,
    dag=dag
)

GCS_to_BigQuery = GoogleCloudStorageToBigQueryOperator(
    task_id='GCS_to_BigQuery',
    destination_project_dataset_table=bq_destination_dataset_table,
    bucket=gcs_bucket,
    source_format="avro",
    source_objects=[os.path.join(gcs_dir, '*.avro')],
    create_disposition='CREATE_IF_NEEDED',
    # The following values are supported for `create_disposition`:
    # CREATE_IF_NEEDED: If the table does not exist, BigQuery creates the table.
    # CREATE_NEVER: The table must already exist. If it does not, a 'notFound' error is returned in the job result.
    # The default value is CREATE_IF_NEEDED.
Example #18
        dataproc_spark_jars=['gs://path/jar/CohortAnalysis.jar'],
        main_class='com.makoto.spark.process.RetentionComputeProcess',
        region='us-west1',
        job_name=dag_name + 'bike_share_retention_d7',
        cluster_name=
        '{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}'
        + '4',
        execution_timeout=timedelta(minutes=180),
        arguments=args)

    # Poke every 30 seconds (poke_interval) for the _SUCCESS marker that signals
    # the computed unique-user list exists; give up after the 6000-second timeout.
    # Once the list is available, the downstream jobs are triggered.
    unique_user_sensor = GoogleCloudStorageObjectSensor(
        task_id='unique_user_sensor',
        bucket='bucketname',
        object='bike/unique-user/_SUCCESS',
        poke_interval=30,
        timeout=6000)

    unique_user.set_upstream(dataproc_create_cluster_1)

    unique_user.set_downstream(bike_share_aggregator)

    bike_share_aggregator.set_downstream(dataproc_destroy_cluster_1)

    bike_share_retention_d1.set_upstream(dataproc_create_cluster_2)

    bike_share_retention_d1.set_downstream(dataproc_destroy_cluster_2)

    bike_share_retention_d3.set_upstream(dataproc_create_cluster_3)
Example #19
def build_load_npp_dag(dag_id,
                       output_bucket,
                       destination_dataset_project_id,
                       destination_dataset_name,
                       destination_table_name,
                       notification_emails=None,
                       load_start_date=datetime(2000, 1, 1),
                       load_schedule_interval=None,
                       load_max_active_runs=None,
                       load_concurrency=None,
                       load_retries=5,
                       load_retry_delay=300,
                       output_path_prefix="export",
                       **kwargs):
    if not output_bucket:
        raise ValueError("output_bucket is required")
    if not destination_dataset_project_id:
        raise ValueError("destination_dataset_project_id is required")
    if not destination_dataset_name:
        raise ValueError("destination_dataset_name is required")
    if not destination_table_name:
        raise ValueError("destination_table_name is required")

    default_dag_args = {
        "depends_on_past": False,
        "start_date": load_start_date,
        "end_date": None,
        "email_on_failure": True,
        "email_on_retry": False,
        "retries": load_retries,
        "retry_delay": timedelta(seconds=load_retry_delay),
    }

    if notification_emails and len(notification_emails) > 0:
        default_dag_args["email"] = [
            email.strip() for email in notification_emails.split(",")
        ]

    if load_max_active_runs is None:
        load_max_active_runs = configuration.conf.getint(
            "core", "max_active_runs_per_dag")

    dag = models.DAG(
        dag_id,
        schedule_interval=load_schedule_interval,
        max_active_runs=load_max_active_runs,
        concurrency=load_concurrency,
        default_args=default_dag_args,
        is_paused_upon_creation=True,
    )

    def load_task(**context):
        dags_folder = os.environ.get("DAGS_FOLDER", "/home/airflow/gcs/dags")
        schema_path = os.path.join(
            dags_folder,
            "resources/stages/load/schemas/{schema}.json".format(
                schema="annual_npp"),
        )
        client = bigquery.Client()
        job_config = bigquery.LoadJobConfig()
        job_config.schema = read_bigquery_schema_from_file(schema_path)
        job_config.source_format = bigquery.SourceFormat.PARQUET
        job_config.write_disposition = "WRITE_TRUNCATE"
        job_config.ignore_unknown_values = True
        job_config.clustering_fields = [
            "geography",
            "geography_polygon",
        ]
        job_config.range_partitioning = RangePartitioning(
            field="year",
            range_=PartitionRange(start=1900, end=2100, interval=1),
        )
        execution_date = context["execution_date"]
        load_table_name = "{table}${partition}".format(
            table=destination_table_name,
            partition=execution_date.strftime("%Y"))
        table_ref = create_dataset(
            client,
            destination_dataset_name,
            project=destination_dataset_project_id,
        ).table(load_table_name)

        load_uri = "gs://{bucket}/{prefix}/annual_npp/parquet/{date}.parquet".format(
            bucket=output_bucket,
            prefix=output_path_prefix,
            date=execution_date.strftime("%Y_%m_%d"),
        )
        load_job = client.load_table_from_uri(
            load_uri,
            table_ref,
            job_config=job_config,
        )
        submit_bigquery_job(load_job, job_config)
        assert load_job.state == "DONE"

    wait_uri = "{prefix}/annual_npp/parquet/{date}.parquet".format(
        prefix=output_path_prefix,
        date='{{ execution_date.strftime("%Y_%m_%d") }}')
    wait_gcs = GoogleCloudStorageObjectSensor(
        task_id="wait_gcs",
        timeout=60 * 60,
        poke_interval=60,
        bucket=output_bucket,
        object=wait_uri,
        dag=dag,
    )
    load_operator = PythonOperator(
        task_id="load_to_bigquery",
        python_callable=load_task,
        execution_timeout=timedelta(minutes=600),
        provide_context=True,
        retries=1,
        retry_delay=timedelta(seconds=300),
        dag=dag,
    )
    wait_gcs >> load_operator
    return dag
Example #20
file_suffix = "_file.csv"

file_date = today.strftime('%Y-%m-%d')
full_path_today = file_prefix + file_date + file_suffix

file_date_yesterday = yesterday.strftime('%Y-%m-%d')
full_path_yesterday = file_prefix + file_date_yesterday + file_suffix

with airflow.DAG("file_sensor_example",
                 default_args=default_args,
                 schedule_interval="@once") as dag:

    start_task = DummyOperator(task_id="start")
    stop_task = DummyOperator(task_id="stop")
    sensor_task = FileSensor(task_id="file_sensor_task",
                             poke_interval=30,
                             filepath="/tmp/")
    # We expect yesterday's file to exist.
    gcs_file_sensor_yesterday = GoogleCloudStorageObjectSensor(
        task_id='gcs_file_sensor_yesterday_task',
        bucket='myBucketName',
        object=full_path_yesterday)
    # For this example we expect today's file not to exist, so the sensor keeps poking until the 120-second timeout; check out the docs for more options like mode and soft_fail.
    gcs_file_sensor_today = GoogleCloudStorageObjectSensor(
        task_id='gcs_file_sensor_today_task',
        bucket='myBucketName',
        object=full_path_today,
        timeout=120)

start_task >> sensor_task >> gcs_file_sensor_yesterday >> gcs_file_sensor_today >> stop_task
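The comment above gcs_file_sensor_today points at mode and soft_fail as further options; a hedged variant of that sensor showing them (values are illustrative only):

# Hypothetical variant of the sensor above, illustrating the mode and soft_fail
# options the comment refers to; bucket and object reuse the placeholders above.
gcs_file_sensor_today_lenient = GoogleCloudStorageObjectSensor(
    task_id='gcs_file_sensor_today_lenient_task',
    bucket='myBucketName',
    object=full_path_today,
    timeout=120,
    poke_interval=30,
    mode='reschedule',  # release the worker slot between pokes
    soft_fail=True,     # mark the task as skipped instead of failed on timeout
    dag=dag)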
Example #21
    'start_date': datetime(2019, 6, 17),
    'email': ['you@your_org.org'],
    'email_on_failure': True,
    'email_on_retry': True,
    'provide_context': True,
    'owner': 'airflow',
    'depends_on_past': True
}

dag = DAG(
    dag_id='gcs_sensor',
    default_args=args,
    schedule_interval=None,
)

gcs_sensor = GoogleCloudStorageObjectSensor(
    task_id="sense",
    bucket='bucket_of_stuff',
    object='sense_for_me.txt',
    google_cloud_storage_conn_id='google_cloud_storage_default',
    soft_fail=True,
    poke_interval=5,
    timeout=15,
    mode="poke",
    dag=dag)

start = DummyOperator(task_id="start", dag=dag)

end = DummyOperator(task_id="end", dag=dag)

start >> gcs_sensor >> end
Example #22
seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': seven_days_ago,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=30),
}

with DAG('v1_8_cloud_storage', schedule_interval=timedelta(days=1),
         default_args=default_args) as dag:
    sens_object_create = GoogleCloudStorageObjectSensor(
        task_id='sens_object_create',
        bucket='{{var.value.gcs_bucket}}',
        object='{{var.value.gcs_root}}/gcp_smoke_bq/bq_to_gcp_avro/{{ ds_nodash }}/part-000000000000.avro',
        google_cloud_conn_id='gcp_smoke'
    )

    sens_object_update = GoogleCloudStorageObjectUpdatedSensor(
        task_id='sens_object_update',
        bucket='{{var.value.gcs_bucket}}',
        object='{{var.value.gcs_root}}/gcp_smoke_bq/bq_to_gcp_avro/99999999/part-000000000000.avro',
        google_cloud_conn_id='gcp_smoke'
    )