Example #1
default_dag_args = {
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    #    'retry_delay': datetime.timedelta(minutes=5),
    'start_date': datetime.datetime.today() - datetime.timedelta(days=1)
}

with models.DAG('lastfm-1k-ingest',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    dataflow = dataflow_operator.DataFlowPythonOperator(
        task_id='ingest-users-dataflow',
        py_file='gs://{}/lastfm-dataset-1K/code/ingest-users.py'.format(
            PROJECT),
        job_name='ingest-users-dataflow',
        py_options=[],
        dataflow_default_options={
            'project': PROJECT,
            'region': 'europe-west1'
        },
        options={},
        poll_sleep=30)

    start = bash_operator.BashOperator(task_id='start',
                                       bash_command='echo "Start"')
    end = bash_operator.BashOperator(task_id='end', bash_command='echo "End"')

    start >> dataflow >> end
Example #2
    # Args required for the Dataflow job.
    job_args = {
        'input': INPUT_BUCKET_CSV,

        # TODO: Populate the models.Variable.get() with the variable name for BQ table
        'output': models.Variable.get('bq_output_table'),

        # TODO: Populate the models.Variable.get() with the variable name for input field names
        'fields': models.Variable.get('input_field_names'),
        'load_dt': DS_TAG
    }

    # Main Dataflow task that will process and load the input delimited file.
    # TODO: Specify the type of operator we need to call to invoke DataFlow
    dataflow_task = dataflow_operator.DataFlowPythonOperator(
        task_id="process-delimited-and-push",
        py_file=DATAFLOW_FILE,
        options=job_args)

    # Here we create two conditional tasks, one of which will be executed
    # based on whether the dataflow_task was a success or a failure.
    success_move_task = python_operator.PythonOperator(
        task_id='success-move-to-completion',
        python_callable=move_to_completion_bucket,
        # A success_tag is used to move
        # the input file to a success
        # prefixed folder.
        op_args=[COMPLETION_BUCKET, SUCCESS_TAG],
        provide_context=True,
        trigger_rule=TriggerRule.ALL_SUCCESS)

    failure_move_task = python_operator.PythonOperator(
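        # The rest of this call is cut off in the snippet; a minimal sketch
        # mirroring the success task above, assuming a FAILURE_TAG constant
        # and the ALL_FAILED trigger rule.
        task_id='failure-move-to-completion',
        python_callable=move_to_completion_bucket,
        op_args=[COMPLETION_BUCKET, FAILURE_TAG],
        provide_context=True,
        trigger_rule=TriggerRule.ALL_FAILED)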
Example #3
                default_args=DEFAULT_DAG_ARGS) as dag:
    # Args required for the Dataflow job.
    job_args = {
        'input':
        'gs://{{ dag_run.conf["bucket"] }}/{{ dag_run.conf["name"] }}',
        'output': 'compose-test-291802:lake.SALES_DATA',
        'runner': 'DataflowRunner',
        'project': 'compose-test-291802',
        'job_name': 'job-name-001',
        'temp_location': 'gs://sanch-test-bucket12/tmp/',
        'staging_location': 'gs://sanch-test-bucket12/stg',
        'load_dt': DS_TAG
    }

    # Main Dataflow task that will process and load the input delimited file.
    dataflow_task = dataflow_operator.DataFlowPythonOperator(
        task_id="dataflowstoragebq1", py_file=DATAFLOW_FILE, options=job_args)

    # Here we create two conditional tasks, one of which will be executed
    # based on whether the dataflow_task was a success or a failure.
    success_move_task = python_operator.PythonOperator(
        task_id='success-move-to-completion',
        python_callable=move_to_completion_bucket,
        # A success_tag is used to move
        # the input file to a success
        # prefixed folder.
        op_args=[COMPLETION_BUCKET, SUCCESS_TAG],
        provide_context=True,
        trigger_rule=TriggerRule.ALL_SUCCESS)

    failure_move_task = python_operator.PythonOperator(
        task_id='failure-move-to-completion',
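        # The remaining arguments are cut off in the snippet; a minimal sketch
        # mirroring the success task above, assuming a FAILURE_TAG constant
        # for the failure-prefixed folder.
        python_callable=move_to_completion_bucket,
        op_args=[COMPLETION_BUCKET, FAILURE_TAG],
        provide_context=True,
        trigger_rule=TriggerRule.ALL_FAILED)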
Example #4
    job_args = {
        'input': 'gs://{{ dag_run.conf["bucket"] }}/{{ dag_run.conf["name"]}}',
        'output_raw':
        'RAW_{{dag_run.conf["name"][:dag_run.conf["name"].rfind("/")]}}.{{"_".join(dag_run.conf["name"][:dag_run.conf["name"].rfind(".")].split("_")[2:])}}',  # builds the table name from the file name, dropping the ".csv" extension
        'output_err':
        'PRZ_{{dag_run.conf["name"][:dag_run.conf["name"].rfind("/")]}}.{{"_".join(dag_run.conf["name"][:dag_run.conf["name"].rfind(".")].split("_")[2:])}}_ERR',
        'output_prz':
        'PRZ_{{dag_run.conf["name"][:dag_run.conf["name"].rfind("/")]}}.{{"_".join(dag_run.conf["name"][:dag_run.conf["name"].rfind(".")].split("_")[2:])}}',
        'fields': g_fields,
        'load_dt': '{{ dag_run.conf["bqTimestamp"]}}',
        'op_dict': g_operations_dict
    }

    # Main Dataflow task
    TSK_dataflow_file_ingestion = dataflow_operator.DataFlowPythonOperator(
        task_id="tsk-dataflow-file-ingestion",
        py_file=DATAFLOW_FILE,
        options=job_args)

    # Upon Dataflow task success the TSK_move_into_arc_bucket starts
    TSK_move_into_arc_bucket = python_operator.PythonOperator(
        task_id='TSK_move_into_arc_bucket',
        python_callable=DPLF_move_into_arc_bucket,
        op_args=[g_output_bucket],
        provide_context=True,
        trigger_rule=TriggerRule.ALL_SUCCESS)

    # Upon Dataflow task failure the TSK_move_into_inv_bucket starts
    TSK_move_into_inv_bucket = python_operator.PythonOperator(
        task_id='TSK_move_into_inv_bucket',
        python_callable=DPLF_move_into_inv_bucket,
        op_args=[g_failed_bucket],
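        # The closing arguments are cut off in the snippet; completed here to
        # mirror TSK_move_into_arc_bucket above, but firing when the Dataflow
        # task fails.
        provide_context=True,
        trigger_rule=TriggerRule.ALL_FAILED)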
Example #5
def create_dag(env_variables):
    """Creates the Airflow directed acyclic graph.

  Args:
    env_variables: Dictionary of Airflow environment variables.

  Returns:
    driblet_dag: An instance of models.DAG.
  """
    driblet_dag = initialize_dag()

    # Clients setup.
    project_id = env_variables['project_id']
    bq_client = bigquery.Client(project=project_id)
    gcs_client = storage.Client(project=project_id)

    # TASK 1: Convert BigQuery CSV to TFRECORD.
    dag_dir = configuration.get('core', 'dags_folder')
    transformer_py = os.path.join(dag_dir, 'tasks/preprocess',
                                  'transformer.py')
    bq_to_tfrecord = dataflow_operator.DataFlowPythonOperator(
        task_id='bq-to-tfrecord',
        py_file=transformer_py,
        options={
            'project':
            project_id,
            'predict-data':
            '{}.{}.{}_{}'.format(project_id, env_variables['bq_dataset'],
                                 env_variables['bq_input_table'],
                                 datetime.datetime.now().strftime('%Y%m%d')),
            'data-source':
            'bigquery',
            'transform-dir':
            'gs://%s/transformer' % env_variables['bucket_name'],
            'output-dir':
            'gs://%s/input' % env_variables['bucket_name'],
            'mode':
            'predict'
        },
        dataflow_default_options={'project': project_id},
        dag=driblet_dag)

    # TASK 2: Make prediction from CSV in GCS.
    make_predictions = mlengine_operator.MLEngineBatchPredictionOperator(
        task_id='make-predictions',
        project_id=project_id,
        job_id='driblet_run_{}'.format(
            datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')),
        data_format='TF_RECORD',
        input_paths=['gs://%s/input/predict-*' % env_variables['bucket_name']],
        output_path='gs://%s/output' % env_variables['bucket_name'],
        region=env_variables['region'],
        model_name=env_variables['model_name'],
        version_name=env_variables['model_version'],
        gcp_conn_id='google_cloud_default',
        dag=driblet_dag)

    # TASK 3: Export predicted CSV from Cloud Storage to BigQuery.
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,  # Sets daily partitioned table.
        expiration_ms=env_variables['dataset_expiration'])
    gcs_to_bigquery = GCStoBQOperator(
        task_id='gcs-to-bigquery',
        bq_client=bq_client,
        gcs_client=gcs_client,
        job_config=job_config,
        dataset_id=env_variables['bq_dataset'],
        table_id=env_variables['bq_output_table'],
        gcs_bucket=env_variables['bucket_name'],
        gcs_location=env_variables['location'],
        exclude_prefix='errors_stats',  # Exclude files starting with name.
        dir_prefix='output',
        dag=driblet_dag)

    # TASK 4: Delete files in Cloud Storage bucket.
    gcs_delete_blob = GCSDeleteBlobOperator(
        task_id='gcs-delete-blob',
        client=gcs_client,
        gcs_bucket=env_variables['bucket_name'],
        prefixes=['input', 'output'],
        dag=driblet_dag)

    make_predictions.set_upstream(bq_to_tfrecord)
    make_predictions.set_downstream(gcs_to_bigquery)
    gcs_delete_blob.set_upstream(gcs_to_bigquery)

    return driblet_dag
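
# Minimal usage sketch (an assumption, not part of the original example):
# Airflow only picks up DAG objects defined at module level, so the factory
# above would typically be invoked in the DAG file itself. The Variable name
# 'driblet_env' is hypothetical.
env_variables = models.Variable.get('driblet_env', deserialize_json=True)
dag = create_dag(env_variables)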
Example #6
# Setting schedule_interval to None as this DAG is externally triggered by a Cloud Function.
with models.DAG(dag_id='mssql_gcs_dataflow_bigquery_dag_2',
                description='A DAG triggered by an external Cloud Function',
                schedule_interval=None,
                default_args=DEFAULT_DAG_ARGS) as dag:
    # Args required for the Dataflow job.
    job_args = {
        'input':
        'gs://{{ dag_run.conf["bucket"] }}/{{ dag_run.conf["name"] }}',
        'output': models.Variable.get('bq_output_table'),
        'load_dt': DS_TAG
    }

    # Main Dataflow task that will process and load the input delimited file.
    dataflow_task = dataflow_operator.DataFlowPythonOperator(
        task_id="process-json-to-dataflow",
        py_file=DATAFLOW_FILE,
        options=job_args)

    # Here we create two conditional tasks, one of which will be executed
    # based on whether the export_sales_orders was a success or a failure.
    success_move_task = email_operator.EmailOperator(
        task_id='success',
        trigger_rule=TriggerRule.ALL_SUCCESS,
        to=models.Variable.get('email'),
        subject=
        'mssql_gcs_dataflow_bigquery_dag_2 Job Succeeded: start_date {{ ds }}',
        html_content="HTML CONTENT")

    failure_move_task = email_operator.EmailOperator(
        task_id='failure',
        trigger_rule=TriggerRule.ALL_FAILED,
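        # The remaining arguments are cut off in the snippet; a minimal sketch
        # mirroring the success email above, with an assumed failure subject.
        to=models.Variable.get('email'),
        subject='mssql_gcs_dataflow_bigquery_dag_2 Job Failed: start_date {{ ds }}',
        html_content="HTML CONTENT")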
Example #7
    return df.values[-1][0], df.values[-1][1]


def tell_slack(context):
    o = slack_webhook_operator.SlackWebhookOperator(task_id="tell_slack", http_conn_id='slack_default',
                                                    message="Number one page today is %s (%s hits)" % (
                                                        find_number_one()))
    return o.execute(context)


with models.DAG(
        'ga_daily_reporter',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    benchmark_tally = dataflow_operator.DataFlowPythonOperator(task_id='benchmark_tally',
                                                               py_file='/home/airflow/gcs/dags/pipelines/benchmark_tally.py')
    combine_tally = python_operator.PythonOperator(
        task_id='combine_tally',
        python_callable=combine_tally,
        on_success_callback=tell_slack)
    # on_success_callback is a hack to delay generating the slack message
    # https://stackoverflow.com/questions/52054427/how-to-integrate-apache-airflow-with-slack
    tell_slack = slack_webhook_operator.SlackWebhookOperator(task_id="tell_slack", http_conn_id='slack_default',
                                                             message="A new report is out: "
                                                                     "https://%s/data/tally_69211100_20190425.csv" % (
                                                                         models.Variable.get('AIRFLOW_BUCKET',
                                                                                             'us-east1-dta-airflow-b3415db4-bucket')))

    generate_graph = python_operator.PythonOperator(
        task_id='generate_graph',
        python_callable=generate_graph)
default_args = {
    # Earlier entries of this default_args dict are cut off in the snippet.
    'email_on_retry': False
}

dag = DAG(
    'twitter_search',
    default_args=default_args,
    description='Load data from GCS to BQ Serving Layer',
    schedule_interval='@daily',
    dagrun_timeout=timedelta(minutes=30)
)

load_raw_data = dataflow_operator.DataFlowPythonOperator(
    task_id='load_raw_data',
    dag=dag,
    py_file='/home/airflow/gcs/dags/dataflow/twitter-google-dataflow.py',
    #py_file='dataflow/twitter-google-dataflow.py',
    job_name='twitter-google-dataflow-{{ ds }}',
    dataflow_default_options={
        'project': os.environ.get('GCP_PROJECT'),
        'region': 'europe-west1',
        'zone': 'europe-west6-a',
        'runner': 'DataflowRunner'
    },
    options={
        'job_date': '{{ ds }}',
        'twitter_bucket': os.environ.get('TWITTER_BUCKET'),
        'dataflow_bucket': os.environ.get('DATAFLOW_BUCKET')
    }
)

#delete_sl_partition = bigquery_operator.BigQueryOperator( # TODO change to bq command line
#    task_id='delete_sl_partition',
#    dag=dag,
#    sql='''DELETE FROM dataops_demo_sl_dev.t_twitter_google WHERE c_created = '{{ ds }}' ''',
#    use_legacy_sql=False
#)

delete_sl_partition = bash_operator.BashOperator(
    task_id='delete_sl_partition',
    dag=dag,
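    # The bash_command is cut off in the snippet; a sketch (assumption) of the
    # bq CLI equivalent of the commented-out BigQueryOperator above.
    bash_command='bq query --use_legacy_sql=false '
                 '\'DELETE FROM dataops_demo_sl_dev.t_twitter_google '
                 'WHERE c_created = "{{ ds }}"\'')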
    job_args = {
        'input':
        'gs://{{ dag_run.conf["bucket"] }}/{{ dag_run.conf["name"] }}',
        'output':
        'seu-projeto-nome-no-google:dataNavigationDataSet.RAW_DATA_NAVIGATION',
        'runner': 'DataflowRunner',
        'project': 'seu-projeto-nome-no-google',
        'job_name': 'job-name-001',
        'temp_location':
        'gs://seu-projeto-nome-no-google-bucket-navigation/tmp/',
        'load_dt': DS_TAG
    }

    # Main Dataflow task that will process and load the input delimited file.
    dataflow_task = dataflow_operator.DataFlowPythonOperator(
        task_id="dataflow-b2w-raw-nav-data-001",
        py_file=DATAFLOW_FILE,
        options=job_args)

    # Here we create two conditional tasks, one of which will be executed
    # based on whether the dataflow_task was a success or a failure.
    success_move_task = python_operator.PythonOperator(
        task_id='success-move-to-completion',
        python_callable=move_to_completion_bucket,
        # A success_tag is used to move
        # the input file to a success
        # prefixed folder.
        op_args=[COMPLETION_BUCKET, SUCCESS_TAG],
        provide_context=True,
        trigger_rule=TriggerRule.ALL_SUCCESS)

    failure_move_task = python_operator.PythonOperator(
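        # The remainder of this call is cut off in the snippet; a minimal
        # sketch mirroring the success task above, assuming a FAILURE_TAG
        # constant and the ALL_FAILED trigger rule.
        task_id='failure-move-to-completion',
        python_callable=move_to_completion_bucket,
        op_args=[COMPLETION_BUCKET, FAILURE_TAG],
        provide_context=True,
        trigger_rule=TriggerRule.ALL_FAILED)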