from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator


def createDataflowOprTask(task_name, module, stage, startShard, endShard,
                          startDate, endDate, changesetType, bigTableName,
                          dagObj, df_max_worker):
    # Builds one Dataflow task; df_startup_worker_node and the sst_* date
    # values below are module-level settings defined elsewhere in this DAG file.
    return DataFlowJavaOperator(
        task_id=task_name,
        jar='{{ var.value.sst_jar_base_incremental }}{{ var.value.sst_jar_name }}',
        options={
            'project': '{{ var.value.gcp_project_id }}',
            'zone': '{{ var.value.gcp_df_zone }}',
            'stagingLocation': '{{ var.value.staging_location }}',
            'autoscalingAlgorithm': 'THROUGHPUT_BASED',
            'workerMachineType': '{{ var.value.sst_gcp_df_machine_type }}',
            'bigtableProjectId': '{{ var.value.sst_gcp_bt_project_id }}',
            'bigtableInstanceId': '{{ var.value.sst_gcp_bt_instance_id }}',
            'bigtableTableId': bigTableName,
            'numWorkers': df_startup_worker_node,
            'defaultWorkerLogLevel': 'ERROR',
            'workerLogLevelOverrides':
                '{"com.umusic.gcp.sst.speedlayer.data":"ERROR"}',
            'module': module,
            'stage': stage,
            'startShard': startShard,
            'endShard': endShard,
            'startDate': startDate,
            'endDate': endDate,
            'loadDate': sst_job_load_date,
            'productDate': sst_job_product_date,
            'changesetStartDate': sst_chngst_start_date,
            'changesetEndDate': sst_chngst_end_date,
            'changeset': changesetType,
            'maxNumWorkers': df_max_worker
        },
        dag=dagObj)
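
The call sites are not part of the snippet above. A minimal usage sketch, assuming an existing DAG object named dag and purely hypothetical module, shard, date, and table values:

# Hypothetical invocation of createDataflowOprTask; every literal here is illustrative only.
sst_incremental = createDataflowOprTask(
    task_name='sst_incremental_stage1',
    module='speedlayer',
    stage='incremental',
    startShard=0,
    endShard=9,
    startDate='{{ ds }}',
    endDate='{{ ds }}',
    changesetType='FULL',
    bigTableName='sst_speedlayer_data',
    dagObj=dag,
    df_max_worker=10)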

Example No. 2

    def setUp(self):
        self.dataflow = DataFlowJavaOperator(
            task_id=TASK_ID,
            jar=JAR_FILE,
            job_class=JOB_CLASS,
            dataflow_default_options=DEFAULT_OPTIONS_JAVA,
            options=ADDITIONAL_OPTIONS,
            poll_sleep=POLL_SLEEP)
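
A sketch of how a test might exercise that fixture, assuming the same module-level constants used in setUp (TASK_ID, JAR_FILE, JOB_CLASS, POLL_SLEEP); the assertions are illustrative, not taken from the original suite:

    def test_init_stores_arguments(self):
        # Illustrative checks that the operator keeps the values passed in setUp().
        self.assertEqual(self.dataflow.task_id, TASK_ID)
        self.assertEqual(self.dataflow.jar, JAR_FILE)
        self.assertEqual(self.dataflow.job_class, JOB_CLASS)
        self.assertEqual(self.dataflow.poll_sleep, POLL_SLEEP)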

Example No. 3

yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

default_args = {
    'dataflow_default_options': {
        'project': project,
        'zone': zone,
        'region': region,
        'stagingLocation': dataflow_staging_bucket
    }
}

with models.DAG('prod_word_count',
                schedule_interval=None,
                default_args=default_args) as dag:
    dataflow_execution = DataFlowJavaOperator(
        task_id='wordcount-run',
        jar=dataflow_jar_location,
        start_date=yesterday,
        options={
            'autoscalingAlgorithm': 'THROUGHPUT_BASED',
            'maxNumWorkers': '3',
            'inputFile': input_bucket + '/input.txt',
            'output': output_bucket + '/' + output_prefix
        })
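
At execute time the operator layers the task-level options over the dataflow_default_options supplied via default_args, so the effective options for wordcount-run are roughly the following (a sketch reusing the names defined above):

# Approximate effective Dataflow options: the project/zone/region/stagingLocation
# defaults first, then the task-level overrides from the operator call above.
effective_options = dict(default_args['dataflow_default_options'])
effective_options.update({
    'autoscalingAlgorithm': 'THROUGHPUT_BASED',
    'maxNumWorkers': '3',
    'inputFile': input_bucket + '/input.txt',
    'output': output_bucket + '/' + output_prefix,
})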

Example No. 4

DEFAULT_ARGS = {
    'dataflow_default_options': {
        'project': PROJECT,
        'region': REGION,
        'stagingLocation': DATAFLOW_STAGING_BUCKET
    }
}

with models.DAG('wordcount_dag',
                start_date=YESTERDAY,
                schedule_interval=None,
                default_args=DEFAULT_ARGS) as dag:

    DATAFLOW_EXECUTION = DataFlowJavaOperator(
        task_id='wordcount-run',
        jar=DATAFLOW_JAR_LOCATION,
        options={
            'autoscalingAlgorithm': 'THROUGHPUT_BASED',
            'maxNumWorkers': '3',
            'inputFile': f'{INPUT_BUCKET}/input.txt',
            'output': f'{OUTPUT_BUCKET}/{OUTPUT_PREFIX}'
        })

    DOWNLOAD_EXPECTED = GoogleCloudStorageDownloadOperator(
        task_id='download_ref_string',
        bucket=REF_BUCKET,
        object='ref.txt',
        store_to_xcom_key='ref_str',
    )

    DOWNLOAD_RESULT_ONE = GoogleCloudStorageDownloadOperator(
        task_id=DOWNLOAD_TASK_PREFIX + '_1',
        bucket=OUTPUT_BUCKET_NAME,

Example No. 5

run_entl_stat_hist = DataFlowJavaOperator(
    dag=dag,
    task_id='test_df_job',
    gcp_conn_id=bq_connection_id,
    jar="gs://ebcidc-to-bq-testing-us-central/ebcdic/jar/ebcidctobq-1.0.jar",
    options={
        'binaryFile': "gs://ebcidc-to-bq-testing-us-central/ebcdic/ecdic_files/WRT/wrt_csv_out_eb",
        'copyBook': "gs://ebcidc-to-bq-testing-us-central/ebcdic/copybooks/diff_cop_book/WRT.cob",
        'parserClass': "com.dm.parser.ebcdic.FixedWidthFileParser",
        'outSchemaJsonPath': "gs://ebcidc-to-bq-testing-us-central/ebcdic/json/bq_schema/wrt*.json",
        'dqCheckJson': "gs://ebcidc-to-bq-testing-us-central/ebcdic/json/dq/wrt_dq_config.json",
        'dataTransformationJson': "gs://ebcidc-to-bq-testing-us-central/ebcdic/json/datatrans/wrt_transform.json",
        'outputTable': "dmgcp-ingestion-poc:airflow_test.test_df_job_7",
        'outputTableWriteMethod': "write_truncate",
        'errorTable': "dmgcp-ingestion-poc:airflow_test.test_df_job_7",
        'errorTableWriteMethod': "write_truncate",
        'auditTable': "dmgcp-ingestion-poc:transient.test_ebcdic_audit",
        'batchId': "202004281500",
        'splitSize': "500 MB",
        'project': "dmgcp-ingestion-poc",
        'tempLocation': "gs://ebcidc-to-bq-testing-us-central/ebcdic/temp/",
        'region': "us-central1",
        'numWorkers': "2",
        'maxNumWorkers': "20",
        'workerMachineType': "n1-standard-4",
        'serviceAccount': "*****@*****.**",
        'subnetwork': "https://www.googleapis.com/compute/v1/projects/dm-network-host-project/regions/us-central1/subnetworks/us-central1-network-foundation"
    })

run_entl_stat_hist

Example No. 6

default_args = {
    'dataflow_default_options': {
        'zone': 'asia-south1-a',
        'stagingLocation': 'gs://sample-fixed',
    }
}

dag = DAG('my_sample', description='To test the airflow using various operators',
          schedule_interval='@daily',
          start_date=datetime(2019, 3, 13), catchup=False, default_args=default_args)
task1 = BashOperator(task_id="print_date_task", bash_command="date", dag=dag)

task2 = PythonOperator(task_id='python_hello_task', python_callable=hello_func, dag=dag)

task3 = DataFlowJavaOperator(
    task_id='dataflow_invoke_task',
    gcp_conn_id='google_cloud_default',
    jar='gs://dataflow-java-demo/jars/Main.jar',
    options={
        'autoscalingAlgorithm': 'BASIC',
        'maxNumWorkers': '50',
        'start': '{{ds}}',
        'partitionType': 'DAY'
    },
    dag=dag)

task4 = MyOperator(my_operator_param='This is my operator.',
                   task_id='my_toperator_task', dag=dag)
task1 >> task2
task2 >> task3
task3 >> task4
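
The DAG above references hello_func and a custom MyOperator that are not included in the excerpt; a minimal sketch of what they might look like, purely as an assumption:

from airflow.models import BaseOperator


def hello_func():
    # Hypothetical callable backing python_hello_task.
    print('hello from python_hello_task')


class MyOperator(BaseOperator):
    # Hypothetical stand-in for the custom operator used by task4.
    def __init__(self, my_operator_param, *args, **kwargs):
        super(MyOperator, self).__init__(*args, **kwargs)
        self.my_operator_param = my_operator_param

    def execute(self, context):
        print(self.my_operator_param)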

Example No. 7

dag_args = {
    'start_date': datetime(2019, 9, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'dataflow_default_options': {
        'project': 'gcpProject',
        'zone': 'gpcNetworkZone',
        'stagingLocation': 'gs://dataPipelineFromElasticsearchToBigquery/airflowStaging/'
    }
}

dag = DAG('dataPipelineFromElasticsearchToBigquery-dag',
          default_args=dag_args,
          catchup=False)

start = DummyOperator(task_id='start', dag=dag)

task = DataFlowJavaOperator(
    task_id='daily-dataPipelineFromElasticsearchToBigquery-task',
    jar='gs://dataPipelineFromElasticsearchToBigquery/lib/dataPipelineFromElasticsearchToBigquery.jar',
    options=all_options,
    dag=dag)

start >> task
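
all_options is not shown in the excerpt; as an assumption, it is a plain dict of Dataflow options like the ones used in the other examples, defined elsewhere in the DAG file, for instance:

# Hypothetical contents of all_options; the real dict lives elsewhere in this DAG file.
all_options = {
    'autoscalingAlgorithm': 'THROUGHPUT_BASED',
    'maxNumWorkers': '5',
    'workerMachineType': 'n1-standard-2',
}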

Example No. 8

        "project": "diego-palma",
        "stagingLocation": "gs://prueba-option/temp/dataflow/staging/",
        "tempLocation": "gs://prueba-option/temp/",
    },
}

with airflow.DAG(
        "prueba_tecnica",
        catchup=False,
        default_args=default_args,
        template_searchpath=["/home/airflow/gcs/dags/"],
        schedule_interval=datetime.timedelta(days=1),
) as dag:

    dataflow_op = DataFlowJavaOperator(
        task_id="dataflow_task",
        jar="gs://prueba-option/dataflow-etl/prueba-tecnica-1.jar",
    )

    vuelos_to_bq = GoogleCloudStorageToBigQueryOperator(
        task_id="gcs_vuelos_to_bq",
        bucket="prueba-option",
        source_objects=["output/vuelos/*"],
        destination_project_dataset_table="diego-palma.prueba_option.vuelos",
        source_format="NEWLINE_DELIMITED_JSON",
        write_disposition="WRITE_TRUNCATE",
        dag=dag,
    )

    pasajeros_to_bq = GoogleCloudStorageToBigQueryOperator(
        task_id="gcs_pasajeros_to_bq",
        bucket="prueba-option",

Example No. 9

from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator
from airflow.operators.bash_operator import BashOperator
from airflow import DAG
from datetime import datetime, timedelta

dataflow_dag = DAG(dag_id="dataflow_pipeline",
                   start_date=datetime(2017, 2, 2),
                   schedule_interval=timedelta(seconds=15),
                   max_active_runs=1,
                   catchup=True)

print_path_task = BashOperator(dag=dataflow_dag,
                               bash_command="pwd",
                               task_id="test_upstream_task_pwd")

jar_task = DataFlowJavaOperator(dag=dataflow_dag,
                                jar="/home/airflow/gcs/dags/"
                                    "jar/dataflow_pipeline-bundled-1.0.jar",
                                options={
                                    "project": "hybrid-elysium-118418",
                                    "stagingLocation": "gs://hybrid-elysium-118418/dataflow/"
                                },
                                task_id="dataflow_pipeline")

print_path_task.set_downstream(jar_task)
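
The explicit set_downstream call is equivalent to the bitshift dependency syntax used in the other examples:

print_path_task >> jar_task  # same dependency as print_path_task.set_downstream(jar_task)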

Example No. 10

from airflow import models
from airflow.operators import python_operator
from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator

from google.cloud import bigquery
import datetime

yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

default_dag_args = {'start_date': yesterday}

with models.DAG('composer-dataflow-java',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    dataflow_java_task = DataFlowJavaOperator(
        jar='/home/airflow/gcs/data/google-cloud-teleport-java-0.1-SNAPSHOT.jar',
        task_id='start-pipeline',
        options={
            'project': '<PROJECT_ID>',
            'tempLocation': '<GCS_BUCKET>',
            'inputTopic': '<INPUT_TOPIC>',
            'outputTableSpec': '<output_table>'
        })

    dataflow_java_task
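
In practice the angle-bracket placeholders would usually come from Airflow Variables rather than being hard-coded; a sketch of that variant of the same task, assuming Variables named gcp_project, dataflow_temp_location, input_topic, and output_table_spec exist:

    # Hypothetical variant of start-pipeline that pulls the placeholders from Airflow Variables.
    dataflow_java_task = DataFlowJavaOperator(
        jar='/home/airflow/gcs/data/google-cloud-teleport-java-0.1-SNAPSHOT.jar',
        task_id='start-pipeline',
        options={
            'project': '{{ var.value.gcp_project }}',
            'tempLocation': '{{ var.value.dataflow_temp_location }}',
            'inputTopic': '{{ var.value.input_topic }}',
            'outputTableSpec': '{{ var.value.output_table_spec }}'
        })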