def test_render_template(self):
    # Given
    operator = SparkSubmitOperator(task_id='spark_submit_job', dag=self.dag, **self._config)
    ti = TaskInstance(operator, DEFAULT_DATE)

    # When
    ti.render_templates()

    # Then
    expected_application_args = [
        '-f', 'foo',
        '--bar', 'bar',
        '--start', (DEFAULT_DATE - timedelta(days=1)).strftime("%Y-%m-%d"),
        '--end', DEFAULT_DATE.strftime("%Y-%m-%d"),
        '--with-spaces', 'args should keep embdedded spaces',
    ]
    expected_name = 'spark_submit_job'
    self.assertListEqual(expected_application_args, getattr(operator, '_application_args'))
    self.assertEqual(expected_name, getattr(operator, '_name'))
def cassandra_to_avro():
    # @task
    def load_from_cassandra() -> List[Tuple[str, str]]:
        conn: Connection = Connection.get_connection_from_secrets('local_cassandra')
        auth_provider = PlainTextAuthProvider(username=conn.login, password=conn.password)
        cluster: Cluster = Cluster([conn.host], conn.port, auth_provider=auth_provider)
        session: Session = cluster.connect(conn.schema)
        rows: ResultSet = session.execute("SELECT title, description FROM videos")
        result = list(map(lambda row: (row[0], row[1]), rows))
        print(result)
        return result

    # @task
    def write_to_hdfs(rows: List[Tuple[str, str]]):
        conn: Connection = Connection.get_connection_from_secrets('local_hdfs')
        uri = conn.get_uri()
        # Strip any embedded credentials from the connection URI before handing it to the HDFS client.
        pat = re.compile(r"http://(\w+(:\w+)?)?@")
        print(conn.get_uri())
        uri = pat.sub("http://", uri)
        print(uri)
        print(conn.login)
        client = InsecureClient(uri, user=conn.login)
        sch = avro.schema.make_avsc_object({
            'type': 'record',
            'name': 'Video',
            'fields': [
                {'type': {'type': 'string', 'avro.java.string': 'String'}, 'name': 'title'},
                {'type': ["null", {'type': 'string', 'avro.java.string': 'String'}], 'name': 'description'},
            ]
        })
        local_file_name = 'videos.avro'
        writer = DataFileWriter(open(local_file_name, "wb"), DatumWriter(), sch)
        for row in rows:
            print(row)
            writer.append({"title": row[0], "description": row[1]})
        writer.close()
        client.upload('/tmp/videos.avro', local_file_name)

    load_and_save_using_spark = SparkSubmitOperator(
        task_id="cassandra_to_avro_spark",
        conn_id="spark_local",
        name="cassandra_to_avro_spark",
        application="dags/cassandra_to_avro_spark.py",
        packages="org.apache.spark:spark-avro_2.12:3.1.1,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0",
    )

    # ctx = get_current_context()
    table_sensor = CassandraTableSensor(
        task_id="cassandra_table_sensor",
        cassandra_conn_id='local_cassandra',
        table="killrvideo.videos",
    )

    # load = load_from_cassandra()
    # write_to_hdfs(load)
    table_sensor >> load_and_save_using_spark
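# A minimal sketch of what the Spark application referenced above
# ('dags/cassandra_to_avro_spark.py') might contain; the real script is not shown
# in this snippet. It assumes the two --packages passed to SparkSubmitOperator
# (the Cassandra connector for reading and spark-avro for writing); the Cassandra
# host and the output path are illustrative assumptions.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("cassandra_to_avro_spark")
    .config("spark.cassandra.connection.host", "127.0.0.1")  # assumed host
    .getOrCreate()
)

videos = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(table="videos", keyspace="killrvideo")
    .load()
    .select("title", "description")
)

videos.write.format("avro").mode("overwrite").save("hdfs:///tmp/videos_avro")  # assumed output path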
def covid_per_popgroup_subdag(parent_dag_id, child_dag_id, args):
    with DAG(
        dag_id=f'{parent_dag_id}.{child_dag_id}',
        default_args=args,
        # start_date=datetime(2021, 5, 2),  # days_ago(2), datetime.datetime.now()
        # schedule_interval='@once',
        # tags=['covid'],
    ) as dag:
        last_date_popgroup_task = get_last_date_popgroup(task_id="last_date_popgroup_task")

        current_dir = os.path.dirname(os.path.abspath(__file__))
        root_dir = os.path.dirname(current_dir)

        download_recent_cdc_task = SparkSubmitOperator(
            task_id="download_recent_cdc_task",
            conn_id="spark_default",
            application=os.path.join(root_dir, "python", "stage_recent_cdc.py"),
            application_args=[
                "--apptoken", Variable.get("socrata_apptoken"),
                "--last_date",
                "{{ ti.xcom_pull(task_ids='last_date_popgroup_task', key='last_cdc_date') }}",
            ],
        )

        insert_covid_pergroup_task = PostgresOperator(
            task_id="insert_covid_pergroup_task",
            postgres_conn_id="postgres_default",
            sql="""
                INSERT INTO covid_per_popgroup(cdc_case_earliest_dt, sex_id, age_group_id, race_ethnicity_id, count)
                SELECT cdc_case_earliest_dt, sex_id, age_group_id, race_ethnicity_id, count
                FROM recent_cdc AS n
                JOIN dim_age_group AS a ON a.age_group = n.age_group
                JOIN dim_sex AS s ON s.sex = n.sex
                JOIN dim_race_ethnicity AS e ON e.race = n.race_ethnicity_combined;
            """,
        )

        last_date_popgroup_task >> download_recent_cdc_task >> insert_covid_pergroup_task

    return dag
""" from airflow.models import DAG from airflow.providers.apache.spark.operators.spark_jdbc import SparkJDBCOperator from airflow.providers.apache.spark.operators.spark_sql import SparkSqlOperator from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator from airflow.utils.dates import days_ago args = {'owner': 'Airflow', 'start_date': days_ago(2)} with DAG(dag_id='example_spark_operator', default_args=args, schedule_interval=None, tags=['example']) as dag: # [START howto_operator_spark_submit] submit_job = SparkSubmitOperator( application="${SPARK_HOME}/examples/src/main/python/pi.py", task_id="submit_job") # [END howto_operator_spark_submit] # [START howto_operator_spark_jdbc] jdbc_to_spark_job = SparkJDBCOperator( cmd_type='jdbc_to_spark', jdbc_table="foo", spark_conf={}, spark_jars="${SPARK_HOME}/jars/postgresql-42.2.12.jar", jdbc_driver="org.postgresql.Driver", metastore_table="bar", save_mode="overwrite", save_format="JSON", task_id="jdbc_to_spark_job")
def test_execute(self):
    # Given / When
    conn_id = 'spark_default'
    operator = SparkSubmitOperator(
        task_id='spark_submit_job', spark_binary="sparky", dag=self.dag, **self._config
    )

    # Then expected results
    expected_dict = {
        'conf': {'parquet.compression': 'SNAPPY'},
        'files': 'hive-site.xml',
        'py_files': 'sample_library.py',
        'archives': 'sample_archive.zip#SAMPLE',
        'driver_class_path': 'parquet.jar',
        'jars': 'parquet.jar',
        'packages': 'com.databricks:spark-avro_2.11:3.2.0',
        'exclude_packages': 'org.bad.dependency:1.0.0',
        'repositories': 'http://myrepo.org',
        'total_executor_cores': 4,
        'executor_cores': 4,
        'executor_memory': '22g',
        'keytab': 'privileged_user.keytab',
        'principal': 'user/[email protected]',
        'proxy_user': '******',
        'name': '{{ task_instance.task_id }}',
        'num_executors': 10,
        'status_poll_interval': 30,
        'verbose': True,
        'application': 'test_application.py',
        'driver_memory': '3g',
        'java_class': 'com.foo.bar.AppMain',
        'application_args': [
            '-f', 'foo',
            '--bar', 'bar',
            '--start', '{{ macros.ds_add(ds, -1)}}',
            '--end', '{{ ds }}',
            '--with-spaces', 'args should keep embdedded spaces',
        ],
        'spark_binary': 'sparky',
    }

    self.assertEqual(conn_id, operator._conn_id)
    self.assertEqual(expected_dict['application'], operator._application)
    self.assertEqual(expected_dict['conf'], operator._conf)
    self.assertEqual(expected_dict['files'], operator._files)
    self.assertEqual(expected_dict['py_files'], operator._py_files)
    self.assertEqual(expected_dict['archives'], operator._archives)
    self.assertEqual(expected_dict['driver_class_path'], operator._driver_class_path)
    self.assertEqual(expected_dict['jars'], operator._jars)
    self.assertEqual(expected_dict['packages'], operator._packages)
    self.assertEqual(expected_dict['exclude_packages'], operator._exclude_packages)
    self.assertEqual(expected_dict['repositories'], operator._repositories)
    self.assertEqual(expected_dict['total_executor_cores'], operator._total_executor_cores)
    self.assertEqual(expected_dict['executor_cores'], operator._executor_cores)
    self.assertEqual(expected_dict['executor_memory'], operator._executor_memory)
    self.assertEqual(expected_dict['keytab'], operator._keytab)
    self.assertEqual(expected_dict['principal'], operator._principal)
    self.assertEqual(expected_dict['proxy_user'], operator._proxy_user)
    self.assertEqual(expected_dict['name'], operator._name)
    self.assertEqual(expected_dict['num_executors'], operator._num_executors)
    self.assertEqual(expected_dict['status_poll_interval'], operator._status_poll_interval)
    self.assertEqual(expected_dict['verbose'], operator._verbose)
    self.assertEqual(expected_dict['java_class'], operator._java_class)
    self.assertEqual(expected_dict['driver_memory'], operator._driver_memory)
    self.assertEqual(expected_dict['application_args'], operator._application_args)
    self.assertEqual(expected_dict['spark_binary'], operator._spark_binary)
    ) COMMENT 'Main Table'
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    TBLPROPERTIES ("skip.header.line.count"="1");
    """)

populate_hive_table = HiveOperator(
    task_id='populate_hive_table',
    hive_cli_conn_id='hive_conn',
    hql="""
        LOAD DATA INPATH '/covidData/owid-covid-data.csv' INTO TABLE cov_data
    """)

processing = SparkSubmitOperator(
    task_id='processing',
    application="/opt/airflow/dags/scripts/spark_processing.py",
    conn_id='spark_conn',
    verbose=True)

copy_to_files = BashOperator(
    task_id='copy_to_files',
    bash_command="""
        hdfs dfs -get -f /IndiaCOVID /opt/airflow/dags/files
    """)

task_uploads3 = PythonOperator(
    task_id='task_uploads3',
    python_callable=uploads3,
    op_kwargs={
        'filename': '/opt/airflow/dags/files/IndiaCOVID',
        'key': 'f',
        'bucket_name': 'c19-backups-airflow'
        nzd DOUBLE,
        gbp DOUBLE,
        jpy DOUBLE,
        cad DOUBLE
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
    """
)

forex_processing = SparkSubmitOperator(
    task_id='forex_processing',
    conn_id='spark_conn',
    application="/User/arunraja/airflow/dags/scripts/forex_processing.py",
    verbose=False,
    executor_cores=2,
    num_executors=2,
    executor_memory='256M',
    driver_memory='1G'
)

sending_email_notification = EmailOperator(
    task_id="sending_email",
    to="*****@*****.**",
    subject="forex_data_pipeline",
    html_content="""
        <h3>forex_data_pipeline succeeded</h3>
    """
)
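# A minimal sketch of what a processing script like 'forex_processing.py' above
# might do, assuming the rates land in HDFS as JSON and the Hive table created
# above is named 'forex_rates' (the input path and table name are assumptions;
# the real script is not shown in this snippet):
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("forex_processing")
    .enableHiveSupport()
    .getOrCreate()
)

rates = spark.read.json("hdfs://namenode:9000/forex/forex_rates.json")  # assumed input path
rates.select("eur", "usd", "nzd", "gbp", "jpy", "cad") \
    .write.mode("append") \
    .insertInto("forex_rates")  # assumed table name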
"applications.sink.prometheusServlet.path": "/metrics/applications/prometheus", "spark.kubernetes.executor.label.metrics-exposed": "true", "spark.kubernetes.driver.label.metrics-exposed": "true" } with DAG(dag_id="ddt-ingestion", schedule_interval="@hourly", default_args=default_args, catchup=False) as dag: stage_1 = SparkSubmitOperator( task_id="stage1", application="/opt/airflow/dags/repo/from_kafka_to_minio_streaming.py", conn_id="k8s_cluster", name="stage1", packages= "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1,org.apache.kafka:kafka-clients:2.7.0,org.apache.hadoop:hadoop-aws:3.1.2,org.elasticsearch:elasticsearch-spark-30_2.12:7.13.1", conf=spark_conf, verbose=False) stage_2 = SparkSubmitOperator( task_id="stage2", application="/opt/airflow/dags/repo/parquet_enrichment.py", conn_id="k8s_cluster", name="stage2", packages= "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1,org.apache.kafka:kafka-clients:2.7.0,org.apache.hadoop:hadoop-aws:3.1.2,org.elasticsearch:elasticsearch-spark-30_2.12:7.13.1", conf=spark_conf, verbose=False) stage_1 >> stage_2
from airflow.utils.dates import days_ago
from airflow.utils.trigger_rule import TriggerRule

dag = DAG(
    dag_id='datalake_dag',
    start_date=days_ago(2),
    schedule_interval='@once'
)

ingest_country_info_task = SparkSubmitOperator(
    application="/jars/spark-task-1.0-SNAPSHOT-all.jar",
    task_id="ingest_country_info_task",
    conn_id="spark_cluster",
    dag=dag,
    java_class="com.genestack.tasks.DeltaLakeFileIngestorKt",
    conf={"spark.standalone.submit.waitAppCompletion": "true"},
    application_args=[
        "s3a://genestack-spark-test/country-info.json",  # input file
        "json",  # input file format
        "country-info"  # output file
    ]
)

ingest_covid_deaths_task = SparkSubmitOperator(
    application="/jars/spark-task-1.0-SNAPSHOT-all.jar",
    task_id="ingest_covid_deaths_task",
    conn_id="spark_cluster",
    dag=dag,
    java_class="com.genestack.tasks.DeltaLakeFileIngestorKt",
    conf={"spark.standalone.submit.waitAppCompletion": "true"},
    application_args=[
    schedule_interval='0 0 * * *',
    start_date=days_ago(2),
    dagrun_timeout=timedelta(minutes=60),
    tags=['spark'],
    params={"param": "value"},
)

run_this_last = DummyOperator(
    task_id='run_this_last',
    dag=dag,
)

# TODO - Here we are running a pi.py script. Change the path to your location.
flight_search_ingestion = SparkSubmitOperator(
    task_id='flight_search_ingestion',
    conn_id='spark_default',
    application='/home/ubuntu/anaconda3/envs/airflow/lib/python3.6/site-packages/pyspark/examples/src/main/python/pi.py',
    total_executor_cores=4,
    executor_cores=2,
    executor_memory='1g',
    driver_memory='1g',
    name='flight_search_ingestion',
    execution_timeout=timedelta(seconds=100000),
    dag=dag)

run_this_last >> flight_search_ingestion

if __name__ == "__main__":
    dag.cli()
        eur DOUBLE,
        usd DOUBLE,
        nzd DOUBLE,
        gbp DOUBLE,
        jpy DOUBLE,
        cad DOUBLE
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
    """)

# Spark operator
forex_processing = SparkSubmitOperator(
    task_id="forex_processing",
    application="/home/enes/airflow2/dags/scripts/forex_processing.py",
    conn_id="spark_conn",
    verbose=False)

# Email operator
send_email_notification = EmailOperator(
    task_id="send_email_notification",
    to="*****@*****.**",
    subject="forex_data_pipeline",
    html_content="<h3>forex_data_pipeline</h3>")

# Slack notification operator
send_slack_notification = SlackWebhookOperator(
    task_id="send_slack_notification",
    http_conn_id="slack_conn",
    message=_get_message(),
    python_callable=load_s3,
    op_kwargs={'bucket_name': 'mybucket-test2',
               'source_file_path': 'source_data/user_info.csv',
               'dest_aws_file_name': 'users_info/user_info.csv'},
    dag=mydag,
)

task3 = PythonOperator(
    task_id='load_s3_3',
    python_callable=load_s3,
    op_kwargs={'bucket_name': 'mybucket-test2',
               'source_file_path': 'source_data/prices_1.csv',
               'dest_aws_file_name': 'prices/prices_1.csv'},
    dag=mydag,
)

task4 = PythonOperator(
    task_id='load_s3_4',
    python_callable=load_s3,
    op_kwargs={'bucket_name': 'mybucket-test2',
               'source_file_path': 'source_data/sales_per_user.csv',
               'dest_aws_file_name': 'sales/sales_per_user.csv'},
    dag=mydag,
)

task5 = SparkSubmitOperator(
    task_id='task_aws_s3_pyspark',
    application='s3redshift.py',
    dag=mydag,
    packages='com.amazon.redshift:redshift-jdbc42-no-awssdk:1.2.45.1069,com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-auth:2.7.4,org.apache.hadoop:hadoop-common:2.7.4,com.google.code.findbugs:jsr305:3.0.2,asm:asm:3.2,org.slf4j:slf4j-api:1.7.30,org.xerial.snappy:snappy-java:1.1.7.5,org.slf4j:slf4j-log4j12:1.7.30,org.apache.hadoop:hadoop-aws:2.7.3',
    conn_id='my_spark_standalone'
)

# ---- dependencies
[task1, task2, task3, task4] >> task5
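# A hypothetical sketch of what an application like 's3redshift.py' above might
# contain: read one of the uploaded CSVs from S3 and write it to Redshift over
# JDBC, relying on the hadoop-aws and redshift-jdbc42 packages passed to the
# operator. The cluster URL, table, and credentials are illustrative assumptions,
# not taken from the original DAG.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("s3_to_redshift").getOrCreate()

users = spark.read.option("header", "true").csv("s3a://mybucket-test2/users_info/user_info.csv")

users.write \
    .format("jdbc") \
    .option("url", "jdbc:redshift://example-cluster:5439/dev") \
    .option("dbtable", "public.user_info") \
    .option("user", "redshift_user") \
    .option("password", "redshift_password") \
    .option("driver", "com.amazon.redshift.jdbc42.Driver") \
    .mode("append") \
    .save()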
with dag:
    env = {
        'SPLICE_JUPYTER_USER': env_vars.get('SPLICE_JUPYTER_USER') or env_vars.get('DB_USER'),
        'SPLICE_JUPYTER_PASSWORD': env_vars.get('SPLICE_JUPYTER_PASSWORD') or env_vars.get('DB_PASSWORD'),
        'SPLICE_DB_HOST': env_vars.get('SPLICE_DB_HOST') or env_vars.get('DB_HOST'),
        'SPLICE_KAFKA_HOST': env_vars.get('SPLICE_KAFKA_HOST')
    }

    conf_path = '/mnt/airflow-conf/extra_spark_config.json'
    if path.exists(conf_path):
        with open(conf_path) as f:
            extra_conf = json.load(f)
    else:
        extra_conf = {}

    calculate_statistics_task = SparkSubmitOperator(
        application="/opt/airflow/spark_apps/pipeline.py",
        task_id="run_pipeline",
        conn_id="splice_spark",
        env_vars=env,
        application_args=[fset],
        **spark_defaults,
        **extra_conf)

globals()[dag_id] = dag