# Excerpt of a larger DAG file: dag, file_path and the ingestion tasks
# (start, sftp_file_to_container_hdfs, put_file_in_hdfs, hito_files_hdfs)
# are defined earlier in the same file.
start >> sftp_file_to_container_hdfs >> put_file_in_hdfs >> hito_files_hdfs

entrenamiento_modelo = SparkSubmitOperator(
    task_id="entrenamiento_modelo",
    # path to the Spark application, present on both the Airflow and Spark cluster nodes
    application="/usr/local/spark/app/construccion_modelo.py",
    name="entrenamiento_modelos",
    conn_id="spark_default",
    verbose=1,
    application_args=[file_path],
    env_vars={'HADOOP_USER_NAME': 'root'},
    dag=dag)

evaluacion_modelo = SparkSubmitOperator(
    task_id="evaluacion_modelo",
    # path to the Spark application, present on both the Airflow and Spark cluster nodes
    application="/usr/local/spark/app/evaluacion_modelos.py",
    name="evaluacion_modelos",
    conn_id="spark_default",
    verbose=1,
    application_args=[file_path],
    env_vars={'HADOOP_USER_NAME': 'root'},
    dag=dag)

end = DummyOperator(task_id="end", dag=dag)

# model training waits for the HDFS milestone; evaluation follows training
entrenamiento_modelo.set_upstream(hito_files_hdfs)
evaluacion_modelo.set_upstream(entrenamiento_modelo)
end.set_upstream(evaluacion_modelo)
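
# --------------------------------------------------------------------------
# Illustration only: SparkSubmitOperator forwards application_args to the
# submitted script as ordinary command-line arguments. A minimal sketch of
# how a script like construccion_modelo.py could read them; the real script
# is not shown above, so the body below is an assumption:

import sys

from pyspark.sql import SparkSession

file_path = sys.argv[1]  # first element of application_args
spark = SparkSession.builder.appName("entrenamiento_modelos").getOrCreate()
# hypothetical input format, chosen only for the sketch
df = spark.read.csv(file_path, header=True, inferSchema=True)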
from airflow import DAG
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
from datetime import datetime

dag = DAG("MyFirstApp3",
          default_args={"owner": "airflow",
                        "start_date": datetime.strptime("01/01/2018", "%d/%m/%Y")},
          schedule_interval="@once")

operator_args = {'depends_on_past': False,
                 'conn_id': 'spark_con_py',
                 'conf': {'spark.pyspark.python': '/usr/bin/python2.7'}}
Task_MyFirstApp3_task1 = SparkSubmitOperator(task_id="MyFirstApp3_task1",
                                             application="/usr/local/spark_code/MyFirstApp3_task1.py",
                                             dag=dag,
                                             **operator_args)

operator_args = {'depends_on_past': False,
                 'conn_id': 'spark_con_py',
                 'conf': {'spark.pyspark.python': '/usr/bin/python2.7'}}
Task_MyFirstApp3_task2 = SparkSubmitOperator(task_id="MyFirstApp3_task2",
                                             application="/usr/local/spark_code/MyFirstApp3_task2.py",
                                             dag=dag,
                                             **operator_args)

Task_MyFirstApp3_task2.set_upstream(Task_MyFirstApp3_task1)
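
# Equivalent dependency declaration using Airflow's bit-shift operator,
# which has the same effect as the set_upstream call above:
Task_MyFirstApp3_task1 >> Task_MyFirstApp3_task2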
# Imports below are assumed (the original snippet omits them); module paths
# follow the Airflow 1.10-style layout used in the example above.
from datetime import timedelta

from airflow import DAG
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago

# SCRIPTS_DIR and download_files_from_s3 are defined elsewhere in the project.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

etl_dag = DAG('etl_user_place_profiles',
              default_args=default_args,
              schedule_interval='0 */2 * * *',  # every two hours
              start_date=days_ago(1))

download = PythonOperator(task_id='download_files',
                          dag=etl_dag,
                          python_callable=download_files_from_s3)

etl_place_details = SparkSubmitOperator(task_id='etl_place_details',
                                        dag=etl_dag,
                                        application=SCRIPTS_DIR + 'etl_place_details.py',
                                        conn_id='spark_local')

etl_user_details = SparkSubmitOperator(task_id='etl_user_details',
                                       dag=etl_dag,
                                       application=SCRIPTS_DIR + 'etl_user_details.py',
                                       conn_id='spark_local')

# both Spark ETL tasks run only after the S3 download completes
etl_place_details.set_upstream(download)
etl_user_details.set_upstream(download)
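
# --------------------------------------------------------------------------
# Illustration only: the author's download_files_from_s3 is not shown in the
# snippet. A minimal sketch of what such a python_callable could look like,
# assuming boto3 and hypothetical bucket/prefix names (not the actual
# implementation):

import os

import boto3


def download_files_from_s3(bucket="example-raw-data", prefix="exports/", dest_dir="/tmp/etl_input"):
    """Download every object under the given prefix to a local directory."""
    s3 = boto3.client("s3")
    os.makedirs(dest_dir, exist_ok=True)
    response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
    for obj in response.get("Contents", []):
        key = obj["Key"]
        local_path = os.path.join(dest_dir, os.path.basename(key))
        s3.download_file(bucket, key, local_path)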