# start, sftp_file_to_container_hdfs, put_file_in_hdfs, hito_files_hdfs, dag and
# file_path are defined earlier in this DAG file.
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
from airflow.operators.dummy_operator import DummyOperator

start >> sftp_file_to_container_hdfs >> put_file_in_hdfs >> hito_files_hdfs

entrenamiento_modelo = SparkSubmitOperator(
    task_id="entrenamiento_modelo",
    application="/usr/local/spark/app/construccion_modelo.py",  # Spark application path, available in both the Airflow and Spark cluster containers
    name="entrenamiento_modelos",
    conn_id="spark_default",
    verbose=1,
    application_args=[file_path],
    env_vars={'HADOOP_USER_NAME': 'root'},
    dag=dag)

evaluacion_modelo = SparkSubmitOperator(
    task_id="evaluacion_modelo",
    application="/usr/local/spark/app/evaluacion_modelos.py",  # Spark application path, available in both the Airflow and Spark cluster containers
    name="evaluacion_modelos",
    conn_id="spark_default",
    verbose=1,
    application_args=[file_path],
    env_vars={'HADOOP_USER_NAME': 'root'},
    dag=dag)

end = DummyOperator(task_id="end", dag=dag)

entrenamiento_modelo.set_upstream(hito_files_hdfs)
evaluacion_modelo.set_upstream(entrenamiento_modelo)
end.set_upstream(evaluacion_modelo)
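The same dependencies can also be written with Airflow's bitshift operators, as the ingestion chain at the top of this example already does; the three set_upstream calls above are equivalent to:

hito_files_hdfs >> entrenamiento_modelo >> evaluacion_modelo >> end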
Example #2
from airflow import DAG
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
from datetime import datetime

dag = DAG("MyFirstApp3", default_args={"owner": "airflow", "start_date": datetime.strptime("01/01/2018", "%d/%m/%Y")}, schedule_interval="@once")

operator_args = {'depends_on_past': False, 'conn_id': 'spark_con_py', 'conf': {'spark.pyspark.python': '/usr/bin/python2.7'}}
Task_MyFirstApp3_task1 = SparkSubmitOperator(task_id="MyFirstApp3_task1", application="/usr/local/spark_code/MyFirstApp3_task1.py", dag=dag, **operator_args)

operator_args = {'depends_on_past': False, 'conn_id': 'spark_con_py', 'conf': {'spark.pyspark.python': '/usr/bin/python2.7'}}
Task_MyFirstApp3_task2 = SparkSubmitOperator(task_id="MyFirstApp3_task2", application="/usr/local/spark_code/MyFirstApp3_task2.py", dag=dag, **operator_args)



Task_MyFirstApp3_task2.set_upstream(Task_MyFirstApp3_task1)
Example #3
from datetime import timedelta

from airflow import DAG
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago

# download_files_from_s3 and SCRIPTS_DIR are defined elsewhere in this module.

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

etl_dag = DAG('etl_user_place_profiles',
              default_args=default_args,
              schedule_interval='0 */2 * * *',
              start_date=days_ago(1))

download = PythonOperator(task_id='download_files',
                          dag=etl_dag,
                          python_callable=download_files_from_s3)

etl_place_details = SparkSubmitOperator(task_id='etl_place_details',
                                        dag=etl_dag,
                                        application=SCRIPTS_DIR +
                                        'etl_place_details.py',
                                        conn_id='spark_local')

etl_user_details = SparkSubmitOperator(task_id='etl_user_details',
                                       dag=etl_dag,
                                       application=SCRIPTS_DIR +
                                       'etl_user_details.py',
                                       conn_id='spark_local')

etl_place_details.set_upstream(download)
etl_user_details.set_upstream(download)
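As in the previous examples, the two set_upstream calls can equivalently be expressed with the bitshift operator, fanning out from the download task:

download >> [etl_place_details, etl_user_details]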