def test_execute(self, conn_id='spark_default'):
        operator = SparkSubmitOperator(task_id='spark_submit_job',
                                       dag=self.dag,
                                       **self._config)

        self.assertEqual(conn_id, operator._conn_id)

        self.assertEqual(self._config['application'], operator._application)
        self.assertEqual(self._config['conf'], operator._conf)
        self.assertEqual(self._config['files'], operator._files)
        self.assertEqual(self._config['py_files'], operator._py_files)
        self.assertEqual(self._config['jars'], operator._jars)
        self.assertEqual(self._config['executor_cores'],
                         operator._executor_cores)
        self.assertEqual(self._config['executor_memory'],
                         operator._executor_memory)
        self.assertEqual(self._config['keytab'], operator._keytab)
        self.assertEqual(self._config['principal'], operator._principal)
        self.assertEqual(self._config['name'], operator._name)
        self.assertEqual(self._config['num_executors'],
                         operator._num_executors)
        self.assertEqual(self._config['verbose'], operator._verbose)
        self.assertEqual(self._config['java_class'], operator._java_class)
        self.assertEqual(self._config['driver_memory'],
                         operator._driver_memory)
        self.assertEqual(self._config['application_args'],
                         operator._application_args)
Example #2
    def _get_test_dag(self):
        with DAG(dag_id='test_dag', default_args=DEFAULT_DAG_ARGS) as dag:
            op1 = SparkSubmitOperator(task_id='op1')
            op2 = EmrAddStepsOperator(task_id='op2', job_flow_id='foo')
            op3 = S3ListOperator(task_id='op3', bucket='foo')
            op4 = EmrCreateJobFlowOperator(task_id='op4')
            op5 = TriggerDagRunOperator(task_id='op5', trigger_dag_id='foo')
            op6 = FileToWasbOperator(task_id='op6',
                                     container_name='foo',
                                     blob_name='foo',
                                     file_path='foo')
            op7 = EmailOperator(task_id='op7',
                                subject='foo',
                                to='foo',
                                html_content='foo')
            op8 = S3CopyObjectOperator(task_id='op8',
                                       dest_bucket_key='foo',
                                       source_bucket_key='foo')
            op9 = BranchPythonOperator(task_id='op9', python_callable=print)
            op10 = PythonOperator(task_id='op10', python_callable=range)

            op1 >> [op2, op3, op4]
            op2 >> [op5, op6]
            op6 >> [op7, op8, op9]
            op3 >> [op7, op8]
            op8 >> [op9, op10]

        return dag
Example #3
def test_spark_dag(mock_subproc_popen):
    # Hack to get around having a Connection
    os.environ["AIRFLOW_CONN_SPARK"] = "something"

    dag = DAG(
        dag_id="spark_dag",
        default_args=default_args,
        schedule_interval=None,
    )
    # pylint: disable=unused-variable
    clean_data = SparkSubmitOperator(
        task_id="run_spark",
        application="some_path.py",
        conn_id="SPARK",
        dag=dag,
    )

    pipeline = make_dagster_pipeline_from_airflow_dag(
        dag=dag,
        tags={
            AIRFLOW_EXECUTION_DATE_STR:
            get_current_datetime_in_utc().isoformat()
        },
    )
    execute_pipeline(pipeline)  # , instance=instance,)

    assert mock_subproc_popen.call_args_list[0][0] == ([
        "spark-submit", "--master", "", "--name", "airflow-spark",
        "some_path.py"
    ], )
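
As an aside on the connection "hack" above: Airflow resolves any conn_id from an AIRFLOW_CONN_<CONN_ID> environment variable holding a connection URI, which is why a placeholder value is enough once Popen is mocked. A minimal sketch of pointing the stock spark_default connection at a real cluster the same way, assuming a YARN master; the script path is a placeholder and a DAG object like the one above is assumed:

import os

from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator

# conn_id "spark_default" maps to the env var AIRFLOW_CONN_SPARK_DEFAULT.
# The URI scheme ("spark") is the connection type and the host ("yarn")
# becomes the --master value that SparkSubmitHook hands to spark-submit.
os.environ["AIRFLOW_CONN_SPARK_DEFAULT"] = "spark://yarn"

run_spark = SparkSubmitOperator(
    task_id="run_spark",
    application="some_path.py",  # placeholder, as in the test above
    conn_id="spark_default",
    dag=dag,                     # assumes a DAG defined as in the example above
)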
Example #4
    def test_render_template(self):
        # Given
        operator = SparkSubmitOperator(task_id='spark_submit_job',
                                       dag=self.dag,
                                       **self._config)
        ti = TaskInstance(operator, DEFAULT_DATE)

        # When
        ti.render_templates()

        # Then
        expected_application_args = [
            '-f',
            'foo',
            '--bar',
            'bar',
            '--start',
            (DEFAULT_DATE - timedelta(days=1)).strftime("%Y-%m-%d"),
            '--end',
            DEFAULT_DATE.strftime("%Y-%m-%d"),
            '--with-spaces',
            'args should keep embdedded spaces',
        ]
        expected_name = 'spark_submit_job'
        self.assertListEqual(expected_application_args,
                             getattr(operator, '_application_args'))
        self.assertEqual(expected_name, getattr(operator, '_name'))
Example #5
    def spark_submit_operator(self, dag):
        operator = SparkSubmitOperator(
            task_id="spark_submit_task",
            application="script.py",
            application_args=["input.csv", "output.csv"],
            dag=dag,
        )
        track_dag(dag)
        return operator
def sparkOperator(file, task_id, executor_cores=5, num_executors=10, **kwargs):
    return SparkSubmitOperator(
        application='/home/airflow/airflow-apps/dlpredictor/{}'.format(file),
        application_args=[],
        conn_id='spark_default',
        executor_memory='32G',
        conf={'spark.driver.maxResultSize': '8g'},
        driver_memory='32G',
        executor_cores=executor_cores,
        num_executors=num_executors,
        task_id=task_id,
        dag=dag,
        **kwargs)
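
A hedged usage sketch for the factory above; the script names and task ids are hypothetical, and the module-level dag referenced inside sparkOperator is assumed to exist as in the original:

# Build two dlpredictor stages from the factory and chain them.
prepare_data = sparkOperator('main_clean.py', 'dlpredictor_clean')
predict = sparkOperator('main_predictor.py', 'dlpredictor_predict',
                        executor_cores=10, num_executors=20)
prepare_data >> predict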
Example #7
def sparkOperator(file, task_id, **kwargs):
    return SparkSubmitOperator(
        application='/home/airflow/airflow/din_model/pipeline/{}'.format(file),
        application_args=['/home/airflow/airflow/din_model/config.yml'],
        conn_id='spark_default',
        executor_memory='32G',
        conf={'spark.driver.maxResultSize': '4g'},
        driver_memory='32G',
        executor_cores=5,
        num_executors=20,
        task_id=task_id,
        dag=dag,
        **kwargs)
    def execute(self, context):
        dag = DAG(self.task_id)
        self.log.info("SPARK: Executing %s", self.task_id)

        _config = {
            'application': self.application,
            # 'master' and 'deploy-mode' are not SparkSubmitOperator kwargs;
            # they are taken from the Spark connection (conn_id) instead.
            # 'master': self.master,
            # 'deploy-mode': self.deploy_mode,
            'executor_cores': self.executor_cores,
            'executor_memory': self.executor_memory
        }

        SparkSubmitOperator(task_id=self.task_id, dag=dag, **_config)
Example #9
    def spark_submit_operator(self, dag):
        operator = SparkSubmitOperator(
            task_id="spark_submit_task",
            application="script.py",
            application_args=["input.csv", "output.csv"],
            dag=dag,
        )

        env = {
            "AIRFLOW_CTX_DAG_ID": "test_dag",
            "AIRFLOW_CTX_EXECUTION_DATE": "1970-01-01T0000.000",
            "AIRFLOW_CTX_TASK_ID": "spark_submit_task",
            "AIRFLOW_CTX_TRY_NUMBER": "1",
        }

        add_tracking_to_submit_task(env, operator)
        return operator
Example #10
    def spark_submit_operator(self, dag):
        operator = SparkSubmitOperator(
            task_id="spark_submit_task",
            application="script.py",
            application_args=["input.csv", "output.csv"],
            dag=dag,
        )

        env = {
            "AIRFLOW_CTX_DAG_ID": "test_dag",
            "AIRFLOW_CTX_EXECUTION_DATE": "1970-01-01T0000.000",
            "AIRFLOW_CTX_TASK_ID": "spark_submit_task",
            "AIRFLOW_CTX_TRY_NUMBER": "1",
            "AIRFLOW_CTX_UID": get_airflow_instance_uid(),
        }

        with wrap_operator_with_tracking_info(env, operator):
            return operator
Example #11
def sparkOperator(file, task_id, **kwargs):
    return SparkSubmitOperator(
        application=
        '/home/airflow/airflow-apps/lookalike-model/lookalike_model/application/pipeline/{}'
        .format(file),
        application_args=[
            '/home/airflow/airflow-apps/lookalike-model/lookalike_model/application/pipeline/config.yml'
        ],
        conn_id='spark_default',
        executor_memory='8G',
        conf={
            'spark.driver.maxResultSize': '5g',
            'spark.hadoop.hive.exec.dynamic.partition': True,
            'spark.hadoop.hive.exec.dynamic.partition.mode': 'nonstrict'
        },
        driver_memory='8G',
        executor_cores=5,
        num_executors=20,
        task_id=task_id,
        dag=dag,
        **kwargs)
Example #12
    def test_render_template(self):
        # Given
        operator = SparkSubmitOperator(task_id='spark_submit_job',
                                       dag=self.dag,
                                       **self._config)
        ti = TaskInstance(operator, DEFAULT_DATE)

        # When
        ti.render_templates()

        # Then
        expected_application_args = [
            u'-f foo', u'--bar bar',
            u'--start %s' %
            (DEFAULT_DATE - datetime.timedelta(days=1)).strftime("%Y-%m-%d"),
            u'--end %s' % DEFAULT_DATE.strftime("%Y-%m-%d")
        ]
        expected_name = "spark_submit_job"
        self.assertListEqual(sorted(expected_application_args),
                             sorted(getattr(operator, '_application_args')))
        self.assertEqual(expected_name, getattr(operator, '_name'))
Example #13
    def transform(self, subdag: nx.DiGraph,
                  parent_fragment: DAGFragment) -> DAGFragment:
        subdag_roots = [n for n, d in subdag.in_degree() if d == 0]
        first_root = subdag_roots[0].task_id

        task_id_prefix = '' if first_root in ['op2', 'op3'] else '2'

        TestSubDagTransformer1.op1 = SparkSubmitOperator(
            task_id=f"t{task_id_prefix}p1", dag=self.dag)
        TestSubDagTransformer1.op2 = EmrAddStepsOperator(
            task_id=f"t{task_id_prefix}p2", job_flow_id='foo', dag=self.dag)
        TestSubDagTransformer1.op3 = S3ListOperator(
            task_id=f"t{task_id_prefix}p3", bucket='foo', dag=self.dag)
        TestSubDagTransformer1.op4 = EmrCreateJobFlowOperator(
            task_id=f"t{task_id_prefix}p4", dag=self.dag)
        TestSubDagTransformer1.op5 = DummyOperator(
            task_id=f"t{task_id_prefix}p5", dag=self.dag)

        TestSubDagTransformer1.op1 >> [
            TestSubDagTransformer1.op2, TestSubDagTransformer1.op3
        ] >> TestSubDagTransformer1.op4

        return DAGFragment(
            [TestSubDagTransformer1.op1, TestSubDagTransformer1.op5])
Example #14
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'start_date': datetime.now() - timedelta(minutes=20),
    'retries': 5,
    'retry_delay': timedelta(minutes=1),
    'dagrun_timeout': timedelta(minutes=5)
}

with DAG('batch_pipeline',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:
    # Define the tasks of the batch pipeline.

    t1 = BashOperator(task_id='print_current_date', bash_command='date')

    t2 = BashOperator(
        task_id='print_job_started',
        bash_command=
        'echo "******* *** *** Spark Batch Job Has Started ********************"'
    )

    flat_obs = SparkSubmitOperator(application=entry_point,
                                   verbose=True,
                                   task_id='flat_obs',
                                   conn_id='spark_default')

    t3 = BashOperator(task_id='print_hello', bash_command='echo "hello world"')

    t1 >> t2 >> flat_obs >> t3
Example #15
def first_function_execute(**context):
    print("HELLO ")


def second_function_execute(**context):
    print("Is it me you looking for")


default_args = {
    "owner": "airflow",
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
    "depends_on_past": False,
    "start_date": datetime(2021, 1, 1),
}

with DAG(dag_id="spark2",
         schedule_interval="@once",
         default_args=default_args,
         catchup=False) as f:

    first_f = PythonOperator(task_id="first",
                             python_callable=first_function_execute,
                             provide_context=True,
                             op_kwargs={"name": "Soumil Shah"})

    spark_submit_task1 = SparkSubmitOperator(task_id='spark_submit_job',
                                             conn_id='spark_default')

first_f >> spark_submit_task1

# schedule spark jobs via airflow
# Import the operator
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator

# Set the path for our files.
entry_point = os.path.join(os.environ["AIRFLOW_HOME"], "scripts", "clean_ratings.py")
dependency_path = os.path.join(os.environ["AIRFLOW_HOME"], "dependencies", "pydiaper.zip")

with DAG('data_pipeline', start_date=datetime(2019, 6, 25),
         schedule_interval='@daily') as dag:
    # Define task clean, running a cleaning job.
    clean_data = SparkSubmitOperator(
        application=entry_point, 
        py_files=dependency_path,
        task_id='clean_data',
        conn_id='spark_default')

# deploy pipeline
spark_args = {"py_files": dependency_path,
              "conn_id": "spark_default"}
# Define ingest, clean and transform job.
with dag:
    ingest = BashOperator(task_id='Ingest_data', bash_command='tap-marketing-api | target-csv --config %s' % config)
    clean = SparkSubmitOperator(application=clean_path, task_id='clean_data', **spark_args)
    insight = SparkSubmitOperator(application=transform_path, task_id='show_report', **spark_args)
    
    # set triggering sequence
    ingest >> clean >> insight
Example #17
    def test_execute(self):

        # Given / When
        conn_id = 'spark_default'
        operator = SparkSubmitOperator(task_id='spark_submit_job',
                                       spark_binary="sparky",
                                       dag=self.dag,
                                       **self._config)

        # Then expected results
        expected_dict = {
            'conf': {'parquet.compression': 'SNAPPY'},
            'files': 'hive-site.xml',
            'py_files': 'sample_library.py',
            'archives': 'sample_archive.zip#SAMPLE',
            'driver_class_path': 'parquet.jar',
            'jars': 'parquet.jar',
            'packages': 'com.databricks:spark-avro_2.11:3.2.0',
            'exclude_packages': 'org.bad.dependency:1.0.0',
            'repositories': 'http://myrepo.org',
            'total_executor_cores': 4,
            'executor_cores': 4,
            'executor_memory': '22g',
            'keytab': 'privileged_user.keytab',
            'principal': 'user/[email protected]',
            'proxy_user': '******',
            'name': '{{ task_instance.task_id }}',
            'num_executors': 10,
            'verbose': True,
            'application': 'test_application.py',
            'driver_memory': '3g',
            'java_class': 'com.foo.bar.AppMain',
            'application_args': [
                '-f',
                'foo',
                '--bar',
                'bar',
                '--start',
                '{{ macros.ds_add(ds, -1)}}',
                '--end',
                '{{ ds }}',
                '--with-spaces',
                'args should keep embdedded spaces',
            ],
            'spark_binary': 'sparky'
        }

        self.assertEqual(conn_id, operator._conn_id)
        self.assertEqual(expected_dict['application'], operator._application)
        self.assertEqual(expected_dict['conf'], operator._conf)
        self.assertEqual(expected_dict['files'], operator._files)
        self.assertEqual(expected_dict['py_files'], operator._py_files)
        self.assertEqual(expected_dict['archives'], operator._archives)
        self.assertEqual(expected_dict['driver_class_path'],
                         operator._driver_class_path)
        self.assertEqual(expected_dict['jars'], operator._jars)
        self.assertEqual(expected_dict['packages'], operator._packages)
        self.assertEqual(expected_dict['exclude_packages'],
                         operator._exclude_packages)
        self.assertEqual(expected_dict['repositories'], operator._repositories)
        self.assertEqual(expected_dict['total_executor_cores'],
                         operator._total_executor_cores)
        self.assertEqual(expected_dict['executor_cores'],
                         operator._executor_cores)
        self.assertEqual(expected_dict['executor_memory'],
                         operator._executor_memory)
        self.assertEqual(expected_dict['keytab'], operator._keytab)
        self.assertEqual(expected_dict['principal'], operator._principal)
        self.assertEqual(expected_dict['proxy_user'], operator._proxy_user)
        self.assertEqual(expected_dict['name'], operator._name)
        self.assertEqual(expected_dict['num_executors'],
                         operator._num_executors)
        self.assertEqual(expected_dict['verbose'], operator._verbose)
        self.assertEqual(expected_dict['java_class'], operator._java_class)
        self.assertEqual(expected_dict['driver_memory'],
                         operator._driver_memory)
        self.assertEqual(expected_dict['application_args'],
                         operator._application_args)
        self.assertEqual(expected_dict['spark_binary'], operator._spark_binary)
Example #18
    # Step 2: Move the sku data file to HDFS storage
    move_to_hdfs = BashOperator(task_id="move_to_hdfs",
                                bash_command="""
            hdfs dfs -mkdir -p /dim_sku && \
            hdfs dfs -put -f $AIRFLOW_HOME/dags/files/sku_data.csv /dim_sku
            """)

    # Step 3: Create a hive table on our sku_data
    creating_sku_table = HiveOperator(task_id="creating_sku_table",
                                      hive_cli_conn_id="hive_conn",
                                      hql="""
            CREATE EXTERNAL TABLE IF NOT EXISTS dim_sku(
                asin STRING,
                title STRING,
                price DOUBLE,
                brand STRING
                )
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY '|'
            STORED AS TEXTFILE
        """)

    processing_sku_data = SparkSubmitOperator(
        task_id="processing_sku_data",
        conn_id="spark_conn",
        application="/usr/local/airflow/dags/scripts/dim_sku_processing.py",
        verbose=False)

    unzip_file_store_as_csv >> move_to_hdfs >> creating_sku_table >> processing_sku_data
Example #19
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 6, 22),
    'schedule_interval': None,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(seconds=30),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

with DAG("HELLO", catchup=False, default_args=default_args) as dag:

    t1 = BashOperator(
        task_id='print_date',
        bash_command='date',
    )

    t2 = SparkSubmitOperator(
        task_id="run_spark_job",
        application=f"{os.environ['AIRFLOW__CORE__DAGS_FOLDER']}/find_pi.py",
    )

t2 >> t1

Example #20
spark_config = {
    'conn_id': 'spark_local',
    'java_class': 'com.spark.airflow.test_spark_airflow',
    'application':
    '/Users/ravimuthyala/AirflowSparkTestCode/sparkairflowtest_2.12-0.1.jar',
    'jars': '/Users/ravimuthyala/AirflowSparkTestCode/postgresql-42.2.12.jar',
    'application_args':
    ["/Users/ravimuthyala/AirflowSparkTestCode/receipts.csv"],
    'driver_memory': '1g',
    'executor_cores': 1,
    'num_executors': 1,
    'executor_memory': '1g'
}

spark_submit_operator = SparkSubmitOperator(task_id='Spark_Scala_Submit_Job',
                                            dag=dag,
                                            **spark_config)

emailNotify = EmailOperator(task_id='email_notification',
                            to='*****@*****.**',
                            subject='Spark Submit Job Alert',
                            html_content='Airflow Spark Submit Job Done',
                            dag=dag)

t1Failed = EmailOperator(dag=dag,
                         trigger_rule=TriggerRule.ONE_FAILED,
                         task_id="SparkJobFailed",
                         to=["*****@*****.**"],
                         subject="Spark job Failed",
                         html_content='<h3>Spark job has failed</h3>')
Example #21
from airflow.models import DAG
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
from datetime import datetime

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2019, 1, 1)
}

dag = DAG('stackoverflow_stats', default_args=default_args,
          schedule_interval='@daily')

SparkSubmitOperator(
    task_id='get-stats',
    application="/usr/local/airflow/jobs/stats.py",
    dag=dag,
    run_as_user='******',
    application_args=['--date', '{{ ds }}'],
    name='Stats DAG for {{ ds }}',
    num_executors=2,
    executor_memory='2g'
)
Example #22
    'spark.hadoop.fs.s3a.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
    'spark.hadoop.fs.s3a.access.key': os.environ.get('AWS_ACCESS_KEY_ID', ''),
    'spark.hadoop.fs.s3a.secret.key': os.environ.get('AWS_SECRET_ACCESS_KEY', ''),
    'spark.hadoop.fs.s3a.endpoint': "{}:{}".format(os.environ.get('AWS_SERVER', ''),
                                                   os.environ.get('AWS_PORT', '')),
    'spark.hadoop.fs.s3a.connection.ssl.enabled': 'false',
    'spark.hadoop.fs.s3a.path.style.access': 'true',
    'spark.hadoop.fs.s3.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem'
}

spark = SparkSubmitOperator(task_id='fetch_csv_from_s3_and_update_postgres',
                            dag=dag,
                            conf=spark_conf,
                            application='{spark_dir}/s3topostgres.py'.format(
                                spark_dir=SPARK_DIRECTORY),
                            application_args=['-f', FILE, '-t', TABLE])

check = CheckOperator(task_id='check_demo_contains_data',
                      conn_id='local_pg',
                      sql='SELECT COUNT(*) FROM {table}'.format(table=TABLE),
                      dag=dag)

spark >> check
Example #23
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'din_model_integration',
    default_args=default_args,
    schedule_interval=None,
)

clean = SparkSubmitOperator(
    application='/home/wei2/airflow/din_model/pipeline/main_clean.py',
    application_args=['/home/wei2/airflow/din_model/config.yml'],
    conn_id='spark_default',
    executor_memory='16G',
    driver_memory='16G',
    executor_cores=5,
    num_executors=20,
    task_id='din_clean',
    dag=dag,
)

process = SparkSubmitOperator(
    application='/home/wei2/airflow/din_model/pipeline/main_processing.py',
    application_args=['/home/wei2/airflow/din_model/config.yml'],
    conn_id='spark_default',
    executor_memory='16G',
    driver_memory='16G',
    executor_cores=5,
    num_executors=20,
    task_id='din_processing',
Example #24
                  'log.kafka.topic': 'druid-kafka-proxy',
                  'log.kafka.create': 'true',
                  'log.kafka.servers': 'soctxadev01.gsoc.verizon.com:6667',
                  'log.rowcase.headers': 'cs_username:lower,vzid:lower,x_cs_auth_domain:upper,x_exception_id:lower,sc_filter_result:uppper,cs_referer:lower,sc_status:upper,s_action:upper,cs_method:upper,rs_content_type:lower,cs_uri_scheme:lower,cs_host:lower,cs_uri_path:lower,cs_uri_query:lower,cs_uri_extension:lower,cs_user_agent:lower,x_bluecoat_application_name:lower,x_bluecoat_application_operation:lower,cs_categories:lower,cs_auth_group:lower',
                  'log.partition.by.date': 'true'
              }.items() + [v.split("=") for v in OTHER_PARAM_OVERRIDES.split(",")])

spark_submit_task = SparkSubmitOperator(
    task_id='spark_submit_job',
    conn_id='spark_default',
    java_class='com.verizon.gsoc.datasources.phoenix.Phoenix',
    application=EXECUTABLE_PATH,
    # application_args=[' '.join(['{0}={1}'.format(k, v) for (k, v) in PARAMS.items()])],
    application_args=['{0}={1}'.format(k, v) for (k, v) in PARAMS.items()],
    total_executor_cores='1',
    executor_cores='1',
    executor_memory='2g',
    num_executors='2',
    name='spark-airflow-phoenix',
    verbose=True,
    driver_memory='1g',
    xcom_push='true',
    conf=config,
    dag=dag,
)

def print_hello():
    return 'Finally it worked!!!!' + str(datetime.now().strftime("%m%d%Y-%H%M"))

def print_check():
    return 'Finally it worked!!!!' + str(datetime.now().strftime("%m%d%Y-%H%M"))
    task_id='load_dim_visa_type',
    dag=dag,
    table="dim_visa_type",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="i94project",
    s3_key="dicts/visa_type.csv",
    copy_options=("CSV", "REGION 'us-west-2'", "IGNOREHEADER 1")
)

# process us airport in spark
process_dim_us_airport = SparkSubmitOperator(
    application=r"/usr/local/airflow/plugins/helpers/spark_dispatch.py",
    application_args=[r"process_airport",  # command
                      r"s3a://i94project/dicts",  # dictionaries
                      r"s3a://i94project/stage/input/airport-codes.csv",  # input_path
                      r"s3a://i94project/stage/output/airport.parquet"],  # output_path
    task_id="process_dim_us_airport",
    packages=spark_packages,
    dag=dag
)

# load dim_us_airport table
load_dim_us_airport = StageToRedshiftOperator(
    task_id='load_dim_us_airport',
    dag=dag,
    table="dim_us_airport",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="i94project",
    s3_key="stage/output/airport.parquet",
    # copy_options=("CSV", "REGION 'us-west-2'", "IGNOREHEADER 1")
Example #26
    max_active_runs=1)

start_operator = DummyOperator(task_id='begin_execution', dag=dag)

download_git_data = PythonOperator(task_id="download_git_data",
                                   python_callable=download_data,
                                   dag=dag,
                                   provide_context=True)

spark_config = {
    'conn_id': config.get('HOST', 'SPARK_CONN'),
    'application': config.get('HOST', 'SPARK_APP')
}

spark_process = SparkSubmitOperator(task_id="spark_submit",
                                    dag=dag,
                                    **spark_config)

del_json_task = BashOperator(
    task_id="delete_old_data",
    bash_command='rm -r ' + home_dir +
    '/"{{ (execution_date - macros.timedelta(days=3)).strftime("%Y-%m-%d") }}"',
)

del_crc_task = BashOperator(
    task_id="delete_crc_data",
    bash_command='find ' + home_dir +
    '/git_{{ (execution_date - macros.timedelta(days=2)).strftime("%Y-%m-%d") }}.parquet/ -name "*.crc" -exec rm \'{}\' \;',
)

del_suc_task = BashOperator(
Example #27
    task_id='add_partition_title_basics_table',
    hql=hiveSQL_add_partition_title_basics,
    hive_cli_conn_id='beeline',
    dag=dag)

dummy_op = DummyOperator(task_id='dummy', dag=dag)

pyspark_top_tvseries = SparkSubmitOperator(
    task_id='pyspark_write_top_tvseries_to_final',
    conn_id='spark',
    application='/home/airflow/airflow/python/pyspark_top_tvseries.py',
    total_executor_cores='2',
    executor_cores='2',
    executor_memory='2g',
    num_executors='2',
    name='spark_calculate_top_tvseries',
    verbose=True,
    application_args=[
        '--year', '{{ macros.ds_format(ds, "%Y-%m-%d", "%Y")}}', '--month',
        '{{ macros.ds_format(ds, "%Y-%m-%d", "%m")}}', '--day',
        '{{ macros.ds_format(ds, "%Y-%m-%d", "%d")}}', '--hdfs_source_dir',
        '/user/hadoop/imdb', '--hdfs_target_dir',
        '/user/hadoop/imdb_final/top_tvseries', '--hdfs_target_format', 'csv'
    ],
    dag=dag)

create_table_for_top_tvseries = HiveOperator(
    task_id='create_top_tvseries_external_table',
    hql=hiveSQL_create_top_tvseries_external_table,
    hive_cli_conn_id='beeline',
    dag=dag)
Example #28
DEFAULT_DATE = timezone.datetime(2017, 1, 1)
srcDir = os.getcwd() + '/dags/repo/examples/hello_2.11-1.0.jar'
args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
dag = DAG('sparkjob',
          default_args=args,
          schedule_interval='@monthly',
          dagrun_timeout=timedelta(minutes=60))

spark_task = BashOperator(
    task_id='spark_java_bash',
    bash_command='spark-submit --class {{ params.class }} {{ params.jar }}',
    params={
        'class': 'hello',
        'jar': srcDir
    },
    dag=dag)

_config = {
    'application': srcDir,
    # 'master' and 'deploy-mode' are not SparkSubmitOperator kwargs; they are
    # taken from the Spark connection (spark_default) instead.
    # 'master': 'local',
    # 'deploy-mode': 'cluster',
    'executor_cores': 1,
    'executor_memory': '1G'
}

operator = SparkSubmitOperator(task_id='spark_submit_op_job',
                               dag=dag,
                               java_class='hello',
                               **_config)

operator >> spark_task
Example #29
          schedule_interval=None,
          default_args=default_args,
          user_defined_macros=conf.__dict__)

# Define a dummy task as the starting task
start = DummyOperator(task_id='start', queue='script', dag=dag)
start = DummyOperator(task_id='start', queue='script', dag=dag)

# Use SparkSubmitOperator to define the task that runs the Spark job.
# All job-related parameters and configuration are defined in the _config dict.
# tk-dev-emr-airflow-spark is the EMR Spark connection; it has to be created in the Airflow admin web UI.
# application_args can be used to pass custom arguments to the PySpark script.
_config = {
    'name': '{{ ti.task_id }}',
    'application': '/server/airflow/dags/testoperator/wordcount.py',
    'executor_cores': 2,
    'executor_memory': '12g',
    'application_args': [
        '-fid',
        '{{ ti.job_id }}',
    ]
}

spark_task1 = SparkSubmitOperator(task_id='spark_task1',
                                  conn_id='tk_dev_dw_spark',
                                  queue='script',
                                  dag=dag,
                                  **_config)

# Define the dependencies between tasks
start >> spark_task1
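
The application_args above only matter if wordcount.py reads them; a minimal sketch of the consuming side, assuming argparse and the -fid flag defined in _config (the script body is hypothetical):

# wordcount.py (sketch): pick up the -fid value forwarded via application_args.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-fid', dest='fid', help='job id forwarded by the Airflow DAG')
args = parser.parse_args()
print('running wordcount for job', args.fid)
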
    saving_rates = BashOperator(task_id="saving_rates",
                                bash_command="""
            hdfs dfs -mkdir -p /forex && \
            hdfs dfs -put -f $AIRFLOW_HOME/dags/files/forex_rates.json /forex
        """)

    creating_forex_rates_table = HiveOperator(
        task_id="creating_forex_rates_table",
        hive_cli_conn_id="hive_conn",
        hql="""
            CREATE EXTERNAL TABLE IF NOT EXISTS forex_rates(
                base STRING,
                last_update DATE,
                eur DOUBLE,
                usd DOUBLE,
                nzd DOUBLE,
                gbp DOUBLE,
                jpy DOUBLE,
                cad DOUBLE
                )
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY ','
            STORED AS TEXTFILE
    """)

    forex_processing = SparkSubmitOperator(
        task_id="forex_processing",
        conn_id="spark_conn",
        application="/usr/local/airflow/dags/scripts/forex_processing.py",
        verbose=False)