def test_hiveconf_jinja_translate(self):
     hql = "SELECT ${num_col} FROM ${hiveconf:table};"
     op = HiveOperator(
         hiveconf_jinja_translate=True,
         task_id='dry_run_basic_hql', hql=hql, dag=self.dag)
     op.prepare_template()
     self.assertEqual(op.hql, "SELECT {{ num_col }} FROM {{ table }};")
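The translation asserted here can be reproduced with a single regular-expression substitution. The sketch below is only an approximation of what prepare_template() does when hiveconf_jinja_translate=True; the helper name is invented for illustration.

import re

def hiveconf_to_jinja(hql):
    # Sketch: rewrite ${name} and ${hiveconf:name} placeholders as Jinja
    # expressions, matching the behaviour asserted in the test above.
    return re.sub(r"\$\{(?:hiveconf:)?([ a-zA-Z0-9_]*)\}", r"{{ \1 }}", hql)

assert hiveconf_to_jinja("SELECT ${num_col} FROM ${hiveconf:table};") == \
    "SELECT {{ num_col }} FROM {{ table }};"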
Example #2
    def test_beeline(self, mock_popen, mock_temp_dir):
        mock_subprocess = MockSubProcess()
        mock_popen.return_value = mock_subprocess
        mock_temp_dir.return_value = "tst"

        hive_cmd = [
            'beeline', '-u', '"jdbc:hive2://localhost:10000/default"',
            '-hiveconf', 'airflow.ctx.dag_id=test_dag_id', '-hiveconf',
            'airflow.ctx.task_id=beeline_hql', '-hiveconf',
            'airflow.ctx.execution_date=2015-01-01T00:00:00+00:00',
            '-hiveconf', 'airflow.ctx.dag_run_id=', '-hiveconf',
            'airflow.ctx.dag_owner=airflow', '-hiveconf',
            'airflow.ctx.dag_email=', '-hiveconf',
            'mapreduce.job.queuename=airflow', '-hiveconf',
            'mapred.job.queue.name=airflow', '-hiveconf',
            'tez.queue.name=airflow', '-hiveconf',
            'mapred.job.name=test_job_name', '-f',
            '/tmp/airflow_hiveop_tst/tmptst'
        ]

        op = HiveOperator(task_id='beeline_hql',
                          hive_cli_conn_id='hive_cli_default',
                          hql=self.hql,
                          dag=self.dag,
                          mapred_job_name="test_job_name")
        op.run(start_date=DEFAULT_DATE,
               end_date=DEFAULT_DATE,
               ignore_ti_state=True)
        mock_popen.assert_called_with(hive_cmd,
                                      stdout=mock_subprocess.PIPE,
                                      stderr=mock_subprocess.STDOUT,
                                      cwd="/tmp/airflow_hiveop_tst",
                                      close_fds=True)
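The method signature above expects mock_popen and mock_temp_dir to be injected by mock.patch decorators that are not shown in the snippet. A plausible reconstruction, with the patch targets being assumptions rather than the original test's exact decorators:

from unittest import mock

# Bottom-most patch maps to the first mock argument; the targets below
# are assumptions, not necessarily what the original test module patches.
@mock.patch('tempfile.tempdir', '/tmp/')
@mock.patch('tempfile._RandomNameSequence.__next__')   # -> mock_temp_dir
@mock.patch('subprocess.Popen')                        # -> mock_popen
def test_beeline(self, mock_popen, mock_temp_dir):
    ...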
Example #3
 def setUp(self):
     self._upload_dataframe()
     args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
     self.dag = DAG('test_dag_id', default_args=args)
     self.database = 'airflow'
     self.table = 'hive_server_hook'
     self.hql = """
     CREATE DATABASE IF NOT EXISTS {{ params.database }};
     USE {{ params.database }};
     DROP TABLE IF EXISTS {{ params.table }};
     CREATE TABLE IF NOT EXISTS {{ params.table }} (
         a int,
         b int)
     ROW FORMAT DELIMITED
     FIELDS TERMINATED BY ',';
     LOAD DATA LOCAL INPATH '{{ params.csv_path }}'
     OVERWRITE INTO TABLE {{ params.table }};
     """
     self.columns = ['{}.a'.format(self.table), '{}.b'.format(self.table)]
     self.hook = HiveMetastoreHook()
     op = HiveOperator(task_id='HiveHook_' + str(random.randint(1, 10000)),
                       params={
                           'database': self.database,
                           'table': self.table,
                           'csv_path': self.local_path
                       },
                       hive_cli_conn_id='hive_cli_default',
                       hql=self.hql,
                       dag=self.dag)
     op.run(start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_ti_state=True)
Example #4
 def setUp(self):
     args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
     self.dag = DAG('test_dag_id', default_args=args)
     self.next_day = (DEFAULT_DATE +
                      timedelta(days=1)).isoformat()[:10]
     self.database = 'airflow'
     self.partition_by = 'ds'
     self.table = 'static_babynames_partitioned'
     self.hql = """
             CREATE DATABASE IF NOT EXISTS {{ params.database }};
             USE {{ params.database }};
             DROP TABLE IF EXISTS {{ params.table }};
             CREATE TABLE IF NOT EXISTS {{ params.table }} (
                 state string,
                 year string,
                 name string,
                 gender string,
                 num int)
             PARTITIONED BY ({{ params.partition_by }} string);
             ALTER TABLE {{ params.table }}
             ADD PARTITION({{ params.partition_by }}='{{ ds }}');
             """
     self.hook = HiveMetastoreHook()
     op = HiveOperator(
         task_id='HiveHook_' + str(random.randint(1, 10000)),
         params={
             'database': self.database,
             'table': self.table,
             'partition_by': self.partition_by
         },
         hive_cli_conn_id='hive_cli_default',
         hql=self.hql, dag=self.dag)
     op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
            ignore_ti_state=True)
Example #5
    def test_hive_dryrun(self, mock_popen, mock_temp_dir):
        mock_subprocess = MockSubProcess()
        mock_popen.return_value = mock_subprocess
        mock_temp_dir.return_value = "tst"

        op = HiveOperator(task_id='dry_run_basic_hql',
                          hql=self.hql,
                          dag=self.dag)
        op.dry_run()

        hive_cmd = [
            'beeline', '-u', '"jdbc:hive2://localhost:10000/default"',
            '-hiveconf', 'airflow.ctx.dag_id=', '-hiveconf',
            'airflow.ctx.task_id=', '-hiveconf', 'airflow.ctx.execution_date=',
            '-hiveconf', 'airflow.ctx.dag_run_id=', '-hiveconf',
            'airflow.ctx.dag_owner=', '-hiveconf', 'airflow.ctx.dag_email=',
            '-hiveconf', 'mapreduce.job.queuename=airflow', '-hiveconf',
            'mapred.job.queue.name=airflow', '-hiveconf',
            'tez.queue.name=airflow', '-f', '/tmp/airflow_hiveop_tst/tmptst'
        ]
        mock_popen.assert_called_with(hive_cmd,
                                      stdout=mock_subprocess.PIPE,
                                      stderr=mock_subprocess.STDOUT,
                                      cwd="/tmp/airflow_hiveop_tst",
                                      close_fds=True)
Example #6
 def test_hive_queues(self):
     op = HiveOperator(
         task_id='test_hive_queues', hql=self.hql,
         mapred_queue='default', mapred_queue_priority='HIGH',
         mapred_job_name='airflow.test_hive_queues',
         dag=self.dag)
     op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
            ignore_ti_state=True)
Example #7
 def test_beeline(self):
     op = HiveOperator(task_id='beeline_hql',
                       hive_cli_conn_id='hive_cli_default',
                       hql=self.hql,
                       dag=self.dag)
     op.run(start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_ti_state=True)
Example #8
 def test_hiveconf(self):
     hql = "SELECT * FROM ${hiveconf:table} PARTITION (${hiveconf:day});"
     op = HiveOperator(
         hiveconfs={'table': 'static_babynames', 'day': '{{ ds }}'},
         task_id='dry_run_basic_hql', hql=hql, dag=self.dag)
     op.prepare_template()
     self.assertEqual(
         op.hql,
         "SELECT * FROM ${hiveconf:table} PARTITION (${hiveconf:day});")
Example #9
    def test_hive_airflow_default_config_queue_override(self):
        specific_mapred_queue = 'default'
        op = HiveOperator(task_id='test_default_config_queue',
                          hql=self.hql,
                          mapred_queue=specific_mapred_queue,
                          mapred_queue_priority='HIGH',
                          mapred_job_name='airflow.test_default_config_queue',
                          dag=self.dag)

        self.assertEqual(op.get_hook().mapred_queue, specific_mapred_queue)
Example #10
    def test_hive_airflow_default_config_queue(self):
        op = HiveOperator(task_id='test_default_config_queue',
                          hql=self.hql,
                          mapred_queue_priority='HIGH',
                          mapred_job_name='airflow.test_default_config_queue',
                          dag=self.dag)

        # just check that the correct default value in test_default.cfg is used
        test_config_hive_mapred_queue = conf.get('hive',
                                                 'default_hive_mapred_queue')
        self.assertEqual(op.get_hook().mapred_queue,
                         test_config_hive_mapred_queue)
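The expected queue is read from Airflow configuration rather than from the operator. Conceptually, the hook falls back along these lines (a sketch, not the hook's actual source):

from airflow.configuration import conf

def resolve_mapred_queue(mapred_queue=None):
    # Sketch: an explicit mapred_queue wins; otherwise the
    # [hive] default_hive_mapred_queue config value is used,
    # which is what the assertion above checks.
    return mapred_queue or conf.get('hive', 'default_hive_mapred_queue')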
Example #11
    def test_mapred_job_name(self, mock_get_hook):
        mock_hook = mock.MagicMock()
        mock_get_hook.return_value = mock_hook
        op = HiveOperator(task_id='test_mapred_job_name',
                          hql=self.hql,
                          dag=self.dag)

        fake_execution_date = timezone.datetime(2018, 6, 19)
        fake_ti = TaskInstance(task=op, execution_date=fake_execution_date)
        fake_ti.hostname = 'fake_hostname'
        fake_context = {'ti': fake_ti}

        op.execute(fake_context)
        self.assertEqual(
            "Airflow HiveOperator task for {}.{}.{}.{}".format(
                fake_ti.hostname, self.dag.dag_id, op.task_id,
                fake_execution_date.isoformat()), mock_hook.mapred_job_name)
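When mapred_job_name is not supplied, the operator derives a job name from the task instance; the string asserted above corresponds to roughly the following (a sketch assuming the default naming template):

def default_mapred_job_name(ti):
    # Sketch: compose the job name from the task instance fields,
    # mirroring the value asserted in the test above.
    return "Airflow HiveOperator task for {}.{}.{}.{}".format(
        ti.hostname, ti.dag_id, ti.task_id, ti.execution_date.isoformat())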
Example #12
    for channel in to_channels:

        file_name = "to_" + channel + "_" + yesterday.strftime(
            "%Y-%m-%d") + ".csv"

        load_to_hdfs = BashOperator(
            task_id="put_" + channel + "_to_hdfs",
            bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
            local_dir + file_name + " " + hdfs_dir + channel + "/")

        load_to_hdfs << analyze_tweets

        load_to_hive = HiveOperator(task_id="load_" + channel + "_to_hive",
                                    hql="LOAD DATA INPATH '" + hdfs_dir +
                                    channel + "/" + file_name + "' "
                                    "INTO TABLE " + channel + " "
                                    "PARTITION(dt='" + dt + "')")
        load_to_hive << load_to_hdfs
        load_to_hive >> hive_to_mysql

    for channel in from_channels:
        file_name = "from_" + channel + "_" + yesterday.strftime(
            "%Y-%m-%d") + ".csv"
        load_to_hdfs = BashOperator(
            task_id="put_" + channel + "_to_hdfs",
            bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
            local_dir + file_name + " " + hdfs_dir + channel + "/")

        load_to_hdfs << analyze_tweets
Example #13
    for channel in to_channels:

        file_name = f"to_{channel}_{dt}.csv"

        load_to_hdfs = BashOperator(
            task_id=f"put_{channel}_to_hdfs",
            bash_command=(
                f"HADOOP_USER_NAME=hdfs hadoop fs -put -f {local_dir}{file_name} {hdfs_dir}{channel}/"
            ),
        )

        # [START create_hive]
        load_to_hive = HiveOperator(
            task_id=f"load_{channel}_to_hive",
            hql=(f"LOAD DATA INPATH '{hdfs_dir}{channel}/{file_name}'"
                 f"INTO TABLE {channel}"
                 f"PARTITION(dt='{dt}')"),
        )
        # [END create_hive]

        analyze >> load_to_hdfs >> load_to_hive >> hive_to_mysql

    for channel in from_channels:
        file_name = f"from_{channel}_{dt}.csv"
        load_to_hdfs = BashOperator(
            task_id=f"put_{channel}_to_hdfs",
            bash_command=(
                f"HADOOP_USER_NAME=hdfs hadoop fs -put -f {local_dir}{file_name} {hdfs_dir}{channel}/"
            ),
        )
Example #14
        bash_command="""
            hdfs dfs -mkdir -p /forex && \
            hdfs dfs -put -f $AIRFLOW_HOME/dags/files/forex_rates.json /forex
        """
    )

    creating_forex_rates_table = HiveOperator(
        task_id="creating_forex_rates_table",
        hive_cli_conn_id="hive_default",
        hql="""
            CREATE EXTERNAL TABLE IF NOT EXISTS forex_rates(
                base STRING,
                last_update DATE,
                eur DOUBLE,
                usd DOUBLE,
                nzd DOUBLE,
                gbp DOUBLE,
                jpy DOUBLE,
                cad DOUBLE
                )
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY ','
            STORED AS TEXTFILE
        """
    )

    forex_processing = SparkSubmitOperator(
        task_id='forex_processing',
        conn_id='spark_conn',
        application="/User/arunraja/airflow/dags/scripts/forex_processing.py",
        verbose=False,
Example #15
 create_hive_table = HiveOperator(task_id='create_hive_table',
                                  hive_cli_conn_id='hive_conn',
                                  hql="""
         CREATE EXTERNAL TABLE IF NOT EXISTS cov_data(
             iso_code STRING,
             continent STRING,
             location STRING,
             `date` STRING,
             total_cases BIGINT,
             new_cases BIGINT,
             new_cases_smoothed FLOAT,
             total_deaths BIGINT,
             new_deaths BIGINT,
             new_deaths_smoothed INT,
             total_cases_per_million BIGINT,
             new_cases_per_million FLOAT,
             new_cases_smoothed_per_million FLOAT,
             total_deaths_per_million FLOAT,
             new_deaths_per_million FLOAT,
             new_deaths_smoothed_per_million FLOAT,
             reproduction_rate FLOAT,
             icu_patients INT,
             icu_patients_per_million FLOAT,
             hosp_patients BIGINT,
             hosp_patients_per_million FLOAT,
             weekly_icu_admissions INT,
             weekly_icu_admissions_per_million FLOAT,
             weekly_hosp_admissions BIGINT,
             weekly_hosp_admissions_per_million FLOAT,
             total_tests BIGINT,
             new_tests BIGINT,
             total_tests_per_thousand FLOAT,
             new_tests_per_thousand FLOAT,
             new_tests_smoothed BIGINT,
             new_tests_smoothed_per_thousand FLOAT,
             positive_rate FLOAT,
             tests_per_case BIGINT,
             tests_units STRING,
             total_vaccinations BIGINT,
             people_vaccinated BIGINT,
             people_fully_vaccinated BIGINT,
             total_boosters BIGINT,
             new_vaccinations BIGINT,
             new_vaccinations_smoothed BIGINT,
             total_vaccinations_per_hundred INT,
             people_vaccinated_per_hundred INT,
             people_fully_vaccinated_per_hundred INT,
             total_boosters_per_hundred INT,
             new_vaccinations_smoothed_per_million INT,
             new_people_vaccinated_smoothed BIGINT,
             new_people_vaccinated_smoothed_per_hundred FLOAT,
             stringency_index INT,
             population BIGINT,
             population_density BIGINT,
             median_age FLOAT,
             aged_65_older FLOAT,
             aged_70_older FLOAT,
             gdp_per_capita BIGINT,
             extreme_poverty INT,
             cardiovasc_death_rate FLOAT,
             diabetes_prevalence FLOAT,
             female_smokers FLOAT,
             male_smokers FLOAT,
             handwashing_facilities INT,
             hospital_beds_per_thousand INT,
             life_expectancy FLOAT,
             human_development_index FLOAT,
             excess_mortality_cumulative_absolute BIGINT,
             excess_mortality_cumulative FLOAT,
             excess_mortality INT,
             excess_mortality_cumulative_per_million BIGINT 
             )
         COMMENT 'Main Table'
         ROW FORMAT DELIMITED
         FIELDS TERMINATED BY ','
         TBLPROPERTIES ("skip.header.line.count"="1");
     """)
Example #16
 def test_hive(self):
     op = HiveOperator(task_id='basic_hql', hql=self.hql, dag=self.dag)
     op.run(start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_ti_state=True)
Example #17
 def test_hive_dryrun(self):
     op = HiveOperator(task_id='dry_run_basic_hql',
                       hql=self.hql,
                       dag=self.dag)
     op.dry_run()