def test_hiveconf_jinja_translate(self):
    hql = "SELECT ${num_col} FROM ${hiveconf:table};"
    op = HiveOperator(
        hiveconf_jinja_translate=True,
        task_id='dry_run_basic_hql', hql=hql, dag=self.dag)
    op.prepare_template()
    self.assertEqual(op.hql, "SELECT {{ num_col }} FROM {{ table }};")
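# A minimal sketch (not necessarily the operator's exact implementation) of
# what hiveconf_jinja_translate is expected to do: rewrite ${var} and
# ${hiveconf:var} placeholders into Jinja {{ var }} syntax before rendering.
def _translate_hiveconf_sketch(hql):
    # Hypothetical helper for illustration only.
    import re
    return re.sub(r"\$\{(?:hiveconf:)?([ a-zA-Z0-9_]*)\}", r"{{ \1 }}", hql)

# _translate_hiveconf_sketch("SELECT ${num_col} FROM ${hiveconf:table};")
# == "SELECT {{ num_col }} FROM {{ table }};"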
def test_beeline(self, mock_popen, mock_temp_dir):
    mock_subprocess = MockSubProcess()
    mock_popen.return_value = mock_subprocess
    mock_temp_dir.return_value = "tst"

    hive_cmd = [
        'beeline', '-u', '"jdbc:hive2://localhost:10000/default"',
        '-hiveconf', 'airflow.ctx.dag_id=test_dag_id',
        '-hiveconf', 'airflow.ctx.task_id=beeline_hql',
        '-hiveconf', 'airflow.ctx.execution_date=2015-01-01T00:00:00+00:00',
        '-hiveconf', 'airflow.ctx.dag_run_id=',
        '-hiveconf', 'airflow.ctx.dag_owner=airflow',
        '-hiveconf', 'airflow.ctx.dag_email=',
        '-hiveconf', 'mapreduce.job.queuename=airflow',
        '-hiveconf', 'mapred.job.queue.name=airflow',
        '-hiveconf', 'tez.queue.name=airflow',
        '-hiveconf', 'mapred.job.name=test_job_name',
        '-f', '/tmp/airflow_hiveop_tst/tmptst',
    ]

    op = HiveOperator(
        task_id='beeline_hql', hive_cli_conn_id='hive_cli_default',
        hql=self.hql, dag=self.dag, mapred_job_name="test_job_name")
    op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
           ignore_ti_state=True)

    mock_popen.assert_called_with(
        hive_cmd,
        stdout=mock_subprocess.PIPE,
        stderr=mock_subprocess.STDOUT,
        cwd="/tmp/airflow_hiveop_tst",
        close_fds=True)
def setUp(self):
    self._upload_dataframe()
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG('test_dag_id', default_args=args)
    self.database = 'airflow'
    self.table = 'hive_server_hook'
    self.hql = """
    CREATE DATABASE IF NOT EXISTS {{ params.database }};
    USE {{ params.database }};
    DROP TABLE IF EXISTS {{ params.table }};
    CREATE TABLE IF NOT EXISTS {{ params.table }} (
        a int,
        b int)
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';
    LOAD DATA LOCAL INPATH '{{ params.csv_path }}'
    OVERWRITE INTO TABLE {{ params.table }};
    """
    self.columns = ['{}.a'.format(self.table), '{}.b'.format(self.table)]
    self.hook = HiveMetastoreHook()
    op = HiveOperator(
        task_id='HiveHook_' + str(random.randint(1, 10000)),
        params={
            'database': self.database,
            'table': self.table,
            'csv_path': self.local_path,
        },
        hive_cli_conn_id='hive_cli_default',
        hql=self.hql,
        dag=self.dag)
    op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
           ignore_ti_state=True)
def setUp(self):
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG('test_dag_id', default_args=args)
    self.next_day = (DEFAULT_DATE + timedelta(days=1)).isoformat()[:10]
    self.database = 'airflow'
    self.partition_by = 'ds'
    self.table = 'static_babynames_partitioned'
    self.hql = """
    CREATE DATABASE IF NOT EXISTS {{ params.database }};
    USE {{ params.database }};
    DROP TABLE IF EXISTS {{ params.table }};
    CREATE TABLE IF NOT EXISTS {{ params.table }} (
        state string,
        year string,
        name string,
        gender string,
        num int)
    PARTITIONED BY ({{ params.partition_by }} string);
    ALTER TABLE {{ params.table }}
    ADD PARTITION({{ params.partition_by }}='{{ ds }}');
    """
    self.hook = HiveMetastoreHook()
    op = HiveOperator(
        task_id='HiveHook_' + str(random.randint(1, 10000)),
        params={
            'database': self.database,
            'table': self.table,
            'partition_by': self.partition_by,
        },
        hive_cli_conn_id='hive_cli_default',
        hql=self.hql,
        dag=self.dag)
    op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
           ignore_ti_state=True)
def test_hive_dryrun(self, mock_popen, mock_temp_dir):
    mock_subprocess = MockSubProcess()
    mock_popen.return_value = mock_subprocess
    mock_temp_dir.return_value = "tst"

    op = HiveOperator(task_id='dry_run_basic_hql', hql=self.hql, dag=self.dag)
    op.dry_run()

    hive_cmd = [
        'beeline', '-u', '"jdbc:hive2://localhost:10000/default"',
        '-hiveconf', 'airflow.ctx.dag_id=',
        '-hiveconf', 'airflow.ctx.task_id=',
        '-hiveconf', 'airflow.ctx.execution_date=',
        '-hiveconf', 'airflow.ctx.dag_run_id=',
        '-hiveconf', 'airflow.ctx.dag_owner=',
        '-hiveconf', 'airflow.ctx.dag_email=',
        '-hiveconf', 'mapreduce.job.queuename=airflow',
        '-hiveconf', 'mapred.job.queue.name=airflow',
        '-hiveconf', 'tez.queue.name=airflow',
        '-f', '/tmp/airflow_hiveop_tst/tmptst',
    ]
    mock_popen.assert_called_with(
        hive_cmd,
        stdout=mock_subprocess.PIPE,
        stderr=mock_subprocess.STDOUT,
        cwd="/tmp/airflow_hiveop_tst",
        close_fds=True)
def test_hive_queues(self):
    op = HiveOperator(
        task_id='test_hive_queues',
        hql=self.hql,
        mapred_queue='default',
        mapred_queue_priority='HIGH',
        mapred_job_name='airflow.test_hive_queues',
        dag=self.dag)
    op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
           ignore_ti_state=True)
def test_beeline(self):
    op = HiveOperator(
        task_id='beeline_hql', hive_cli_conn_id='hive_cli_default',
        hql=self.hql, dag=self.dag)
    op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
           ignore_ti_state=True)
def test_hiveconf(self):
    hql = "SELECT * FROM ${hiveconf:table} PARTITION (${hiveconf:day});"
    op = HiveOperator(
        hiveconfs={'table': 'static_babynames', 'day': '{{ ds }}'},
        task_id='dry_run_basic_hql', hql=hql, dag=self.dag)
    op.prepare_template()
    self.assertEqual(
        op.hql,
        "SELECT * FROM ${hiveconf:table} PARTITION (${hiveconf:day});")
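# Note: without hiveconf_jinja_translate=True, prepare_template() leaves the
# ${hiveconf:...} placeholders untouched (as asserted above); the `hiveconfs`
# dict is instead passed to Hive on the command line. A rough sketch of that
# expansion, using a hypothetical flattening helper (illustrative, not the
# hook's exact code):
def _expand_hiveconfs_sketch(hiveconfs):
    # {'table': 'static_babynames'} -> ['-hiveconf', 'table=static_babynames']
    pairs = []
    for key, value in hiveconfs.items():
        pairs += ['-hiveconf', f'{key}={value}']
    return pairs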
def test_hive_airflow_default_config_queue_override(self):
    specific_mapred_queue = 'default'
    op = HiveOperator(
        task_id='test_default_config_queue',
        hql=self.hql,
        mapred_queue=specific_mapred_queue,
        mapred_queue_priority='HIGH',
        mapred_job_name='airflow.test_default_config_queue',
        dag=self.dag)

    self.assertEqual(op.get_hook().mapred_queue, specific_mapred_queue)
def test_hive_airflow_default_config_queue(self):
    op = HiveOperator(
        task_id='test_default_config_queue',
        hql=self.hql,
        mapred_queue_priority='HIGH',
        mapred_job_name='airflow.test_default_config_queue',
        dag=self.dag)

    # Just check that the correct default value in test_default.cfg is used.
    test_config_hive_mapred_queue = conf.get('hive', 'default_hive_mapred_queue')
    self.assertEqual(op.get_hook().mapred_queue, test_config_hive_mapred_queue)
def test_mapred_job_name(self, mock_get_hook):
    mock_hook = mock.MagicMock()
    mock_get_hook.return_value = mock_hook
    op = HiveOperator(
        task_id='test_mapred_job_name',
        hql=self.hql,
        dag=self.dag)

    fake_execution_date = timezone.datetime(2018, 6, 19)
    fake_ti = TaskInstance(task=op, execution_date=fake_execution_date)
    fake_ti.hostname = 'fake_hostname'
    fake_context = {'ti': fake_ti}

    op.execute(fake_context)
    self.assertEqual(
        "Airflow HiveOperator task for {}.{}.{}.{}".format(
            fake_ti.hostname,
            self.dag.dag_id, op.task_id,
            fake_execution_date.isoformat()),
        mock_hook.mapred_job_name)
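# For reference, the string asserted above matches the name the operator
# builds at execute() time, assuming the default mapred_job_name_template:
#
#   "Airflow HiveOperator task for {hostname}.{dag_id}.{task_id}.{execution_date}"
#
# which here renders to
# "Airflow HiveOperator task for fake_hostname.test_dag_id.test_mapred_job_name.2018-06-19T00:00:00+00:00".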
for channel in to_channels:
    file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"

    # Note: `hadoop fs -put -f <src> <dst>` takes two arguments, so a space
    # is needed between the local file and the HDFS destination.
    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name + " " +
                     hdfs_dir + channel + "/")
    load_to_hdfs << analyze_tweets

    load_to_hive = HiveOperator(
        task_id="load_" + channel + "_to_hive",
        hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' "
            "INTO TABLE " + channel + " "
            "PARTITION(dt='" + dt + "')")
    load_to_hive << load_to_hdfs
    load_to_hive >> hive_to_mysql

for channel in from_channels:
    file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"

    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name + " " +
                     hdfs_dir + channel + "/")
    load_to_hdfs << analyze_tweets
for channel in to_channels:
    file_name = f"to_{channel}_{dt}.csv"

    load_to_hdfs = BashOperator(
        task_id=f"put_{channel}_to_hdfs",
        bash_command=(
            f"HADOOP_USER_NAME=hdfs hadoop fs -put -f {local_dir}{file_name} {hdfs_dir}{channel}/"
        ),
    )

    # [START create_hive]
    # Trailing spaces matter here: adjacent f-strings are concatenated
    # without any separator.
    load_to_hive = HiveOperator(
        task_id=f"load_{channel}_to_hive",
        hql=(
            f"LOAD DATA INPATH '{hdfs_dir}{channel}/{file_name}' "
            f"INTO TABLE {channel} "
            f"PARTITION(dt='{dt}')"
        ),
    )
    # [END create_hive]

    analyze >> load_to_hdfs >> load_to_hive >> hive_to_mysql

for channel in from_channels:
    file_name = f"from_{channel}_{dt}.csv"

    load_to_hdfs = BashOperator(
        task_id=f"put_{channel}_to_hdfs",
        bash_command=(
            f"HADOOP_USER_NAME=hdfs hadoop fs -put -f {local_dir}{file_name} {hdfs_dir}{channel}/"
        ),
    )
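# The [START create_hive] / [END create_hive] comments above are marker
# anchors of the kind used throughout the Airflow repository: the docs build
# excerpts the code between them into the documentation. They have no effect
# at runtime.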
bash_command=""" hdfs dfs -mkdir -p /forex && \ hdfs dfs -put -f $AIRFLOW_HOME/dags/files/forex_rates.json /forex """ ) creating_forex_rates_table = HiveOperator( task_id="creating_forex_rates_table", hive_cli_conn_id="hive_default", hql=""" CREATE EXTERNAL TABLE IF NOT EXISTS forex_rates( base STRING, last_update DATE, eur DOUBLE, usd DOUBLE, nzd DOUBLE, gbp DOUBLE, jpy DOUBLE, cad DOUBLE ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE """ ) forex_processing = SparkSubmitOperator( task_id='forex_processing', conn_id='spark_conn', application="/User/arunraja/airflow/dags/scripts/forex_processing.py", verbose=False,
create_hive_table = HiveOperator(
    task_id='create_hive_table',
    hive_cli_conn_id='hive_conn',
    hql="""
        CREATE EXTERNAL TABLE IF NOT EXISTS cov_data(
            iso_code STRING,
            continent STRING,
            location STRING,
            `date` STRING,
            total_cases BIGINT,
            new_cases BIGINT,
            new_cases_smoothed FLOAT,
            total_deaths BIGINT,
            new_deaths BIGINT,
            new_deaths_smoothed INT,
            total_cases_per_million BIGINT,
            new_cases_per_million FLOAT,
            new_cases_smoothed_per_million FLOAT,
            total_deaths_per_million FLOAT,
            new_deaths_per_million FLOAT,
            new_deaths_smoothed_per_million FLOAT,
            reproduction_rate FLOAT,
            icu_patients INT,
            icu_patients_per_million FLOAT,
            hosp_patients BIGINT,
            hosp_patients_per_million FLOAT,
            weekly_icu_admissions INT,
            weekly_icu_admissions_per_million FLOAT,
            weekly_hosp_admissions BIGINT,
            weekly_hosp_admissions_per_million FLOAT,
            total_tests BIGINT,
            new_tests BIGINT,
            total_tests_per_thousand FLOAT,
            new_tests_per_thousand FLOAT,
            new_tests_smoothed BIGINT,
            new_tests_smoothed_per_thousand FLOAT,
            positive_rate FLOAT,
            tests_per_case BIGINT,
            tests_units STRING,
            total_vaccinations BIGINT,
            people_vaccinated BIGINT,
            people_fully_vaccinated BIGINT,
            total_boosters BIGINT,
            new_vaccinations BIGINT,
            new_vaccinations_smoothed BIGINT,
            total_vaccinations_per_hundred INT,
            people_vaccinated_per_hundred INT,
            people_fully_vaccinated_per_hundred INT,
            total_boosters_per_hundred INT,
            new_vaccinations_smoothed_per_million INT,
            new_people_vaccinated_smoothed BIGINT,
            new_people_vaccinated_smoothed_per_hundred FLOAT,
            stringency_index INT,
            population BIGINT,
            population_density BIGINT,
            median_age FLOAT,
            aged_65_older FLOAT,
            aged_70_older FLOAT,
            gdp_per_capita BIGINT,
            extreme_poverty INT,
            cardiovasc_death_rate FLOAT,
            diabetes_prevalence FLOAT,
            female_smokers FLOAT,
            male_smokers FLOAT,
            handwashing_facilities INT,
            hospital_beds_per_thousand INT,
            life_expectancy FLOAT,
            human_development_index FLOAT,
            excess_mortality_cumulative_absolute BIGINT,
            excess_mortality_cumulative FLOAT,
            excess_mortality INT,
            excess_mortality_cumulative_per_million BIGINT
        )
        COMMENT 'Main Table'
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ','
        TBLPROPERTIES ("skip.header.line.count"="1");
    """)
def test_hive(self):
    op = HiveOperator(task_id='basic_hql', hql=self.hql, dag=self.dag)
    op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
           ignore_ti_state=True)
def test_hive_dryrun(self):
    op = HiveOperator(task_id='dry_run_basic_hql', hql=self.hql, dag=self.dag)
    op.dry_run()