def setUp(self):
    self._upload_dataframe()
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG('test_dag_id', default_args=args)
    self.database = 'airflow'
    self.table = 'hive_server_hook'
    self.hql = """
    CREATE DATABASE IF NOT EXISTS {{ params.database }};
    USE {{ params.database }};
    DROP TABLE IF EXISTS {{ params.table }};
    CREATE TABLE IF NOT EXISTS {{ params.table }} (
        a int,
        b int)
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';
    LOAD DATA LOCAL INPATH '{{ params.csv_path }}'
    OVERWRITE INTO TABLE {{ params.table }};
    """
    self.columns = ['{}.a'.format(self.table),
                    '{}.b'.format(self.table)]
    self.hook = HiveMetastoreHook()
    t = HiveOperator(
        task_id='HiveHook_' + str(random.randint(1, 10000)),
        params={
            'database': self.database,
            'table': self.table,
            'csv_path': self.local_path
        },
        hive_cli_conn_id='hive_cli_default',
        hql=self.hql,
        dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
          ignore_ti_state=True)
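# The setUp above calls self._upload_dataframe() and later reads
# self.local_path, neither of which appears in this section. A minimal
# sketch of what that helper presumably does; the sample values and the
# temp-file path are assumptions, not taken from the original:
def _upload_dataframe(self):
    import pandas as pd  # assumed fixture dependency

    df = pd.DataFrame({'a': [1, 2], 'b': [1, 2]})
    self.local_path = '/tmp/to_upload.csv'
    # Written without header or index so the two int columns line up with
    # the comma-delimited (a int, b int) table created in self.hql.
    df.to_csv(self.local_path, header=False, index=False)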
def setUp(self):
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG('test_dag_id', default_args=args)
    self.next_day = (DEFAULT_DATE +
                     datetime.timedelta(days=1)).isoformat()[:10]
    self.database = 'airflow'
    self.partition_by = 'ds'
    self.table = 'static_babynames_partitioned'
    self.hql = """
    CREATE DATABASE IF NOT EXISTS {{ params.database }};
    USE {{ params.database }};
    DROP TABLE IF EXISTS {{ params.table }};
    CREATE TABLE IF NOT EXISTS {{ params.table }} (
        state string,
        year string,
        name string,
        gender string,
        num int)
    PARTITIONED BY ({{ params.partition_by }} string);
    ALTER TABLE {{ params.table }}
    ADD PARTITION({{ params.partition_by }}='{{ ds }}');
    """
    self.hook = HiveMetastoreHook()
    t = HiveOperator(
        task_id='HiveHook_' + str(random.randint(1, 10000)),
        params={
            'database': self.database,
            'table': self.table,
            'partition_by': self.partition_by
        },
        hive_cli_conn_id='hive_cli_default',
        hql=self.hql,
        dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
          ignore_ti_state=True)
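# DEFAULT_DATE is used throughout but never defined in this section.
# Judging by the 'airflow.ctx.execution_date=2015-01-01T00:00:00+00:00'
# assertion in the mocked test_beeline below, it is presumably a
# timezone-aware constant along these lines:
#
#     from airflow.utils import timezone
#     DEFAULT_DATE = timezone.datetime(2015, 1, 1)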
def setUp(self):
    configuration.load_test_config()
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG('test_dag_id', default_args=args)
    self.next_day = (DEFAULT_DATE +
                     datetime.timedelta(days=1)).isoformat()[:10]
    self.database = 'airflow'
    self.partition_by = 'ds'
    self.table = 'static_babynames_partitioned'
    self.hql = """
    CREATE DATABASE IF NOT EXISTS {{ params.database }};
    USE {{ params.database }};
    DROP TABLE IF EXISTS {{ params.table }};
    CREATE TABLE IF NOT EXISTS {{ params.table }} (
        state string,
        year string,
        name string,
        gender string,
        num int)
    PARTITIONED BY ({{ params.partition_by }} string);
    ALTER TABLE {{ params.table }}
    ADD PARTITION({{ params.partition_by }}='{{ ds }}');
    """
    self.hook = HiveMetastoreHook()
    t = HiveOperator(
        task_id='HiveHook_' + str(random.randint(1, 10000)),
        params={
            'database': self.database,
            'table': self.table,
            'partition_by': self.partition_by
        },
        hive_cli_conn_id='beeline_default',
        hql=self.hql,
        dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
          ignore_ti_state=True)
# The two mock arguments imply two patch decorators that the flattened
# source dropped. The targets below are an assumption: Popen is patched
# where the hook spawns the subprocess, and tempfile's random-name
# generator is patched so the scratch dir/file names ('tst') are
# deterministic.
@mock.patch('tempfile._RandomNameSequence.__next__')
@mock.patch('subprocess.Popen')
def test_beeline(self, mock_popen, mock_temp_dir):
    mock_subprocess = MockSubProcess()
    mock_popen.return_value = mock_subprocess
    mock_temp_dir.return_value = "tst"

    hive_cmd = [
        'beeline',
        '-u', '"jdbc:hive2://localhost:10000/default"',
        '-hiveconf', 'airflow.ctx.dag_id=test_dag_id',
        '-hiveconf', 'airflow.ctx.task_id=beeline_hql',
        '-hiveconf', 'airflow.ctx.execution_date=2015-01-01T00:00:00+00:00',
        '-hiveconf', 'airflow.ctx.dag_run_id=',
        '-hiveconf', 'airflow.ctx.dag_owner=airflow',
        '-hiveconf', 'airflow.ctx.dag_email=',
        '-hiveconf', 'mapreduce.job.queuename=airflow',
        '-hiveconf', 'mapred.job.queue.name=airflow',
        '-hiveconf', 'tez.queue.name=airflow',
        '-hiveconf', 'mapred.job.name=test_job_name',
        '-f', '/tmp/airflow_hiveop_tst/tmptst'
    ]

    op = HiveOperator(
        task_id='beeline_hql',
        hive_cli_conn_id='hive_cli_default',
        hql=self.hql,
        dag=self.dag,
        mapred_job_name="test_job_name")
    op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
           ignore_ti_state=True)

    mock_popen.assert_called_with(
        hive_cmd,
        stdout=mock_subprocess.PIPE,
        stderr=mock_subprocess.STDOUT,
        cwd="/tmp/airflow_hiveop_tst",
        close_fds=True)
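# MockSubProcess is also not shown in this section. For the assertion
# above it only has to stand in for a Popen object: expose PIPE/STDOUT
# constants, give the hook a stdout stream to drain, and answer wait().
# A minimal sketch under those assumptions:
import io


class MockSubProcess:
    PIPE = -1    # mirrors subprocess.PIPE
    STDOUT = -2  # mirrors subprocess.STDOUT
    returncode = 0

    def __init__(self, output=b''):
        # the hook reads stdout line by line until EOF, so any
        # already-finished byte stream is enough
        self.stdout = io.BytesIO(output)

    def wait(self):
        return self.returncode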
def setUp(self):
    configuration.load_test_config()
    self._upload_dataframe()
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG('test_dag_id', default_args=args)
    self.database = 'airflow'
    self.table = 'hive_server_hook'
    self.hql = """
    CREATE DATABASE IF NOT EXISTS {{ params.database }};
    USE {{ params.database }};
    DROP TABLE IF EXISTS {{ params.table }};
    CREATE TABLE IF NOT EXISTS {{ params.table }} (
        a int,
        b int)
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';
    LOAD DATA LOCAL INPATH '{{ params.csv_path }}'
    OVERWRITE INTO TABLE {{ params.table }};
    """
    self.columns = ['{}.a'.format(self.table),
                    '{}.b'.format(self.table)]
    self.hook = HiveMetastoreHook()
    t = HiveOperator(
        task_id='HiveHook_' + str(random.randint(1, 10000)),
        params={
            'database': self.database,
            'table': self.table,
            'csv_path': self.local_path
        },
        hive_cli_conn_id='beeline_default',
        hql=self.hql,
        dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
          ignore_ti_state=True)
def test_beeline(self):
    t = HiveOperator(
        task_id='beeline_hql',
        hive_cli_conn_id='hive_cli_default',
        hql=self.hql,
        dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
          ignore_ti_state=True)
def test_hive_queues(self):
    t = HiveOperator(
        task_id='test_hive_queues',
        hql=self.hql,
        mapred_queue='default',
        mapred_queue_priority='HIGH',
        mapred_job_name='airflow.test_hive_queues',
        dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
          ignore_ti_state=True)
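# For reference, the queue settings above reach Hive as -hiveconf pairs
# built by HiveCliHook, roughly as follows (the queue-name and job-name
# keys match the beeline command asserted in the mocked test earlier;
# the priority key is an assumption):
#   -hiveconf mapreduce.job.queuename=default
#   -hiveconf mapred.job.queue.name=default
#   -hiveconf mapreduce.job.priority=HIGH
#   -hiveconf mapred.job.name=airflow.test_hive_queues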
def test_hive(self):
    t = HiveOperator(task_id='basic_hql', hql=self.hql, dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
          ignore_ti_state=True)
def test_beeline(self):
    t = HiveOperator(
        task_id='beeline_hql',
        hive_cli_conn_id='beeline_default',
        hql=self.hql,
        dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
          ignore_ti_state=True)
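# Same test body as the earlier test_beeline, but pointed at the
# 'beeline_default' connection, which is expected to carry a HiveServer2
# JDBC endpoint (cf. the "jdbc:hive2://localhost:10000/default" URL
# asserted in the mocked variant) rather than the hive CLI binary.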