from airflow import DAG
from airflow.providers.apache.sqoop.operators.sqoop import SqoopOperator
from airflow.operators.email import EmailOperator
from datetime import datetime, timedelta
import time
import requests
import json

# Default arguments applied to every task in the DAG
default_args = {
    "owner": "airflow",
    "email_on_failure": True,
    "email_on_retry": True,
    "email": "*****@*****.**",
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}

with DAG(
    "twitter",
    start_date=datetime(2021, 1, 1),
    schedule_interval="*/10 * * * *",
    default_args=default_args,
    catchup=False,
) as dag:

    # Sqoop export: push the Hive/HCatalog "twitter" table into the "twitter"
    # table of the database configured on the "sqoop_conn" connection
    hive_to_mysql = SqoopOperator(
        task_id="hive_to_mysql",
        conn_id="sqoop_conn",
        cmd_type="export",
        table="twitter",
        hcatalog_table="twitter",
    )
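# Hypothetical extension (not part of the original DAG): EmailOperator is imported
# above but never used. A notification task like the one sketched below could
# confirm that the export finished; the task_id, subject, and body text are
# illustrative assumptions, and the recipient stays masked as in the original.
notify_success = EmailOperator(
    task_id="notify_success",
    to="*****@*****.**",
    subject="twitter DAG: hive_to_mysql export complete",
    html_content="The Sqoop export of the Hive 'twitter' table to MySQL finished.",
    dag=dag,
)

hive_to_mysql >> notify_success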
def test_execute(self):
    """
    Verify that the SqoopOperator's attributes match the values passed in via the config.
    """
    operator = SqoopOperator(task_id='sqoop_job', dag=self.dag, **self._config)

    self.assertEqual(self._config['conn_id'], operator.conn_id)
    self.assertEqual(self._config['query'], operator.query)
    self.assertEqual(self._config['cmd_type'], operator.cmd_type)
    self.assertEqual(self._config['table'], operator.table)
    self.assertEqual(self._config['target_dir'], operator.target_dir)
    self.assertEqual(self._config['append'], operator.append)
    self.assertEqual(self._config['file_type'], operator.file_type)
    self.assertEqual(self._config['num_mappers'], operator.num_mappers)
    self.assertEqual(self._config['split_by'], operator.split_by)
    self.assertEqual(self._config['input_null_string'], operator.input_null_string)
    self.assertEqual(self._config['input_null_non_string'], operator.input_null_non_string)
    self.assertEqual(self._config['staging_table'], operator.staging_table)
    self.assertEqual(self._config['clear_staging_table'], operator.clear_staging_table)
    self.assertEqual(self._config['batch'], operator.batch)
    self.assertEqual(self._config['relaxed_isolation'], operator.relaxed_isolation)
    self.assertEqual(self._config['direct'], operator.direct)
    self.assertEqual(self._config['driver'], operator.driver)
    self.assertEqual(self._config['properties'], operator.properties)
    self.assertEqual(self._config['hcatalog_database'], operator.hcatalog_database)
    self.assertEqual(self._config['hcatalog_table'], operator.hcatalog_table)
    self.assertEqual(self._config['create_hcatalog_table'], operator.create_hcatalog_table)
    self.assertEqual(self._config['extra_import_options'], operator.extra_import_options)
    self.assertEqual(self._config['extra_export_options'], operator.extra_export_options)

    # The following operators are meant more as usage examples
    SqoopOperator(
        task_id='sqoop_import_using_table',
        cmd_type='import',
        conn_id='sqoop_default',
        table='company',
        verbose=True,
        num_mappers=8,
        hcatalog_database='default',
        hcatalog_table='import_table_1',
        create_hcatalog_table=True,
        extra_import_options={'hcatalog-storage-stanza': "\"stored as orcfile\""},
        dag=self.dag,
    )
    SqoopOperator(
        task_id='sqoop_import_using_query',
        cmd_type='import',
        conn_id='sqoop_default',
        query='select name, age from company where $CONDITIONS',
        split_by='age',  # the mappers substitute values into $CONDITIONS based on the split-by field
        verbose=True,
        num_mappers=None,
        hcatalog_database='default',
        hcatalog_table='import_table_2',
        create_hcatalog_table=True,
        extra_import_options={'hcatalog-storage-stanza': "\"stored as orcfile\""},
        dag=self.dag,
    )
    SqoopOperator(
        task_id='sqoop_import_with_partition',
        cmd_type='import',
        conn_id='sqoop_default',
        table='company',
        verbose=True,
        num_mappers=None,
        hcatalog_database='default',
        hcatalog_table='import_table_3',
        create_hcatalog_table=True,
        extra_import_options={
            'hcatalog-storage-stanza': "\"stored as orcfile\"",
            'hive-partition-key': 'day',
            'hive-partition-value': '2017-10-18',
            'fetch-size': 1,
        },
        dag=self.dag,
    )
    SqoopOperator(
        task_id='sqoop_export_tablename',
        cmd_type='export',
        conn_id='sqoop_default',
        table='rbdms_export_table_1',
        verbose=True,
        num_mappers=None,
        hcatalog_database='default',
        hcatalog_table='hive_export_table_1',
        extra_export_options=None,
        dag=self.dag,
    )
    SqoopOperator(
        task_id='sqoop_export_tablepath',
        cmd_type='export',
        conn_id='sqoop_default',
        table='rbdms_export_table_2',
        export_dir='/user/hive/warehouse/export_table_2',
        direct=True,  # direct mode can speed up the transfer when the database supports it
        verbose=True,
        num_mappers=None,
        extra_export_options=None,
        dag=self.dag,
    )
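# Not shown in this excerpt: test_execute relies on self.dag and self._config,
# which are presumably built in the TestCase's setUp. Below is a minimal,
# self-contained sketch of such a fixture; the class name and all concrete
# values are illustrative assumptions, and only the dictionary keys are
# implied by the assertions above.
import unittest
from datetime import datetime

from airflow.models.dag import DAG


class SqoopOperatorFixtureSketch(unittest.TestCase):
    def setUp(self):
        self.dag = DAG("test_dag_id", start_date=datetime(2021, 1, 1))
        self._config = {
            "conn_id": "sqoop_default",
            "cmd_type": "export",
            "table": "target_table",
            "query": "SELECT * FROM schema.table",
            "target_dir": "/user/hive/warehouse/target_table",
            "append": True,
            "file_type": "avro",
            "num_mappers": None,
            "split_by": "id",
            "input_null_string": "\\N",      # how SQL NULLs appear in string columns
            "input_null_non_string": "\\N",  # ... and in non-string columns
            "staging_table": "target_table_staging",
            "clear_staging_table": True,
            "batch": True,
            "relaxed_isolation": True,
            "direct": True,
            "driver": "com.mysql.jdbc.Driver",
            "properties": {"mapred.map.max.attempts": "1"},
            "hcatalog_database": "default",
            "hcatalog_table": "hive_table",
            "create_hcatalog_table": True,
            "extra_import_options": {"hcatalog-storage-stanza": '"stored as orcfile"'},
            "extra_export_options": {"update-key": "id", "update-mode": "allowinsert"},
        }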