def test_export_cmd(self):
    """
    Check that ``SqoopHook._export_cmd`` renders every configured export option.
    """
    config = self._config_export
    keyword_names = (
        'input_null_string',
        'input_null_non_string',
        'staging_table',
        'clear_staging_table',
        'enclosed_by',
        'escaped_by',
        'input_fields_terminated_by',
        'input_lines_terminated_by',
        'input_optionally_enclosed_by',
        'batch',
        'relaxed_isolation',
        'extra_export_options',
    )
    hook = SqoopHook()

    # _export_cmd yields a sequence of arguments; join it on spaces so that
    # "flag value" pairs can be matched as plain substrings.
    cmd = ' '.join(
        hook._export_cmd(
            config['table'],
            config['export_dir'],
            **{name: config[name] for name in keyword_names}
        )
    )

    # Options that must show up as "<flag> <configured value>".
    valued_options = (
        ("--input-null-string", 'input_null_string'),
        ("--input-null-non-string", 'input_null_non_string'),
        ("--staging-table", 'staging_table'),
        ("--enclosed-by", 'enclosed_by'),
        ("--escaped-by", 'escaped_by'),
        ("--input-fields-terminated-by", 'input_fields_terminated_by'),
        ("--input-lines-terminated-by", 'input_lines_terminated_by'),
        ("--input-optionally-enclosed-by", 'input_optionally_enclosed_by'),
    )
    for flag, key in valued_options:
        self.assertIn("{} {}".format(flag, config[key]), cmd)

    # These come from the extra export options.
    for expected in ("--update-key id", "--update-mode allowinsert"):
        self.assertIn(expected, cmd)

    # Boolean switches are only expected when enabled in the config.
    switches = (
        ('clear_staging_table', "--clear-staging-table"),
        ('batch', "--batch"),
        ('relaxed_isolation', "--relaxed-isolation"),
    )
    for key, flag in switches:
        if config[key]:
            self.assertIn(flag, cmd)

    if config['extra_export_options']:
        for flag in ("--update-key", "--update-mode", "--fetch-size"):
            self.assertIn(flag, cmd)
def test_cmd_mask_password(self):
    """
    Check that ``cmd_mask_password`` hides the password value and leaves
    a command without a password unchanged.
    """
    hook = SqoopHook()

    # The value following --password must be replaced by the MASKED placeholder.
    with_password = ['--password', 'supersecret']
    masked = hook.cmd_mask_password(with_password)
    self.assertEqual(masked, ['--password', 'MASKED'])

    # A command that carries no password must compare equal to its input.
    without_password = ['--target', 'targettable']
    untouched = hook.cmd_mask_password(without_password)
    self.assertEqual(untouched, without_password)
def test_submit_none_mappers(self):
    """
    Check that ``--num-mappers`` is left out of the built command when
    num_mappers is None.
    """
    # Work on a copy so the shared fixture config keeps its num_mappers value.
    hook_kwargs = dict(self._config, num_mappers=None)
    hook = SqoopHook(**hook_kwargs)

    rendered = ' '.join(hook._prepare_command())

    self.assertNotIn('--num-mappers', rendered)
def test_submit(self):
    """
    Check that options from the connection extras and from the hook
    constructor are added to the Sqoop command.
    """
    config = self._config
    extras = self._config_json
    hook = SqoopHook(**config)
    cmd = ' '.join(hook._prepare_command())

    # Options extracted from the connection JSON; each one is only expected
    # when the corresponding value is set.
    json_options = (
        ("-fs", 'namenode'),
        ("-jt", 'job_tracker'),
        ("-libjars", 'libjars'),
        ("-files", 'files'),
        ("-archives", 'archives'),
    )
    for flag, key in json_options:
        value = extras[key]
        if value:
            self.assertIn("{} {}".format(flag, value), cmd)

    # HCatalog settings are asserted unconditionally.
    for flag, key in (
        ("--hcatalog-database", 'hcatalog_database'),
        ("--hcatalog-table", 'hcatalog_table'),
    ):
        self.assertIn("{} {}".format(flag, config[key]), cmd)

    # Regular options passed through the constructor.
    if config['verbose']:
        self.assertIn("--verbose", cmd)
    mappers = config['num_mappers']
    if mappers:
        self.assertIn("--num-mappers {}".format(mappers), cmd)
    for name, setting in config['properties'].items():
        self.assertIn("-D {}={}".format(name, setting), cmd)

    # The sqoop binary is not available here and is hard to mock, so for now
    # each entry point is simply expected to fail with OSError.
    with self.assertRaises(OSError):
        hook.export_table(**self._config_export)

    import_target = '/sqoop/example/path'
    with self.assertRaises(OSError):
        hook.import_table(table='schema.table', target_dir=import_target)

    with self.assertRaises(OSError):
        hook.import_query(query='SELECT * FROM sometable', target_dir=import_target)
def test_import_cmd(self):
    """
    Check that ``SqoopHook._import_cmd`` builds the expected Sqoop import
    command, with and without extra import options.
    """
    config = self._config_import
    hook = SqoopHook()

    def render(target_dir, extra_import_options):
        # _import_cmd yields a sequence of arguments; join it on spaces so the
        # options can be matched as plain substrings.
        return ' '.join(
            hook._import_cmd(
                target_dir,
                append=config['append'],
                file_type=config['file_type'],
                split_by=config['split_by'],
                direct=config['direct'],
                driver=config['driver'],
                extra_import_options=extra_import_options,
            )
        )

    # First pass: a target directory but no extra options.
    cmd = render(config['target_dir'], None)

    if config['append']:
        self.assertIn('--append', cmd)
    if config['direct']:
        self.assertIn('--direct', cmd)

    for flag, key in (
        ('--target-dir', 'target_dir'),
        ('--driver', 'driver'),
        ('--split-by', 'split_by'),
    ):
        self.assertIn('{} {}'.format(flag, config[key]), cmd)

    # These belong to the extra options, which were not passed this time.
    self.assertNotIn('--show', cmd)
    self.assertNotIn('hcatalog-storage-stanza \"stored as orcfile\"', cmd)

    # Second pass: no target directory, extra options supplied.
    cmd = render(None, config['extra_import_options'])

    self.assertNotIn('--target-dir', cmd)

    # These checks are from the extra import options.
    self.assertIn('--show', cmd)
    self.assertIn('hcatalog-storage-stanza \"stored as orcfile\"', cmd)
    self.assertIn('--fetch-size', cmd)
def _get_hook(self) -> SqoopHook:
    """
    Build a ``SqoopHook`` from this object's connection id and job settings.

    :return: a new hook created with conn_id, verbose, num_mappers,
        hcatalog_database, hcatalog_table and properties taken from ``self``
    """
    hook_kwargs = {
        'conn_id': self.conn_id,
        'verbose': self.verbose,
        'num_mappers': self.num_mappers,
        'hcatalog_database': self.hcatalog_database,
        'hcatalog_table': self.hcatalog_table,
        'properties': self.properties,
    }
    return SqoopHook(**hook_kwargs)
def test_get_export_format_argument(self):
    """
    Check that ``_get_export_format_argument`` maps each supported file type
    to its Sqoop flag and rejects an unknown type.
    """
    hook = SqoopHook()

    expected_flags = (
        ('avro', "--as-avrodatafile"),
        ('parquet', "--as-parquetfile"),
        ('sequence', "--as-sequencefile"),
        ('text', "--as-textfile"),
    )
    for file_type, flag in expected_flags:
        self.assertIn(flag, hook._get_export_format_argument(file_type))

    # Anything outside the supported set must raise.
    with self.assertRaises(AirflowException):
        hook._get_export_format_argument('unknown')
def test_popen(self, mock_popen):
    """
    Check the exact argument list ``export_table`` passes to the mocked Popen.

    ``mock_popen`` is presumably injected by a patch decorator that is not
    visible in this block -- confirm against the surrounding file.
    """
    export = self._config_export
    extras = self._config_json

    # Given: a fake process that reports success.
    process = mock_popen.return_value
    process.stdout = StringIO('stdout')
    process.stderr = StringIO('stderr')
    process.returncode = 0
    process.communicate.return_value = [
        StringIO('stdout\nstdout'),
        StringIO('stderr\nstderr'),
    ]

    # When
    hook = SqoopHook(conn_id='sqoop_test')
    hook.export_table(**export)

    # Then: the first recorded call carries the full command, in this order.
    expected_cmd = ['sqoop', 'export']
    # Options taken from the connection extras.
    expected_cmd += [
        '-fs', extras['namenode'],
        '-jt', extras['job_tracker'],
        '-libjars', extras['libjars'],
        '-files', extras['files'],
        '-archives', extras['archives'],
        '--connect', 'rmdbs:5050/schema',
    ]
    # Options taken from the export configuration.
    expected_cmd += [
        '--input-null-string', export['input_null_string'],
        '--input-null-non-string', export['input_null_non_string'],
        '--staging-table', export['staging_table'],
        '--clear-staging-table',
        '--enclosed-by', export['enclosed_by'],
        '--escaped-by', export['escaped_by'],
        '--input-fields-terminated-by', export['input_fields_terminated_by'],
        '--input-lines-terminated-by', export['input_lines_terminated_by'],
        '--input-optionally-enclosed-by', export['input_optionally_enclosed_by'],
        '--batch',
        '--relaxed-isolation',
        '--export-dir', export['export_dir'],
    ]
    # Extra export options, followed by the target table.
    expected_cmd += [
        '--update-key', 'id',
        '--update-mode', 'allowinsert',
        '--fetch-size', str(export['extra_export_options'].get('fetch-size')),
        '--table', export['table'],
    ]

    # -2 and -1 are the values of subprocess.STDOUT and subprocess.PIPE.
    self.assertEqual(
        mock_popen.mock_calls[0],
        call(expected_cmd, stderr=-2, stdout=-1),
    )
class SqoopOperator(BaseOperator):
    """
    Execute a Sqoop job.
    Documentation for Apache Sqoop can be found here:
    https://sqoop.apache.org/docs/1.4.2/SqoopUserGuide.html

    :param conn_id: str
    :param cmd_type: str specify command to execute "export" or "import"
    :param table: Table to read
    :param query: Import result of arbitrary SQL query. Instead of using the table,
        columns and where arguments, you can specify a SQL statement with the query
        argument. Must also specify a destination directory with target_dir.
    :param target_dir: HDFS destination directory where the data
        from the rdbms will be written
    :param append: Append data to an existing dataset in HDFS
    :param file_type: "avro", "sequence", "text" Imports data
        into the specified format. Defaults to text.
    :param columns: <col,col,col> Columns to import from table
    :param num_mappers: Use n mapper tasks to import/export in parallel
    :param split_by: Column of the table used to split work units
    :param where: WHERE clause to use during import
    :param export_dir: HDFS Hive database directory to export to the rdbms
    :param input_null_string: The string to be interpreted as null
        for string columns
    :param input_null_non_string: The string to be interpreted as null
        for non-string columns
    :param staging_table: The table in which data will be staged before
        being inserted into the destination table
    :param clear_staging_table: Indicate that any data present in the
        staging table can be deleted
    :param enclosed_by: Sets a required field enclosing character
    :param escaped_by: Sets the escape character
    :param input_fields_terminated_by: Sets the input field separator
    :param input_lines_terminated_by: Sets the input end-of-line character
    :param input_optionally_enclosed_by: Sets a field enclosing character
    :param batch: Use batch mode for underlying statement execution
    :param direct: Use direct export fast path
    :param driver: Manually specify JDBC driver class to use
    :param verbose: Switch to more verbose logging for debug purposes
    :param relaxed_isolation: use read uncommitted isolation level
    :param hcatalog_database: Specifies the database name for the HCatalog table
    :param hcatalog_table: The argument value for this option is the HCatalog table
    :param create_hcatalog_table: Have sqoop create the hcatalog table passed in or not
    :param properties: additional JVM properties passed to sqoop
    :param extra_import_options: Extra import options to pass as dict.
        If a key doesn't have a value, just pass an empty string to it.
        Don't include prefix of -- for sqoop options.
    :param extra_export_options: Extra export options to pass as dict.
        If a key doesn't have a value, just pass an empty string to it.
        Don't include prefix of -- for sqoop options.
    """
    template_fields = (
        'conn_id', 'cmd_type', 'table', 'query', 'target_dir',
        'file_type', 'columns', 'split_by',
        'where', 'export_dir', 'input_null_string',
        'input_null_non_string', 'staging_table',
        'enclosed_by', 'escaped_by',
        'input_fields_terminated_by', 'input_lines_terminated_by',
        'input_optionally_enclosed_by',
        'properties', 'extra_import_options', 'driver',
        'extra_export_options', 'hcatalog_database', 'hcatalog_table',
    )
    ui_color = '#7D8CA4'

    @apply_defaults
    def __init__(self,
                 conn_id='sqoop_default',
                 cmd_type='import',
                 table=None,
                 query=None,
                 target_dir=None,
                 append=None,
                 file_type='text',
                 columns=None,
                 num_mappers=None,
                 split_by=None,
                 where=None,
                 export_dir=None,
                 input_null_string=None,
                 input_null_non_string=None,
                 staging_table=None,
                 clear_staging_table=False,
                 enclosed_by=None,
                 escaped_by=None,
                 input_fields_terminated_by=None,
                 input_lines_terminated_by=None,
                 input_optionally_enclosed_by=None,
                 batch=False,
                 direct=False,
                 driver=None,
                 verbose=False,
                 relaxed_isolation=False,
                 properties=None,
                 hcatalog_database=None,
                 hcatalog_table=None,
                 create_hcatalog_table=False,
                 extra_import_options=None,
                 extra_export_options=None,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.conn_id = conn_id
        self.cmd_type = cmd_type
        self.table = table
        self.query = query
        self.target_dir = target_dir
        self.append = append
        self.file_type = file_type
        self.columns = columns
        self.num_mappers = num_mappers
        self.split_by = split_by
        self.where = where
        self.export_dir = export_dir
        self.input_null_string = input_null_string
        self.input_null_non_string = input_null_non_string
        self.staging_table = staging_table
        self.clear_staging_table = clear_staging_table
        self.enclosed_by = enclosed_by
        self.escaped_by = escaped_by
        self.input_fields_terminated_by = input_fields_terminated_by
        self.input_lines_terminated_by = input_lines_terminated_by
        self.input_optionally_enclosed_by = input_optionally_enclosed_by
        self.batch = batch
        self.direct = direct
        self.driver = driver
        self.verbose = verbose
        self.relaxed_isolation = relaxed_isolation
        self.hcatalog_database = hcatalog_database
        self.hcatalog_table = hcatalog_table
        self.create_hcatalog_table = create_hcatalog_table
        self.properties = properties
        self.extra_import_options = extra_import_options or {}
        self.extra_export_options = extra_export_options or {}
        # The hook is only created in execute(); start at None so on_kill()
        # can tell that no Sqoop job has been launched yet.
        self.hook = None

    def execute(self, context):
        """
        Execute sqoop job

        :param context: task context supplied by the caller; not read here
        :raises AirflowException: if cmd_type is neither 'import' nor 'export',
            if both table and query are given for an import, or if neither is
        """
        self.hook = SqoopHook(
            conn_id=self.conn_id,
            verbose=self.verbose,
            num_mappers=self.num_mappers,
            hcatalog_database=self.hcatalog_database,
            hcatalog_table=self.hcatalog_table,
            properties=self.properties
        )

        if self.cmd_type == 'export':
            self.hook.export_table(
                table=self.table,
                export_dir=self.export_dir,
                input_null_string=self.input_null_string,
                input_null_non_string=self.input_null_non_string,
                staging_table=self.staging_table,
                clear_staging_table=self.clear_staging_table,
                enclosed_by=self.enclosed_by,
                escaped_by=self.escaped_by,
                input_fields_terminated_by=self.input_fields_terminated_by,
                input_lines_terminated_by=self.input_lines_terminated_by,
                input_optionally_enclosed_by=self.input_optionally_enclosed_by,
                batch=self.batch,
                relaxed_isolation=self.relaxed_isolation,
                extra_export_options=self.extra_export_options)
        elif self.cmd_type == 'import':
            # Work on a copy: self.extra_import_options may be the very dict
            # the caller passed in (possibly shared between several tasks), so
            # it must not be modified here.
            import_options = dict(self.extra_import_options)

            # add create hcatalog table to extra import options if option passed
            # if new params are added to constructor can pass them in here
            # so don't modify sqoop_hook for each param
            if self.create_hcatalog_table:
                import_options['create-hcatalog-table'] = ''

            if self.table and self.query:
                raise AirflowException(
                    'Cannot specify query and table together. Need to specify either or.'
                )

            if self.table:
                self.hook.import_table(
                    table=self.table,
                    target_dir=self.target_dir,
                    append=self.append,
                    file_type=self.file_type,
                    columns=self.columns,
                    split_by=self.split_by,
                    where=self.where,
                    direct=self.direct,
                    driver=self.driver,
                    extra_import_options=import_options)
            elif self.query:
                self.hook.import_query(
                    query=self.query,
                    target_dir=self.target_dir,
                    append=self.append,
                    file_type=self.file_type,
                    split_by=self.split_by,
                    direct=self.direct,
                    driver=self.driver,
                    extra_import_options=import_options)
            else:
                raise AirflowException(
                    "Provide query or table parameter to import using Sqoop")
        else:
            raise AirflowException("cmd_type should be 'import' or 'export'")

    def on_kill(self):
        """
        Send SIGTERM to the process group of the Sqoop subprocess, if any.

        Does nothing (apart from logging) when execute() has not created the
        hook yet or the hook has no ``sp`` subprocess attribute set.
        """
        # self.hook is None until execute() runs, and the hook may not have
        # an ``sp`` attribute before it has spawned a process.
        process = getattr(self.hook, 'sp', None)
        if process is None:
            self.log.info('No running Sqoop process found, nothing to terminate')
            return

        self.log.info('Sending SIGTERM signal to bash process group')
        os.killpg(os.getpgid(process.pid), signal.SIGTERM)
def execute(self, context):
    """
    Execute sqoop job

    Creates a ``SqoopHook`` from this object's settings, stores it on
    ``self.hook`` and dispatches to ``export_table``, ``import_table`` or
    ``import_query`` depending on ``cmd_type`` and on whether ``table`` or
    ``query`` is set.

    :param context: task context supplied by the caller; not read here
    :raises AirflowException: if cmd_type is neither 'import' nor 'export',
        if both table and query are given for an import, or if neither is
    """
    self.hook = SqoopHook(
        conn_id=self.conn_id,
        verbose=self.verbose,
        num_mappers=self.num_mappers,
        hcatalog_database=self.hcatalog_database,
        hcatalog_table=self.hcatalog_table,
        properties=self.properties
    )

    if self.cmd_type == 'export':
        self.hook.export_table(
            table=self.table,
            export_dir=self.export_dir,
            input_null_string=self.input_null_string,
            input_null_non_string=self.input_null_non_string,
            staging_table=self.staging_table,
            clear_staging_table=self.clear_staging_table,
            enclosed_by=self.enclosed_by,
            escaped_by=self.escaped_by,
            input_fields_terminated_by=self.input_fields_terminated_by,
            input_lines_terminated_by=self.input_lines_terminated_by,
            input_optionally_enclosed_by=self.input_optionally_enclosed_by,
            batch=self.batch,
            relaxed_isolation=self.relaxed_isolation,
            extra_export_options=self.extra_export_options)
    elif self.cmd_type == 'import':
        # Work on a copy: self.extra_import_options may be a dict owned by the
        # caller (possibly shared between several tasks), so adding a key to
        # it in place would leak the option outside this run.
        import_options = dict(self.extra_import_options)

        # add create hcatalog table to extra import options if option passed
        # if new params are added to constructor can pass them in here
        # so don't modify sqoop_hook for each param
        if self.create_hcatalog_table:
            import_options['create-hcatalog-table'] = ''

        if self.table and self.query:
            raise AirflowException(
                'Cannot specify query and table together. Need to specify either or.'
            )

        if self.table:
            self.hook.import_table(
                table=self.table,
                target_dir=self.target_dir,
                append=self.append,
                file_type=self.file_type,
                columns=self.columns,
                split_by=self.split_by,
                where=self.where,
                direct=self.direct,
                driver=self.driver,
                extra_import_options=import_options)
        elif self.query:
            self.hook.import_query(
                query=self.query,
                target_dir=self.target_dir,
                append=self.append,
                file_type=self.file_type,
                split_by=self.split_by,
                direct=self.direct,
                driver=self.driver,
                extra_import_options=import_options)
        else:
            raise AirflowException(
                "Provide query or table parameter to import using Sqoop")
    else:
        raise AirflowException("cmd_type should be 'import' or 'export'")