def test__get_top_row_as_list(self):
    """Verify _get_top_row_as_list splits the file's first row on the
    configured delimiter.

    Fix: assertion messages read "doesnt matched expected value";
    corrected to "doesn't match expected value".
    """
    # With a tab delimiter the comma stays inside the second column.
    self.kwargs['delimiter'] = '\t'
    fn_txt = self._get_fn('.txt', True)
    header_list = S3ToHiveTransfer(**self.kwargs).\
        _get_top_row_as_list(fn_txt)
    self.assertEqual(header_list, ['Sno', 'Some,Text'],
                     msg="Top row from file doesn't match expected value")
    # With a comma delimiter the tab stays inside the first column.
    self.kwargs['delimiter'] = ','
    header_list = S3ToHiveTransfer(**self.kwargs).\
        _get_top_row_as_list(fn_txt)
    self.assertEqual(header_list, ['Sno\tSome', 'Text'],
                     msg="Top row from file doesn't match expected value")
def test__match_headers(self):
    """Verify _match_headers is sensitive to column order and count.

    Fix: the two negative assertions reused the positive case's failure
    message, which was misleading when they failed; each assertion now
    carries a message describing what actually went wrong.
    """
    self.kwargs['field_dict'] = OrderedDict([('Sno', 'BIGINT'),
                                             ('Some,Text', 'STRING')])
    # Exact match of names and order succeeds.
    self.assertTrue(S3ToHiveTransfer(**self.kwargs)._match_headers(
                    ['Sno', 'Some,Text']),
                    msg="Header row doesn't match expected value")
    # Testing with different column order
    self.assertFalse(S3ToHiveTransfer(**self.kwargs)._match_headers(
                     ['Some,Text', 'Sno']),
                     msg="Header row with reordered columns should not match")
    # Testing with extra column in header
    self.assertFalse(S3ToHiveTransfer(**self.kwargs)._match_headers(
                     ['Sno', 'Some,Text', 'ExtraColumn']),
                     msg="Header row with an extra column should not match")
def test_execute(self, mock_hiveclihook):
    """Run S3ToHiveTransfer end-to-end against a moto-mocked S3 bucket
    for every (extension, header) combination, asserting that the local
    file handed to HiveCliHook.load_file equals the expected fixture."""
    s3_client = boto3.client('s3')
    s3_client.create_bucket(Bucket='bucket')
    # Testing txt, zip, bz2 files with and without header row
    for ext, has_header in product(['.txt', '.gz', '.bz2'], [True, False]):
        self.kwargs['headers'] = has_header
        self.kwargs['check_headers'] = has_header
        logging.info("Testing {0} format {1} header".format(
            ext,
            ('with' if has_header else 'without')))
        self.kwargs['input_compressed'] = ext != '.txt'
        self.kwargs['s3_key'] = 's3://bucket/' + self.s3_key + ext
        input_file = self._get_fn(ext, self.kwargs['headers'])
        expected_file = self._get_fn(ext, False)
        # Upload the file into the Mocked S3 bucket
        s3_client.upload_file(input_file, 'bucket', self.s3_key + ext)

        # HiveCliHook.load_file receives the produced local file as its
        # first positional argument; compare it against the expected
        # output fixture for this extension.
        def _verify_load_file(*args, **kwargs):
            self.assertTrue(
                self._check_file_equality(args[0], expected_file, ext),
                msg='{0} output file not as expected'.format(ext))

        mock_hiveclihook().load_file.side_effect = _verify_load_file

        # Execute S3ToHiveTransfer
        s32hive = S3ToHiveTransfer(**self.kwargs)
        s32hive.execute(None)
def test_execute(self, mock_s3hook, mock_hiveclihook):
    """Run S3ToHiveTransfer with a fully mocked S3Hook (no moto) and
    verify the file passed to HiveCliHook.load_file matches the
    expected fixture.

    Fixes: tuple-unpack the product() items instead of indexing,
    replace the redundant ``(False if ext == '.txt' else True)`` with
    ``ext != '.txt'``, and correct comment typos (paramter/oputput).
    """
    # Testing txt, zip, bz2 files with and without header row
    for ext, has_header in product(['.txt', '.gz', '.bz2'], [True, False]):
        self.kwargs['headers'] = has_header
        self.kwargs['check_headers'] = has_header
        logging.info("Testing {0} format {1} header".format(
            ext,
            ('with' if has_header else 'without')))
        self.kwargs['input_compressed'] = ext != '.txt'
        self.kwargs['s3_key'] = self.s3_key + ext
        ip_fn = self._get_fn(ext, self.kwargs['headers'])
        op_fn = self._get_fn(ext, False)

        # Mock s3 object returned by S3Hook; downloading the key copies
        # the local input fixture into the destination file.
        mock_s3_object = mock.Mock(key=self.kwargs['s3_key'])
        mock_s3_object.get_contents_to_file.side_effect = \
            lambda dest_file: \
            self._cp_file_contents(ip_fn, dest_file.name)
        mock_s3hook().get_key.return_value = mock_s3_object

        # file parameter to HiveCliHook.load_file is compared
        # against expected file output
        mock_hiveclihook().load_file.side_effect = \
            lambda *args, **kwargs: \
            self.assertTrue(
                self._check_file_equality(args[0], op_fn, ext),
                msg='{0} output file not as expected'.format(ext))
        # Execute S3ToHiveTransfer
        s32hive = S3ToHiveTransfer(**self.kwargs)
        s32hive.execute(None)
def test__delete_top_row_and_compress(self):
    """Check that _delete_top_row_and_compress strips the header row and
    produces gz/bz2 archives identical to the pre-built fixtures."""
    transfer = S3ToHiveTransfer(**self.kwargs)
    header_file = self._get_fn('.txt', True)

    # Testing gz file type
    gz_result = transfer._delete_top_row_and_compress(
        header_file, '.gz', self.tmp_dir)
    expected_gz = self._get_fn('.gz', False)
    self.assertTrue(
        self._check_file_equality(gz_result, expected_gz, '.gz'),
        msg="gz Compressed file not as expected")

    # Testing bz2 file type
    bz2_result = transfer._delete_top_row_and_compress(
        header_file, '.bz2', self.tmp_dir)
    expected_bz2 = self._get_fn('.bz2', False)
    self.assertTrue(
        self._check_file_equality(bz2_result, expected_bz2, '.bz2'),
        msg="bz2 Compressed file not as expected")
def test_execute_with_select_expression(self, mock_hiveclihook):
    """Verify S3ToHiveTransfer forwards the select expression and the
    matching input serialization to S3Hook.select_key for each
    (extension, header) combination."""
    conn = boto3.client('s3')
    conn.create_bucket(Bucket='bucket')
    select_expression = "SELECT * FROM S3Object s"
    bucket = 'bucket'
    # Only testing S3ToHiveTransfer calls S3Hook.select_key with
    # the right parameters and its execute method succeeds here,
    # since Moto doesn't support select_object_content as of 1.3.2.
    for ext, has_header in product(['.txt', '.gz', '.GZ'], [True, False]):
        compressed = ext.lower() != '.txt'
        key = self.s3_key + ext
        self.kwargs.update(
            check_headers=False,
            headers=has_header,
            input_compressed=compressed,
            select_expression=select_expression,
            s3_key='s3://{0}/{1}'.format(bucket, key),
        )
        # Upload the file into the Mocked S3 bucket
        conn.upload_file(self._get_fn(ext, has_header), bucket, key)

        # Expected serialization mirrors the operator's configuration.
        serialization = {'CSV': {'FieldDelimiter': self.delimiter}}
        if compressed:
            serialization['CompressionType'] = 'GZIP'
        if has_header:
            serialization['CSV']['FileHeaderInfo'] = 'USE'

        # Confirm that select_key was called with the right params
        with mock.patch(
                'airflow.providers.amazon.aws.hooks.s3.S3Hook.select_key',
                return_value="") as mock_select_key:
            # Execute S3ToHiveTransfer
            S3ToHiveTransfer(**self.kwargs).execute(None)

            mock_select_key.assert_called_once_with(
                bucket_name=bucket, key=key,
                expression=select_expression,
                input_serialization=serialization)