def coalesce_partition(bucket, partition, use_localstack: bool, manifests: bool):
    """Coalesce every batch in *partition*, returning one result per batch.

    A fresh client is built per call so this is safe to run in a worker
    process; results are returned in batch order.
    """
    s3 = S3(s3_client(use_localstack))
    results = []
    for batch in partition:
        results.append(coalesce_batch(s3, bucket, batch, manifests))
    return results
def test_get_full_prefix_when_not_set_with_end_slash(self):
    """A trailing-slash prefix comes back unchanged when the date is NOT_SET."""
    s3 = S3(s3_client(True))
    today = datetime.strptime("2020-09-01", date_format)
    result = s3.get_full_s3_prefix(f"{test_prefix}/", "NOT_SET", today)
    self.assertEqual(f"{test_prefix}/", result)
def test_batches_are_deleted(self):
    """delete_batch issues a single delete_objects call covering every key."""
    keys = [f'prefix/{i}' for i in range(5)]
    client = s3_client(True)
    client.delete_objects = MagicMock(return_value={})
    S3(client).delete_batch(self.bucket, [{'object_key': k} for k in keys])
    expected = call(
        Bucket=self.bucket,
        Delete={'Objects': [{'Key': k} for k in keys]})
    client.delete_objects.assert_has_calls([expected])
def test_get_full_prefix_when_set_to_yesterday(self):
    """With date_to_add='yesterday', the day before *today* is appended."""
    s3 = S3(s3_client(True))
    today = datetime.strptime("2020-09-01", date_format)
    result = s3.get_full_s3_prefix(test_prefix, "yesterday", today)
    self.assertEqual(f"{test_prefix}/2020/08/31", result)
def test_object_summaries(self):
    """object_summaries pages through list_objects_v2 until IsTruncated is False,
    yielding sub-batches whose concatenation is every page's Contents in order.
    """
    client = s3_client(True)
    s3 = S3(client)
    pages = [self.__summaries(x) for x in range(10)]
    # Expected result is all page contents flattened in order. The original
    # wrapped this in two no-op nested comprehensions and a quadratic
    # reduce(lambda acc, xs: acc + xs, ...); a single flattening
    # comprehension is equivalent and linear.
    expected = [item for page in pages for item in page['Contents']]
    # Mark the final page as the last one so pagination terminates.
    pages[-1]['IsTruncated'] = False
    client.list_objects_v2 = Mock(side_effect=pages)
    actual = []
    for sub_batch in s3.object_summaries(self.bucket, 'prefix', 5):
        actual += sub_batch
    self.assertEqual(expected, actual)
def test_batches_are_deleted_in_chunks(self):
    """A 3500-key batch is deleted via delete_objects calls of at most 1000 keys.

    Expected chunks: 0-999, 1000-1999, 2000-2999, 3000-3499.
    """
    total, chunk_size = 3500, 1000
    batch = [{'object_key': f'prefix/{i}'} for i in range(total)]
    expected_calls = []
    for start in range(0, total, chunk_size):
        chunk = [{'Key': f'prefix/{i}'}
                 for i in range(start, min(start + chunk_size, total))]
        expected_calls.append(
            call(Bucket=self.bucket, Delete={'Objects': chunk}))
    client = s3_client(True)
    client.delete_objects = MagicMock(return_value={})
    S3(client).delete_batch(self.bucket, batch)
    client.delete_objects.assert_has_calls(expected_calls)
def main():
    """CLI entry point: coalesce every tranche of object summaries under the
    configured bucket/prefix, print timing, and exit non-zero on any failure.
    """
    start = timer()
    args = command_line_args()
    s3 = S3(s3_client(args.localstack))
    print(
        f"Bucket: '{args.bucket}', prefix: '{args.prefix}', partition: {args.partition}, "
        f"threads: {args.threads}, multiprocessor: {args.multiprocessor}, manifests: {args.manifests}."
    )
    results = []
    for summaries in s3.object_summaries(args.bucket, args.prefix, args.summaries):
        results.append(coalesce_tranche(args, summaries))
    end = timer()
    print(f"Total time taken: {end - start:.2f} seconds.")
    # Exit 0 only if every tranche reported success.
    exit(0 if all(results) else 1)
def __client(self):
    """Return a stubbed S3 client serving 1000 canned object bodies in
    sequence from get_object, with upload_fileobj recorded as a Mock.
    """
    bodies = [self.__s3_object_with_body(i) for i in range(1000)]
    client = s3_client(True)
    client.get_object = Mock(side_effect=bodies)
    client.upload_fileobj = Mock()
    return client
def coalesce_batch_parallel(bucket, batch, manifests, use_localstack: bool) -> bool:
    """Worker entry point: build a fresh S3 wrapper and coalesce one batch.

    Constructing the client inside the call keeps this safe for use from a
    process pool, where clients cannot be shared across workers.
    """
    client = s3_client(use_localstack)
    return coalesce_batch(S3(client), bucket, batch, manifests)