def test_s3_iter_bucket_moto(self):
    """Check that s3_iter_bucket returns the keys and contents stored in a bucket.

    Covers unfiltered iteration, filtering through ``accept_key``, capping
    through ``key_limit`` and several values of ``workers``.
    """
    connection = boto.connect_s3()
    connection.create_bucket("mybucket")
    mybucket = connection.get_bucket("mybucket")

    # Populate the bucket with 200 keys of ten lines each and remember
    # exactly what was written under every key name.
    expected = {}
    for key_no in range(200):
        key_name = "mykey%s" % key_no
        lines = ["line%i%i" % (key_no, line_no) for line_no in range(10)]
        content = '\n'.join(lines).encode('utf8')
        with smart_open.smart_open("s3://mybucket/%s" % key_name, 'wb') as fout:
            fout.write(content)
        expected[key_name] = content

    # Without any filter, every key must come back with its content.
    everything = dict(smart_open.s3_iter_bucket(mybucket))
    self.assertEqual(expected, everything)

    # accept_key restricts the result to the names the predicate accepts.
    def ends_with_4(fname):
        return fname.endswith('4')

    filtered = dict(smart_open.s3_iter_bucket(mybucket, accept_key=ends_with_4))
    wanted = dict(item for item in expected.items() if ends_with_4(item[0]))
    self.assertEqual(filtered, wanted)

    # key_limit caps how many keys are returned.
    limited = dict(smart_open.s3_iter_bucket(mybucket, key_limit=10))
    self.assertEqual(len(limited), min(len(expected), 10))

    # The full result must be the same for every tested value of `workers`.
    for workers in (1, 4, 8, 16, 64):
        found = dict(smart_open.s3_iter_bucket(mybucket, workers=workers))
        self.assertEqual(found, expected)
def test_s3_iter_bucket_mock(self, mock_pool):
    """Check that s3_iter_bucket creates its pool with processes=16 and uses imap_unordered."""
    # Fake key: only the attributes named in `spec` exist on it.
    # `name` has to be set through configure_mock, because a `name` keyword
    # given to the Mock constructor names the mock itself instead.
    mykey = mock.Mock(spec=["name", "get_contents_as_string"])
    mykey.configure_mock(**{
        "name": "fileA",
        "get_contents_as_string.return_value": "contentA",
    })

    # Fake bucket whose list() returns just that one key.
    mybucket = mock.Mock(spec=["list"])
    mybucket.configure_mock(**{"list.return_value": [mykey]})

    # NOTE(review): mock_pool is presumably injected by a patch decorator that
    # is not visible here. If it is a MagicMock, iterating the mocked
    # imap_unordered result may yield nothing, in which case the assertions
    # inside this loop never execute -- confirm.
    for _key, _content in smart_open.s3_iter_bucket(mybucket):
        mock_pool.Pool.assert_called_with(processes=16)
        mock_pool.Pool().imap_unordered.assert_called_with()

    # Check the constructor arguments first: calling Pool() below records a
    # new call on the mock and would change what assert_called_with sees.
    pool_factory = mock_pool.Pool
    pool_factory.assert_called_with(processes=16)
    imap_was_called = pool_factory().imap_unordered.called
    self.assertTrue(imap_was_called)
def test_s3_iter_bucket_with_SSLError_moto(self):
    """Check how s3_iter_bucket reacts when fetching a key's content raises.

    The expectations encoded below: up to three consecutive SSLErrors are
    survived with the default settings, a fourth one propagates unless
    ``retries=4`` is passed, and any other exception propagates at once.
    """
    payload = b"contentA"

    # Fake key and a fake bucket whose list() returns only that key.
    mykey = mock.Mock(spec=["name", "get_contents_as_string"])
    mykey.configure_mock(**{
        "name": "fileA",
        "get_contents_as_string.return_value": payload,
    })
    mybucket = mock.Mock(spec=["list"])
    mybucket.configure_mock(**{"list.return_value": [mykey]})

    fetch = mykey.get_contents_as_string

    def first_item(**kwargs):
        # Pull only the first (key, content) pair out of the iterator.
        return next(smart_open.s3_iter_bucket(mybucket, **kwargs))

    def assert_recovers(**kwargs):
        # The pair that comes back must be the fake key and its payload.
        key, content = first_item(**kwargs)
        self.assertEqual(key, mykey)
        self.assertEqual(content, payload)

    # Every call fails with SSLError: the error must reach the caller.
    fetch.side_effect = SSLError
    self.assertRaises(SSLError, first_item)

    # A single SSLError followed by success is recovered from.
    fetch.side_effect = [SSLError] + [payload]
    assert_recovers()

    # Three SSLErrors in a row are still recovered from ...
    fetch.side_effect = [SSLError] * 3 + [payload]
    assert_recovers()

    # ... but a fourth one is not ...
    fetch.side_effect = [SSLError] * 4 + [payload]
    self.assertRaises(SSLError, first_item)

    # ... unless more retries are requested explicitly.
    fetch.side_effect = [SSLError] * 4 + [payload]
    assert_recovers(retries=4)

    # Any other exception fails immediately, without a retry.
    fetch.side_effect = [Exception, payload]
    self.assertRaises(Exception, first_item)
from boto.s3.connection import S3Connection
import glob
import sys
from smart_open import s3_iter_bucket

# The first two command-line arguments are handed to S3Connection in order.
# NOTE(review): presumably the AWS access key id and the secret access key --
# confirm against boto's S3Connection signature.
first_credential = sys.argv[1]
second_credential = sys.argv[2]
conn = S3Connection(first_credential, second_credential)
bucket = conn.get_bucket('s3-acpcontent')


def is_json_key(key_name):
    """Return True when the key name ends with '.json'."""
    return key_name.endswith('.json')


# Print every accepted key together with the length of its content.
for key, content in s3_iter_bucket(bucket, accept_key=is_json_key):
    print(key, len(content))