Esempio n. 1
0
    def test_s3_iter_bucket_moto(self):
        """Check that s3_iter_bucket returns the keys and contents stored in a bucket.

        Exercises the default call, the ``accept_key`` filter, the ``key_limit``
        cap, and several ``workers`` pool sizes against the same bucket.
        """
        connection = boto.connect_s3()
        connection.create_bucket("mybucket")
        mybucket = connection.get_bucket("mybucket")

        # populate the bucket with 200 keys of ten lines each, remembering what was written
        expected = {}
        for key_no in range(200):
            key_name = "mykey%s" % key_no
            lines = ["line%i%i" % (key_no, line_no) for line_no in range(10)]
            content = '\n'.join(lines).encode('utf8')
            with smart_open.smart_open("s3://mybucket/%s" % key_name, 'wb') as fout:
                fout.write(content)
            expected[key_name] = content

        # no filter: every key must come back with the content that was written
        everything = dict(smart_open.s3_iter_bucket(mybucket))
        self.assertEqual(expected, everything)

        # accept_key: only the keys whose name ends in '4' must come back
        filtered = dict(smart_open.s3_iter_bucket(mybucket, accept_key=lambda fname: fname.endswith('4')))
        wanted = {name: data for name, data in expected.items() if name.endswith('4')}
        self.assertEqual(filtered, wanted)

        # key_limit: at most ten keys must come back
        limited = dict(smart_open.s3_iter_bucket(mybucket, key_limit=10))
        self.assertEqual(len(limited), min(len(expected), 10))

        # the result must not depend on the number of workers
        for workers in (1, 4, 8, 16, 64):
            self.assertEqual(dict(smart_open.s3_iter_bucket(mybucket, workers=workers)), expected)
Esempio n. 2
0
    def test_s3_iter_bucket_mock(self, mock_pool):
        """Check that s3_iter_bucket creates and uses the (mocked) worker pool.

        ``mock_pool`` is presumably injected by a ``mock.patch`` decorator that
        is outside this view -- confirm against the enclosing class.
        """
        # fake key: only the attributes named in the spec exist on it
        mykey = mock.Mock(spec=["name", "get_contents_as_string"])
        mykey.name = "fileA"
        mykey.get_contents_as_string.return_value = "contentA"

        # fake bucket whose listing contains just that one key
        mybucket = mock.Mock(spec=["list"])
        mybucket.list.return_value = [mykey]

        # NOTE(review): if mock_pool is a MagicMock the mocked imap_unordered
        # yields nothing and this loop body never runs -- confirm that is intended.
        for _key, _content in smart_open.s3_iter_bucket(mybucket):
            mock_pool.Pool.assert_called_with(processes=16)
            mock_pool.Pool().imap_unordered.assert_called_with()

        # keep this order: calling mock_pool.Pool() below records a new call on Pool
        mock_pool.Pool.assert_called_with(processes=16)
        self.assertTrue(mock_pool.Pool().imap_unordered.called)
Esempio n. 3
0
    def test_s3_iter_bucket_with_SSLError_moto(self):
        """Check how s3_iter_bucket reacts when fetching a key's content raises.

        The expectations encoded below: up to three consecutive SSLErrors are
        retried by default, a fourth one propagates unless ``retries`` is
        raised, and any other exception propagates.
        """
        # fake key and a fake bucket whose listing contains just that key
        mykey = mock.Mock(spec=["name", "get_contents_as_string"])
        mykey.name = "fileA"
        mykey.get_contents_as_string.return_value = b"contentA"
        mybucket = mock.Mock(spec=["list"])
        mybucket.list.return_value = [mykey]

        download = mykey.get_contents_as_string

        # an SSLError on every attempt reaches the caller
        download.side_effect = SSLError
        with self.assertRaises(SSLError):
            next(smart_open.s3_iter_bucket(mybucket))

        # one failure, and then three failures, are both recovered from
        for failures in (1, 3):
            download.side_effect = [SSLError] * failures + [b"contentA"]
            key, content = next(smart_open.s3_iter_bucket(mybucket))
            self.assertEqual(key, mykey)
            self.assertEqual(content, b"contentA")

        # four failures in a row are too many with the default settings ...
        download.side_effect = [SSLError] * 4 + [b"contentA"]
        with self.assertRaises(SSLError):
            next(smart_open.s3_iter_bucket(mybucket))

        # ... but are recovered from when more retries are requested
        download.side_effect = [SSLError] * 4 + [b"contentA"]
        key, content = next(smart_open.s3_iter_bucket(mybucket, retries=4))
        self.assertEqual(key, mykey)
        self.assertEqual(content, b"contentA")

        # any other exception is expected to propagate instead of being retried
        download.side_effect = [Exception, b"contentA"]
        with self.assertRaises(Exception):
            next(smart_open.s3_iter_bucket(mybucket))
from boto.s3.connection import S3Connection
import glob
import sys
from smart_open import s3_iter_bucket
#print(sys.argv)

# Usage: <script> ARG1 ARG2
# The two command-line arguments are handed to S3Connection in that order
# (presumably the AWS access key id and secret access key -- confirm against boto).
# Prints each '.json' key of the 's3-acpcontent' bucket with the length of its content.
first_credential, second_credential = sys.argv[1], sys.argv[2]
conn = S3Connection(first_credential, second_credential)
bucket = conn.get_bucket('s3-acpcontent')


def _accept_json(key_name):
    """Return True for names that end in '.json'."""
    return key_name.endswith('.json')


for key, content in s3_iter_bucket(bucket, accept_key=_accept_json):
    print(key, len(content))