def test_s3_iter_bucket_moto(self):
    """Does s3_iter_bucket work correctly?"""
    conn = boto.connect_s3()
    conn.create_bucket("mybucket")
    mybucket = conn.get_bucket("mybucket")

    # First, create some keys in the bucket and remember their payloads.
    expected = {}
    for key_no in range(200):
        key_name = "mykey%s" % key_no
        with smart_open.smart_open("s3://mybucket/%s" % key_name, 'wb') as fout:
            # BUG FIX: the stream is opened in binary mode ('wb'), so the
            # payload must be bytes -- encode the joined text before writing.
            content = '\n'.join(
                "line%i%i" % (key_no, line_no) for line_no in range(10)
            ).encode('utf8')
            fout.write(content)
            expected[key_name] = content

    # Read all keys + their content back, in parallel, using s3_iter_bucket.
    result = dict(smart_open.s3_iter_bucket(mybucket))
    self.assertEqual(expected, result)

    # Read a filtered subset of the keys back, in parallel.
    result = dict(smart_open.s3_iter_bucket(mybucket, accept_key=lambda fname: fname.endswith('4')))
    self.assertEqual(result, dict((k, c) for k, c in expected.items() if k.endswith('4')))

    # key_limit caps how many keys are returned.
    result = dict(smart_open.s3_iter_bucket(mybucket, key_limit=10))
    self.assertEqual(len(result), min(len(expected), 10))

    # The result must be identical regardless of worker-pool size.
    for workers in [1, 4, 8, 16, 64]:
        self.assertEqual(dict(smart_open.s3_iter_bucket(mybucket, workers=workers)), expected)
def test_s3_iter_bucket_moto(self):
    """Does s3_iter_bucket work correctly?"""
    conn = boto.connect_s3()
    conn.create_bucket("mybucket")
    mybucket = conn.get_bucket("mybucket")

    # Populate the bucket with 200 small keys; keep the byte payloads so we
    # can compare against what s3_iter_bucket hands back.
    expected = {}
    for key_no in range(200):
        key_name = "mykey%s" % key_no
        lines = ("line%i%i" % (key_no, line_no) for line_no in range(10))
        content = '\n'.join(lines).encode('utf8')
        with smart_open.smart_open("s3://mybucket/%s" % key_name, 'wb') as fout:
            fout.write(content)
        expected[key_name] = content

    # Every key and its content, read back in parallel.
    self.assertEqual(expected, dict(smart_open.s3_iter_bucket(mybucket)))

    # A filtered subset of the keys, selected via accept_key.
    subset = dict(smart_open.s3_iter_bucket(mybucket, accept_key=lambda fname: fname.endswith('4')))
    self.assertEqual(subset, {k: c for k, c in expected.items() if k.endswith('4')})

    # key_limit caps how many keys come back.
    limited = dict(smart_open.s3_iter_bucket(mybucket, key_limit=10))
    self.assertEqual(len(limited), min(len(expected), 10))

    # Results are identical no matter how large the worker pool is.
    for workers in [1, 4, 8, 16, 64]:
        self.assertEqual(dict(smart_open.s3_iter_bucket(mybucket, workers=workers)), expected)
def test_deprecated_top_level_s3_iter_bucket(self):
    """The deprecated top-level alias warns once, and only once."""
    populate_bucket()
    with self.assertLogs(smart_open.logger.name, level='WARN') as log_ctx:
        # The first invocation emits a deprecation warning...
        smart_open.s3_iter_bucket(BUCKET_NAME)
        # ...while the second stays silent to reduce log spam.
        smart_open.s3_iter_bucket(BUCKET_NAME)
    # Exactly one warning was produced.
    assert len(log_ctx.output) == 1
    # It must point the user at the replacement import.
    assert "from smart_open.s3 import iter_bucket as s3_iter_bucket" in log_ctx.output[0]
def load_dir_jls(conn, path):
    """Yield JSON records from every *.jl (JSON-lines) object under *path*."""
    bucket_name, key = get_bucket_key(path)
    bucket = conn.get_bucket(bucket_name)
    # Stream matching objects in parallel, with a tqdm progress bar.
    objects = s3_iter_bucket(bucket, key, accept_key=lambda name: name.endswith('.jl'))
    for _name, content in tqdm(objects):
        for line in content.decode('utf8').split('\n'):
            if not line:
                continue  # skip blank lines (e.g. trailing newline)
            yield json.loads(line)
def getRecordFromBucket(BUCKET, PREFIX):
    """Collect all records from 'part' objects under PREFIX in BUCKET.

    Each matching S3 object is decoded as UTF-8, stripped, and split into
    lines; the lines from all objects are concatenated into one flat list.

    :param BUCKET: boto S3 bucket to iterate over.
    :param PREFIX: key prefix limiting which objects are read.
    :return: list of record strings, one per line.
    """
    records = []
    for key, file in s3_iter_bucket(BUCKET, prefix=PREFIX,
                                    accept_key=lambda key: 'part' in key):
        # str.split always returns a list, so the old
        # `isinstance(temp, str)` branch was dead code -- extend directly.
        records.extend(file.decode('utf-8').strip().split('\n'))
    return records
def test_s3_iter_bucket_mock(self, mock_pool):
    """Is s3_iter_bucket called correctly?"""
    # A single fake key whose body is the string "contentA".
    mykey = mock.Mock(spec=["name", "get_contents_as_string"])
    mykey.configure_mock(**{
        "name": "fileA",
        "get_contents_as_string.return_value": "contentA",
    })

    # A fake bucket that lists exactly that one key.
    mybucket = mock.Mock(spec=["list"])
    mybucket.configure_mock(**{"list.return_value": [mykey]})

    # While iterating, the worker pool must have been created with 16
    # processes and imap_unordered driven without extra arguments.
    for key, content in smart_open.s3_iter_bucket(mybucket):
        mock_pool.Pool.assert_called_with(processes=16)
        mock_pool.Pool().imap_unordered.assert_called_with()

    mock_pool.Pool.assert_called_with(processes=16)
    self.assertTrue(mock_pool.Pool().imap_unordered.called)
def test_s3_iter_bucket_with_SSLError_moto(self):
    """SSLError is retried up to the retry budget; other errors are not."""
    mykey = mock.Mock(spec=["name", "get_contents_as_string"])
    mykey.configure_mock(**{
        "name": "fileA",
        "get_contents_as_string.return_value": b"contentA",
    })

    mybucket = mock.Mock(spec=["list"])
    mybucket.configure_mock(**{"list.return_value": [mykey]})

    # A persistent SSLError means iteration can never produce an item.
    mykey.get_contents_as_string.side_effect = SSLError
    self.assertRaises(SSLError, lambda b: next(smart_open.s3_iter_bucket(b)), mybucket)

    # One transient SSLError is retried away.
    mykey.get_contents_as_string.side_effect = [SSLError, b"contentA"]
    key, content = next(smart_open.s3_iter_bucket(mybucket))
    self.assertEqual(key, mykey)
    self.assertEqual(content, b"contentA")

    # Three consecutive failures still fit in the default retry budget.
    mykey.get_contents_as_string.side_effect = [SSLError] * 3 + [b"contentA"]
    key, content = next(smart_open.s3_iter_bucket(mybucket))
    self.assertEqual(key, mykey)
    self.assertEqual(content, b"contentA")

    # Four failures exhaust the default budget ...
    mykey.get_contents_as_string.side_effect = [SSLError] * 4 + [b"contentA"]
    self.assertRaises(SSLError, lambda b: next(smart_open.s3_iter_bucket(b)), mybucket)

    # ... unless the caller asks for more retries explicitly.
    mykey.get_contents_as_string.side_effect = [SSLError] * 4 + [b"contentA"]
    key, content = next(smart_open.s3_iter_bucket(mybucket, retries=4))
    self.assertEqual(key, mykey)
    self.assertEqual(content, b"contentA")

    # Any other exception type fails immediately, with no retry.
    mykey.get_contents_as_string.side_effect = [Exception, b"contentA"]
    self.assertRaises(Exception, lambda b: next(smart_open.s3_iter_bucket(b)), mybucket)
def s3_get_articles(issue, bucket, workers=None):
    """Read a newspaper issue from S3 and return the articles it contains.

    :param issue: the newspaper issue
    :type issue: an instance of `impresso_commons.path.IssueDir`
    :param bucket: the input s3 bucket
    :type bucket: `boto.s3.bucket.Bucket`
    :param workers: number of workers for the s3_iter_bucket function.
        If None, will be the number of detected CPUs.
    :return: a list of articles (dictionaries)

    NB: Content items with type = "ad" (advertisement) are filtered out.
    """
    nb_workers = _get_cores() if workers is None else workers

    # Fetch the keys under the issue's prefix; each entry is a
    # (key, content) pair.  (Removed a leftover debug print of the payload.)
    issue_data = list(
        s3_iter_bucket(bucket, prefix=issue.path, workers=nb_workers))

    # The issue JSON is the content of the first matching key.
    issue_data = issue_data[0][1]
    issue_json = json.loads(issue_data.decode('utf-8'))

    # Keep only proper articles; ads (and other content types) are dropped.
    articles = [
        item
        for item in issue_json["i"]
        if item["m"]["tp"] == "article"
    ]
    return articles
def test_s3_iter_bucket_with_SSLError_moto(self):
    """Retry behaviour: SSLError is retried, other exceptions are not."""
    mykey = mock.Mock(spec=["name", "get_contents_as_string"])
    mykey.name = "fileA"
    mykey.get_contents_as_string.return_value = b"contentA"

    mybucket = mock.Mock(spec=["list"])
    mybucket.list.return_value = [mykey]

    def first_item(bucket):
        # Pull a single (key, content) pair out of the iterator.
        return next(smart_open.s3_iter_bucket(bucket))

    # When get_contents_as_string always raises SSLError, iteration fails.
    mykey.get_contents_as_string.side_effect = SSLError
    self.assertRaises(SSLError, first_item, mybucket)

    # A single SSLError is recovered from.
    mykey.get_contents_as_string.side_effect = [SSLError, b"contentA"]
    key, content = first_item(mybucket)
    self.assertEqual(key, mykey)
    self.assertEqual(content, b"contentA")

    # Up to three failures still recover under the default retry budget.
    mykey.get_contents_as_string.side_effect = [
        SSLError, SSLError, SSLError, b"contentA"]
    key, content = first_item(mybucket)
    self.assertEqual(key, mykey)
    self.assertEqual(content, b"contentA")

    # But not four ...
    mykey.get_contents_as_string.side_effect = [
        SSLError, SSLError, SSLError, SSLError, b"contentA"]
    self.assertRaises(SSLError, first_item, mybucket)

    # ... unless more retries are requested.
    mykey.get_contents_as_string.side_effect = [
        SSLError, SSLError, SSLError, SSLError, b"contentA"]
    key, content = next(smart_open.s3_iter_bucket(mybucket, retries=4))
    self.assertEqual(key, mykey)
    self.assertEqual(content, b"contentA")

    # Some other exception always fails, and never retries.
    mykey.get_contents_as_string.side_effect = [Exception, b"contentA"]
    self.assertRaises(Exception, first_item, mybucket)
from boto.s3.connection import S3Connection
import glob
import sys

from smart_open import s3_iter_bucket

# Credentials come from the command line: access key id, then secret key.
conn = S3Connection(sys.argv[1], sys.argv[2])
bucket = conn.get_bucket('s3-acpcontent')

# Print the name and content length of every .json object in the bucket.
for key, content in s3_iter_bucket(bucket, accept_key=lambda key: key.endswith('.json')):
    print(key, len(content))
def smart_walk(path, load=False, raise_exc=False, accept_path=const_true, **s3_args): """ walk a directory """ # TODO spaghetti code - disentangle! url = urlparse(to_str(path)) if isinstance(path, (bytes, str)) else path path = url.path if url.path.endswith( os.path.sep) else url.path + os.path.sep if url.scheme == "s3": try: import boto bucket = boto.connect_s3().get_bucket(url.hostname, validate=True) except Exception as exc: LOGGER.exception(exc) if raise_exc: raise exc return path = path[1:] if not load: try: for obj in bucket.list(prefix=path): if accept_path(obj.key): yield smart_url(scheme="s3", hostname=url.hostname, path=obj.key), None except Exception as exc: LOGGER.exception(exc) if raise_exc: raise exc return try: from smart_open import s3_iter_bucket except ImportError as exc: LOGGER.error("<smart_open> library must be importable") LOGGER.exception(exc) if raise_exc: raise exc return try: for key, content in s3_iter_bucket(bucket, prefix=path, accept_key=accept_path, **s3_args): yield smart_url(scheme="s3", hostname=bucket.name, path=key.key), content except Exception as exc: LOGGER.exception(exc) if raise_exc: raise exc return path = os.path.abspath(path) for sub_dir, _, file_paths in os.walk(path): for file_path in file_paths: file_path = os.path.join(sub_dir, file_path) if not accept_path(file_path): continue url = smart_url(scheme="file", path=file_path) if not load: yield url, None continue try: with open(file_path, "rb") as file_obj: yield url, file_obj.read() except Exception as exc: LOGGER.exception(exc) if raise_exc: raise exc