class CatTestCase(MockGoogleTestCase):
    """Exercise ``GCSFilesystem._cat_file()`` against the mock GCS backend."""

    def setUp(self):
        super(CatTestCase, self).setUp()
        self.fs = GCSFilesystem()

    def test_cat_uncompressed(self):
        uri = 'gs://walrus/data/foo'
        payload = b'foo\nfoo\n'
        self.put_gcs_multi({uri: payload})

        chunks = self.fs._cat_file(uri)
        self.assertEqual(b''.join(chunks), payload)

    def test_cat_bz2(self):
        uri = 'gs://walrus/data/foo.bz2'
        payload = b'foo\n' * 1000
        self.put_gcs_multi({uri: bz2.compress(payload)})

        # the test expects the decompressed bytes back
        chunks = self.fs._cat_file(uri)
        self.assertEqual(b''.join(chunks), payload)

    def test_cat_gz(self):
        uri = 'gs://walrus/data/foo.gz'
        payload = b'foo\n' * 10000
        self.put_gcs_multi({uri: gzip_compress(payload)})

        # the test expects the decompressed bytes back
        chunks = self.fs._cat_file(uri)
        self.assertEqual(b''.join(chunks), payload)

    def test_chunks_file(self):
        uri = 'gs://walrus/data/foo'
        self.put_gcs_multi({uri: b'foo\nfoo\n' * 1000})

        # more than one chunk should come back
        num_chunks = len(list(self.fs._cat_file(uri)))
        self.assertGreater(num_chunks, 1)
def test_override_location_set_at_init(self):
    # create_bucket() is given a location that differs from the one the
    # filesystem was constructed with; the bucket should use the former
    gcs = GCSFilesystem(location='us-central1')
    gcs.create_bucket('walrus', location='us-east1')

    created = gcs.get_bucket('walrus')
    self.assertEqual(created.location, 'US-EAST1')
def test_blank_out_location_set_at_init(self):
    # passing an empty location to create_bucket() is expected to yield
    # a bucket located in 'US', despite the location set at init
    gcs = GCSFilesystem(location='us-central1')
    gcs.create_bucket('walrus', location='')

    created = gcs.get_bucket('walrus')
    self.assertEqual(created.location, 'US')
def test_location(self):
    # location passed directly to create_bucket() shows up (upper-cased)
    # on the bucket
    gcs = GCSFilesystem()
    gcs.create_bucket('walrus', location='us-central1')

    created = gcs.get_bucket('walrus')
    self.assertEqual(created.location, 'US-CENTRAL1')
def test_blank_out_object_ttl_days_set_at_init(self):
    # object_ttl_days=0 at create time is expected to leave the bucket
    # without lifecycle rules, despite the TTL set at init
    gcs = GCSFilesystem(object_ttl_days=234)
    gcs.create_bucket('walrus', object_ttl_days=0)

    rules = list(gcs.get_bucket('walrus').lifecycle_rules)
    self.assertEqual(rules, [])
def test_location_set_at_init(self):
    # with no location passed to create_bucket(), the one given at init
    # is expected on the bucket
    gcs = GCSFilesystem(location='us-central1')
    gcs.create_bucket('walrus')

    created = gcs.get_bucket('walrus')
    self.assertEqual(created.location, 'US-CENTRAL1')
def test_default(self):
    # no location or TTL anywhere: expect a 'US' bucket with no
    # lifecycle rules
    gcs = GCSFilesystem()
    gcs.create_bucket('walrus')

    created = gcs.get_bucket('walrus')
    self.assertEqual(created.location, 'US')

    rules = list(created.lifecycle_rules)
    self.assertEqual(rules, [])
def test_override_object_ttl_days_set_at_init(self):
    # the TTL passed to create_bucket() (123) is expected in the
    # lifecycle rule, not the one given at init (234)
    gcs = GCSFilesystem(object_ttl_days=234)
    gcs.create_bucket('walrus', object_ttl_days=123)

    expected_rules = [
        {'action': {'type': 'Delete'}, 'condition': {'age': 123}},
    ]
    created = gcs.get_bucket('walrus')
    self.assertEqual(list(created.lifecycle_rules), expected_rules)
def test_put_with_part_size(self):
    # part_size given at init should be handed to _blob() as chunk_size
    src_path = self.makefile('foo', contents=b'bar')
    dest_uri = 'gs://bar-files/foo'
    self.storage_client().bucket('bar-files').create()

    part_size_fs = GCSFilesystem(part_size=12345)

    with patch.object(GCSFilesystem, '_blob') as mock_blob:
        part_size_fs.put(src_path, dest_uri)

    mock_blob.assert_called_once_with(dest_uri, chunk_size=12345)
def __init__(self, test_case):
    """Set up empty object/bucket caches and the sub-clients.

    :param test_case: the owning test case; must be a
                      ``MockGoogleAPITestCase``
    """
    assert isinstance(test_case, MockGoogleAPITestCase)

    self._test_case = test_case
    self._fs = GCSFilesystem()

    # in-memory stand-ins for GCS state
    self._cache_objects = {}
    self._cache_buckets = {}

    # the sub-clients receive this instance, so build them last
    self._client_objects = MockGCSClientObjects(self)
    self._client_buckets = MockGCSClientBuckets(self)
def test_mkdir_bucket(self):
    # mkdir() on a URI in a not-yet-created bucket is expected to create
    # the bucket using the location and TTL given at init
    gcs = GCSFilesystem(location='us-central1', object_ttl_days=123)
    gcs.mkdir('gs://walrus/data')

    created = gcs.get_bucket('walrus')
    self.assertEqual(created.location, 'US-CENTRAL1')

    expected_rules = [
        {'action': {'type': 'Delete'}, 'condition': {'age': 123}},
    ]
    self.assertEqual(list(created.lifecycle_rules), expected_rules)
class CatTestCase(MockGoogleTestCase):
    """Exercise ``GCSFilesystem._cat_file()``, including chunking."""

    def setUp(self):
        super(CatTestCase, self).setUp()
        self.fs = GCSFilesystem()

    def test_cat_uncompressed(self):
        uri = 'gs://walrus/data/foo'
        payload = b'foo\nfoo\n'
        self.put_gcs_multi({uri: payload})

        chunks = self.fs._cat_file(uri)
        self.assertEqual(b''.join(chunks), payload)

    def test_cat_bz2(self):
        uri = 'gs://walrus/data/foo.bz2'
        payload = b'foo\n' * 1000
        self.put_gcs_multi({uri: bz2.compress(payload)})

        chunks = self.fs._cat_file(uri)
        self.assertEqual(b''.join(chunks), payload)

    def test_cat_gz(self):
        uri = 'gs://walrus/data/foo.gz'
        payload = b'foo\n' * 10000
        self.put_gcs_multi({uri: gzip_compress(payload)})

        chunks = self.fs._cat_file(uri)
        self.assertEqual(b''.join(chunks), payload)

    def test_chunks_file(self):
        uri = 'gs://walrus/data/foo'
        self.put_gcs_multi({uri: b'foo\nfoo\n' * 10000})

        # more than one chunk should come back
        num_chunks = len(list(self.fs._cat_file(uri)))
        self.assertGreater(num_chunks, 1)

    def test_chunk_boundary(self):
        # trying to read from end of file raises an exception, which we catch
        uri = 'gs://walrus/data/foo'
        first_chunk = b'a' * _CAT_CHUNK_SIZE
        second_chunk = b'b' * _CAT_CHUNK_SIZE
        self.put_gcs_multi({uri: first_chunk + second_chunk})

        self.assertEqual(list(self.fs._cat_file(uri)),
                         [first_chunk, second_chunk])
def fs(self):
    """Return the composite filesystem, building it on first use.

    S3 and GCS are only added when their client libraries are installed;
    Hadoop and local filesystems are always added.
    """
    if self._fs:
        return self._fs

    # Spark supports basically every filesystem there is
    self._fs = CompositeFilesystem()

    if boto3_installed:
        s3_opt_names = (
            'aws_access_key_id',
            'aws_secret_access_key',
            'aws_session_token',
            's3_endpoint',
            's3_region',
        )
        s3_kwargs = {name: self._opts[name] for name in s3_opt_names}
        self._fs.add_fs('s3', S3Filesystem(**s3_kwargs),
                        disable_if=_is_permanent_boto3_error)

    if google_libs_installed:
        gcs_fs = GCSFilesystem(
            project_id=self._opts['project_id'],
            location=self._opts['gcs_region'],
            object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
        )
        self._fs.add_fs('gcs', gcs_fs,
                        disable_if=_is_permanent_google_error)

    # Hadoop FS is responsible for all URIs that fall through to it
    hadoop_fs = HadoopFilesystem(self._opts['hadoop_bin'])
    self._fs.add_fs('hadoop', hadoop_fs)

    self._fs.add_fs('local', LocalFilesystem())

    return self._fs
def fs(self):
    """Return the composite filesystem, building it on first use.

    S3 and GCS are only added when their client libraries are installed;
    Hadoop and local filesystems are always added.
    """
    if self._fs:
        return self._fs

    # Spark supports basically every filesystem there is
    self._fs = CompositeFilesystem()

    if boto3_installed:
        s3_opt_names = (
            'aws_access_key_id',
            'aws_secret_access_key',
            'aws_session_token',
            's3_endpoint',
            's3_region',
        )
        s3_kwargs = {name: self._opts[name] for name in s3_opt_names}
        self._fs.add_fs('s3', S3Filesystem(**s3_kwargs),
                        disable_if=_is_permanent_boto3_error)

    if google_libs_installed:
        gcs_fs = GCSFilesystem(project_id=self._opts['google_project_id'])
        self._fs.add_fs('gcs', gcs_fs,
                        disable_if=_is_permanent_google_error)

    hadoop_fs = HadoopFilesystem(self._opts['hadoop_bin'])
    self._fs.add_fs('hadoop', hadoop_fs)

    self._fs.add_fs('local', LocalFilesystem())

    return self._fs
def test_local_tmp_dir_is_deprecated_and_does_nothing(self):
    gcs = GCSFilesystem(local_tmp_dir=self.tmp_dir)

    # passing local_tmp_dir should log a warning...
    self.assertTrue(self.log.warning.called)

    # ...while the client is still built with default arguments...
    actual_client = gcs.client
    expected_client = self.Client(project=None, credentials=None)
    self.assertEqual(actual_client, expected_client)

    # ...and nothing is stored on the filesystem object
    self.assertFalse(hasattr(gcs, '_local_tmp_dir'))
def setUp(self):
    """Wire a ``GCSFilesystem`` to a fully mocked API client."""
    self.fs = GCSFilesystem()
    self.gcs_path = 'gs://walrus/data'

    # request object returned by objects().list(); tests set its
    # execute() side effect
    self.list_req_mock = mock.MagicMock()

    objects_api = mock.MagicMock()
    objects_api.list.return_value = self.list_req_mock
    objects_api.get_media.return_value = google_http.HttpRequest(
        None, None, self.gcs_path)

    fake_client = mock.MagicMock()
    fake_client.objects.return_value = objects_api
    self.fs._api_client = fake_client

    # not started here; tests enter it as a context manager
    self.next_chunk_patch = patch.object(
        google_http.MediaIoBaseDownload, 'next_chunk')
def test_set_credentials_and_project_id(self):
    # credentials and project_id given at init should be passed through
    # to the client, without any warning being logged
    project = 'alan-parsons'
    credentials = Mock()

    gcs = GCSFilesystem(credentials=credentials, project_id=project)

    self.assertFalse(self.log.warning.called)

    actual_client = gcs.client
    expected_client = self.Client(project=project, credentials=credentials)
    self.assertEqual(actual_client, expected_client)
def fs(self):
    """Filesystem object combining GCS and the local filesystem.

    Built on first access and cached in ``self._fs``; the GCS part is
    also kept in ``self._gcs_fs``.
    """
    if self._fs is None:
        self._gcs_fs = GCSFilesystem()
        self._fs = CompositeFilesystem(self._gcs_fs, LocalFilesystem())

    return self._fs
class CatTestCase(MockGoogleAPITestCase):
    """Exercise ``GCSFilesystem._cat_file()`` against the mock API client."""

    def setUp(self):
        super(CatTestCase, self).setUp()
        self.fs = GCSFilesystem()

    def test_cat_uncompressed(self):
        uri = 'gs://walrus/data/foo'
        payload = b'foo\nfoo\n'
        self.put_gcs_multi({uri: payload})

        chunks = self.fs._cat_file(uri)
        self.assertEqual(b''.join(chunks), payload)

    def test_cat_bz2(self):
        uri = 'gs://walrus/data/foo.bz2'
        payload = b'foo\n' * 1000
        self.put_gcs_multi({uri: bz2.compress(payload)})

        chunks = self.fs._cat_file(uri)
        self.assertEqual(b''.join(chunks), payload)

    def test_cat_gz(self):
        uri = 'gs://walrus/data/foo.gz'
        payload = b'foo\n' * 10000
        self.put_gcs_multi({uri: gzip_compress(payload)})

        chunks = self.fs._cat_file(uri)
        self.assertEqual(b''.join(chunks), payload)

    def test_chunks_file(self):
        uri = 'gs://walrus/data/foo'
        self.put_gcs_multi({uri: b'foo\nfoo\n' * 1000})

        # more than one chunk should come back
        num_chunks = len(list(self.fs._cat_file(uri)))
        self.assertGreater(num_chunks, 1)
def setUp(self):
    """Wire a ``GCSFilesystem`` to a fully mocked API client."""
    self.fs = GCSFilesystem()
    self.gcs_path = "gs://walrus/data"

    # request object returned by objects().list(); tests set its
    # execute() side effect
    self.list_req_mock = mock.MagicMock()

    objects_api = mock.MagicMock()
    objects_api.list.return_value = self.list_req_mock
    objects_api.get_media.return_value = google_http.HttpRequest(
        None, None, self.gcs_path)

    fake_client = mock.MagicMock()
    fake_client.objects.return_value = objects_api
    self.fs._api_client = fake_client

    # not started here; tests enter it as a context manager
    self.next_chunk_patch = patch.object(
        google_http.MediaIoBaseDownload, "next_chunk")
def fs(self):
    """Filesystem object combining GCS and the local filesystem.

    Built on first access and cached in ``self._fs``; the GCS part is
    also kept in ``self._gcs_fs``.
    """
    if self._fs is None:
        gcs_kwargs = dict(
            credentials=self._credentials,
            local_tmp_dir=self._get_local_tmp_dir(),
            project_id=self._project_id,
        )
        self._gcs_fs = GCSFilesystem(**gcs_kwargs)
        self._fs = CompositeFilesystem(self._gcs_fs, LocalFilesystem())

    return self._fs
def fs(self):
    """Filesystem object combining GCS and the local filesystem.

    Built on first access and cached in ``self._fs``.
    """
    if self._fs is not None:
        return self._fs

    self._fs = CompositeFilesystem()

    # prefer an explicit region; otherwise derive one from the zone
    location = self._opts['region']
    if not location:
        location = _zone_to_region(self._opts['zone'])

    gcs_fs = GCSFilesystem(
        credentials=self._credentials,
        project_id=self._project_id,
        part_size=self._upload_part_size(),
        location=location,
        object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
    )
    self._fs.add_fs('gcs', gcs_fs)
    self._fs.add_fs('local', LocalFilesystem())

    return self._fs
class GCSFSHTTPErrorTestCase(PatcherTestCase):
    """Check how ``GCSFilesystem`` reacts to HTTP errors from the API."""

    def setUp(self):
        self.fs = GCSFilesystem()
        self.gcs_path = 'gs://walrus/data'

        # request object returned by objects().list(); tests set its
        # execute() side effect
        self.list_req_mock = mock.MagicMock()

        objects_api = mock.MagicMock()
        objects_api.list.return_value = self.list_req_mock
        objects_api.get_media.return_value = google_http.HttpRequest(
            None, None, self.gcs_path)

        fake_client = mock.MagicMock()
        fake_client.objects.return_value = objects_api
        self.fs._api_client = fake_client

        # not started here; tests enter it as a context manager
        self.next_chunk_patch = patch.object(
            google_http.MediaIoBaseDownload, 'next_chunk')

    def test_list_missing(self):
        # a 404 while listing should not propagate
        self.list_req_mock.execute.side_effect = _http_exception(404)

        list(self.fs._ls_detailed(self.gcs_path))

    def test_list_actual_error(self):
        # a 500 while listing should propagate
        self.list_req_mock.execute.side_effect = _http_exception(500)

        with self.assertRaises(google_http.HttpError):
            list(self.fs._ls_detailed(self.gcs_path))

    def test_download_io_empty_file(self):
        # a 416 while downloading should leave the buffer empty
        buf = io.BytesIO()

        with self.next_chunk_patch as next_chunk:
            next_chunk.side_effect = _http_exception(416)

            self.fs._download_io(self.gcs_path, buf)
            self.assertEqual(len(buf.getvalue()), 0)

    def test_download_io_actual_error(self):
        # a 500 while downloading should propagate
        buf = io.BytesIO()

        with self.next_chunk_patch as next_chunk:
            next_chunk.side_effect = _http_exception(500)

            with self.assertRaises(google_http.HttpError):
                self.fs._download_io(self.gcs_path, buf)
class GCSFSTestCase(MockGoogleTestCase):
    """Tests for ls/du/exists/md5sum/mkdir/put/rm on ``GCSFilesystem``."""

    def setUp(self):
        super(GCSFSTestCase, self).setUp()
        self.fs = GCSFilesystem()

    def test_ls_blob(self):
        blob_uri = 'gs://walrus/data/foo'
        self.put_gcs_multi({blob_uri: b''})

        self.assertEqual(list(self.fs.ls(blob_uri)), [blob_uri])

    def test_ls_missing(self):
        listing = list(self.fs.ls('gs://nope/not/here'))
        self.assertEqual(listing, [])

    def test_ls_ignores_dirs(self):
        # Dataproc (i.e. Hadoop) will create empty blobs whose names end
        # in '/'
        self.put_gcs_multi({
            'gs://walrus/data/foo/': b'',
            'gs://walrus/data/foo/bar': b'baz',
        })

        listing = list(self.fs.ls('gs://walrus/data'))
        self.assertEqual(listing, ['gs://walrus/data/foo/bar'])

    def test_ls_recursively(self):
        uris = [
            'gs://walrus/data/bar',
            'gs://walrus/data/bar/baz',
            'gs://walrus/data/foo',
            'gs://walrus/qux',
        ]
        self.put_gcs_multi(dict.fromkeys(uris, b''))

        whole_bucket = set(uris)
        under_data = set(uris[:-1])

        for path in ('gs://walrus/', 'gs://walrus/*'):
            self.assertEqual(set(self.fs.ls(path)), whole_bucket)

        for path in ('gs://walrus/data',
                     'gs://walrus/data/',
                     'gs://walrus/data/*'):
            self.assertEqual(set(self.fs.ls(path)), under_data)

    def test_ls_globs(self):
        blob_uris = ['gs://w/a', 'gs://w/a/b', 'gs://w/ab', 'gs://w/b']
        self.put_gcs_multi(dict.fromkeys(blob_uris, b''))

        self.assertEqual(set(self.fs.ls('gs://w/')), set(blob_uris))
        self.assertEqual(set(self.fs.ls('gs://w/*')), set(blob_uris))
        self.assertEqual(list(self.fs.ls('gs://w/*/')), ['gs://w/a/b'])
        self.assertEqual(list(self.fs.ls('gs://w/*/*')), ['gs://w/a/b'])
        self.assertEqual(list(self.fs.ls('gs://w/a?')), ['gs://w/ab'])
        # * can match /
        self.assertEqual(set(self.fs.ls('gs://w/a*')),
                         {'gs://w/a', 'gs://w/a/b', 'gs://w/ab'})
        self.assertEqual(set(self.fs.ls('gs://w/*b')),
                         {'gs://w/a/b', 'gs://w/ab', 'gs://w/b'})

    def test_du(self):
        self.put_gcs_multi({
            'gs://walrus/data/foo': b'abcde',
            'gs://walrus/data/bar/baz': b'fgh',
        })

        expected_sizes = [
            ('gs://walrus/', 8),
            ('gs://walrus/data/foo', 5),
            ('gs://walrus/data/bar/baz', 3),
        ]
        for uri, size in expected_sizes:
            self.assertEqual(self.fs.du(uri), size)

    def test_exists(self):
        present = 'gs://walrus/data/foo'
        absent = 'gs://walrus/data/bar'
        self.put_gcs_multi({present: b'abcd'})

        self.assertEqual(self.fs.exists(present), True)
        self.assertEqual(self.fs.exists(absent), False)

    def test_md5sum(self):
        uri = 'gs://walrus/data/foo'
        payload = b'abcd'
        self.put_gcs_multi({uri: payload})

        self.assertEqual(self.fs.md5sum(uri), md5(payload).hexdigest())

    def test_md5sum_of_missing_blob(self):
        self.put_gcs_multi({'gs://walrus/data/foo': b'abcd'})

        with self.assertRaises(IOError):
            self.fs.md5sum('gs://walrus/data/bar')

    def test_mkdir_creates_buckets(self):
        self.assertNotIn('walrus', self.mock_gcs_fs)

        self.fs.mkdir('gs://walrus/data')

        self.assertIn('walrus', self.mock_gcs_fs)

    def test_mkdir_does_not_create_directories(self):
        bucket_uri = 'gs://walrus/'
        self.fs.create_bucket('walrus')
        self.assertEqual(list(self.fs.ls(bucket_uri)), [])

        self.fs.mkdir('gs://walrus/data')

        # the bucket should still list as empty
        self.assertEqual(list(self.fs.ls(bucket_uri)), [])

    def test_put(self):
        src_path = self.makefile('foo', contents=b'bar')
        dest_uri = 'gs://bar-files/foo'
        self.storage_client().bucket('bar-files').create()

        self.fs.put(src_path, dest_uri)

        self.assertEqual(b''.join(self.fs.cat(dest_uri)), b'bar')

    def test_put_with_part_size(self):
        # part_size given at init should be handed to _blob() as chunk_size
        src_path = self.makefile('foo', contents=b'bar')
        dest_uri = 'gs://bar-files/foo'
        self.storage_client().bucket('bar-files').create()

        part_size_fs = GCSFilesystem(part_size=12345)

        with patch.object(GCSFilesystem, '_blob') as mock_blob:
            part_size_fs.put(src_path, dest_uri)

        mock_blob.assert_called_once_with(dest_uri, chunk_size=12345)

    def test_put_chunk_size(self):
        # chunk_size passed to put() should reach _blob(), and a warning
        # should be logged
        src_path = self.makefile('foo', contents=b'bar')
        dest_uri = 'gs://bar-files/foo'
        self.storage_client().bucket('bar-files').create()

        with patch.object(GCSFilesystem, '_blob') as mock_blob:
            with patch('mrjob.fs.gcs.log') as mock_log:
                self.fs.put(src_path, dest_uri, chunk_size=99999)

                mock_blob.assert_called_once_with(
                    dest_uri, chunk_size=99999)
                self.assertTrue(mock_log.warning.called)

    def test_rm(self):
        uri = 'gs://walrus/foo'
        self.put_gcs_multi({uri: b''})
        self.assertEqual(self.fs.exists(uri), True)

        self.fs.rm(uri)

        self.assertEqual(self.fs.exists(uri), False)

    def test_rm_dir(self):
        blob_uris = ['gs://walrus/data/foo', 'gs://walrus/data/bar/baz']
        self.put_gcs_multi(dict.fromkeys(blob_uris, b''))
        for uri in blob_uris:
            self.assertEqual(self.fs.exists(uri), True)

        self.fs.rm('gs://walrus/data')

        for uri in blob_uris:
            self.assertEqual(self.fs.exists(uri), False)
class MockGCSClient(object):
    """Mock out GCSClient...

    TARGET API VERSION - Storage API v1

    Emulates GCS metadata and stores raw bytes
    Contains convenience functions for initializing items in GCS
    """

    def __init__(self, test_case):
        """Set up empty object/bucket caches and the sub-clients.

        :param test_case: the owning test case; must be a
                          ``MockGoogleAPITestCase``
        """
        assert isinstance(test_case, MockGoogleAPITestCase)

        self._test_case = test_case
        self._fs = GCSFilesystem()

        # in-memory stand-ins for GCS state
        self._cache_objects = dict()
        self._cache_buckets = dict()

        # the sub-clients receive this instance, so build them last
        self._client_objects = MockGCSClientObjects(self)
        self._client_buckets = MockGCSClientBuckets(self)

    def objects(self):
        """Return the mock sub-client for object operations."""
        return self._client_objects

    def buckets(self):
        """Return the mock sub-client for bucket operations."""
        return self._client_buckets

    def put_gcs(self, gcs_uri, data):
        """Put data at gcs_uri, creating a bucket if necessary"""
        # only the bucket is needed here; upload_io() re-parses the URI
        bucket, _ = parse_gcs_uri(gcs_uri)

        try:
            self._fs.get_bucket(bucket)
        except google_errors.HttpError:
            self._fs.create_bucket(project=_TEST_PROJECT, name=bucket)

        bytes_io_obj = BytesIO(data)
        self.upload_io(bytes_io_obj, gcs_uri)

    def put_gcs_multi(self, gcs_uri_to_data_map):
        """Bulk put data at gcs_uris"""
        for gcs_uri, data in gcs_uri_to_data_map.items():
            self.put_gcs(gcs_uri, data)

    def download_io(self, src_uri, io_obj):
        """Clobber GCSFilesystem._download_io

        Writes the cached bytes for *src_uri* into *io_obj* and returns
        *io_obj*.

        :raises Exception: if no object is cached for *src_uri*
        """
        bucket, name = parse_gcs_uri(src_uri)

        object_dict = _get_deep(self._cache_objects, [bucket, name])
        if not object_dict:
            # same exception type as before, but say which URI was missing
            raise Exception('no mock GCS object at %s' % src_uri)

        object_data = object_dict['_data']
        io_obj.write(object_data)
        return io_obj

    def upload_io(self, io_obj, dest_uri):
        """Clobber GCSFilesystem._upload_io

        Reads all of *io_obj* (from its start), caches the data under
        *dest_uri* and returns the object response. The bucket must
        already be in the bucket cache.
        """
        bucket, name = parse_gcs_uri(dest_uri)

        assert bucket in self._cache_buckets

        io_obj.seek(0)

        data = io_obj.read()

        # TODO - io_obj.close() ? Not sure if callers of this function would
        # expect their io_objs to be closed

        object_resp = _insert_object_resp(bucket=bucket, name=name, data=data)

        _set_deep(self._cache_objects, [bucket, name], object_resp)

        return object_resp
class GCSFSTestCase(MockGoogleTestCase):
    """Tests for ls/du/exists/md5sum/rm on ``GCSFilesystem``."""

    def setUp(self):
        super(GCSFSTestCase, self).setUp()
        self.fs = GCSFilesystem()

    def test_ls_blob(self):
        blob_uri = 'gs://walrus/data/foo'
        self.put_gcs_multi({blob_uri: b''})

        self.assertEqual(list(self.fs.ls(blob_uri)), [blob_uri])

    def test_ls_missing(self):
        listing = list(self.fs.ls('gs://nope/not/here'))
        self.assertEqual(listing, [])

    def test_ls_ignores_dirs(self):
        # Dataproc (i.e. Hadoop) will create empty blobs whose names end
        # in '/'
        self.put_gcs_multi({
            'gs://walrus/data/foo/': b'',
            'gs://walrus/data/foo/bar': b'baz',
        })

        listing = list(self.fs.ls('gs://walrus/data'))
        self.assertEqual(listing, ['gs://walrus/data/foo/bar'])

    def test_ls_recursively(self):
        uris = [
            'gs://walrus/data/bar',
            'gs://walrus/data/bar/baz',
            'gs://walrus/data/foo',
            'gs://walrus/qux',
        ]
        self.put_gcs_multi(dict.fromkeys(uris, b''))

        whole_bucket = set(uris)
        under_data = set(uris[:-1])

        for path in ('gs://walrus/', 'gs://walrus/*'):
            self.assertEqual(set(self.fs.ls(path)), whole_bucket)

        for path in ('gs://walrus/data',
                     'gs://walrus/data/',
                     'gs://walrus/data/*'):
            self.assertEqual(set(self.fs.ls(path)), under_data)

    def test_ls_globs(self):
        blob_uris = ['gs://w/a', 'gs://w/a/b', 'gs://w/ab', 'gs://w/b']
        self.put_gcs_multi(dict.fromkeys(blob_uris, b''))

        self.assertEqual(set(self.fs.ls('gs://w/')), set(blob_uris))
        self.assertEqual(set(self.fs.ls('gs://w/*')), set(blob_uris))
        self.assertEqual(list(self.fs.ls('gs://w/*/')), ['gs://w/a/b'])
        self.assertEqual(list(self.fs.ls('gs://w/*/*')), ['gs://w/a/b'])
        self.assertEqual(list(self.fs.ls('gs://w/a?')), ['gs://w/ab'])
        # * can match /
        self.assertEqual(set(self.fs.ls('gs://w/a*')),
                         {'gs://w/a', 'gs://w/a/b', 'gs://w/ab'})
        self.assertEqual(set(self.fs.ls('gs://w/*b')),
                         {'gs://w/a/b', 'gs://w/ab', 'gs://w/b'})

    def test_du(self):
        self.put_gcs_multi({
            'gs://walrus/data/foo': b'abcde',
            'gs://walrus/data/bar/baz': b'fgh',
        })

        expected_sizes = [
            ('gs://walrus/', 8),
            ('gs://walrus/data/foo', 5),
            ('gs://walrus/data/bar/baz', 3),
        ]
        for uri, size in expected_sizes:
            self.assertEqual(self.fs.du(uri), size)

    def test_exists(self):
        present = 'gs://walrus/data/foo'
        absent = 'gs://walrus/data/bar'
        self.put_gcs_multi({present: b'abcd'})

        self.assertEqual(self.fs.exists(present), True)
        self.assertEqual(self.fs.exists(absent), False)

    def test_md5sum(self):
        uri = 'gs://walrus/data/foo'
        payload = b'abcd'
        self.put_gcs_multi({uri: payload})

        self.assertEqual(self.fs.md5sum(uri), md5(payload).hexdigest())

    def test_md5sum_of_missing_blob(self):
        self.put_gcs_multi({'gs://walrus/data/foo': b'abcd'})

        with self.assertRaises(IOError):
            self.fs.md5sum('gs://walrus/data/bar')

    def test_rm(self):
        uri = 'gs://walrus/foo'
        self.put_gcs_multi({uri: b''})
        self.assertEqual(self.fs.exists(uri), True)

        self.fs.rm(uri)

        self.assertEqual(self.fs.exists(uri), False)

    def test_rm_dir(self):
        blob_uris = ['gs://walrus/data/foo', 'gs://walrus/data/bar/baz']
        self.put_gcs_multi(dict.fromkeys(blob_uris, b''))
        for uri in blob_uris:
            self.assertEqual(self.fs.exists(uri), True)

        self.fs.rm('gs://walrus/data')

        for uri in blob_uris:
            self.assertEqual(self.fs.exists(uri), False)
def test_default(self):
    # with no arguments: no warning, and a client built with
    # project=None and credentials=None
    gcs = GCSFilesystem()

    self.assertFalse(self.log.warning.called)

    actual_client = gcs.client
    expected_client = self.Client(project=None, credentials=None)
    self.assertEqual(actual_client, expected_client)
def _make_bucket(self, name, location=None):
    """Create bucket *name* (optionally in *location*) through a fresh
    ``GCSFilesystem``."""
    GCSFilesystem().create_bucket(name, location=location)
def setUp(self):
    """Run the base fixture setup, then give each test a fresh
    ``GCSFilesystem`` as ``self.fs``."""
    super(CatTestCase, self).setUp()

    gcs_fs = GCSFilesystem()
    self.fs = gcs_fs
class GCSFSTestCase(MockGoogleTestCase):
    """Tests for ls/du/exists/md5sum/mkdir/put/rm on ``GCSFilesystem``."""

    def setUp(self):
        super(GCSFSTestCase, self).setUp()
        self.fs = GCSFilesystem()

    def test_ls_blob(self):
        blob_uri = 'gs://walrus/data/foo'
        self.put_gcs_multi({blob_uri: b''})

        self.assertEqual(list(self.fs.ls(blob_uri)), [blob_uri])

    def test_ls_missing(self):
        listing = list(self.fs.ls('gs://nope/not/here'))
        self.assertEqual(listing, [])

    def test_ls_ignores_dirs(self):
        # Dataproc (i.e. Hadoop) will create empty blobs whose names end
        # in '/'
        self.put_gcs_multi({
            'gs://walrus/data/foo/': b'',
            'gs://walrus/data/foo/bar': b'baz',
        })

        listing = list(self.fs.ls('gs://walrus/data'))
        self.assertEqual(listing, ['gs://walrus/data/foo/bar'])

    def test_ls_recursively(self):
        uris = [
            'gs://walrus/data/bar',
            'gs://walrus/data/bar/baz',
            'gs://walrus/data/foo',
            'gs://walrus/qux',
        ]
        self.put_gcs_multi(dict.fromkeys(uris, b''))

        whole_bucket = set(uris)
        under_data = set(uris[:-1])

        for path in ('gs://walrus/', 'gs://walrus/*'):
            self.assertEqual(set(self.fs.ls(path)), whole_bucket)

        for path in ('gs://walrus/data',
                     'gs://walrus/data/',
                     'gs://walrus/data/*'):
            self.assertEqual(set(self.fs.ls(path)), under_data)

    def test_ls_globs(self):
        blob_uris = ['gs://w/a', 'gs://w/a/b', 'gs://w/ab', 'gs://w/b']
        self.put_gcs_multi(dict.fromkeys(blob_uris, b''))

        self.assertEqual(set(self.fs.ls('gs://w/')), set(blob_uris))
        self.assertEqual(set(self.fs.ls('gs://w/*')), set(blob_uris))
        self.assertEqual(list(self.fs.ls('gs://w/*/')), ['gs://w/a/b'])
        self.assertEqual(list(self.fs.ls('gs://w/*/*')), ['gs://w/a/b'])
        self.assertEqual(list(self.fs.ls('gs://w/a?')), ['gs://w/ab'])
        # * can match /
        self.assertEqual(set(self.fs.ls('gs://w/a*')),
                         {'gs://w/a', 'gs://w/a/b', 'gs://w/ab'})
        self.assertEqual(set(self.fs.ls('gs://w/*b')),
                         {'gs://w/a/b', 'gs://w/ab', 'gs://w/b'})

    def test_du(self):
        self.put_gcs_multi({
            'gs://walrus/data/foo': b'abcde',
            'gs://walrus/data/bar/baz': b'fgh',
        })

        expected_sizes = [
            ('gs://walrus/', 8),
            ('gs://walrus/data/foo', 5),
            ('gs://walrus/data/bar/baz', 3),
        ]
        for uri, size in expected_sizes:
            self.assertEqual(self.fs.du(uri), size)

    def test_exists(self):
        present = 'gs://walrus/data/foo'
        absent = 'gs://walrus/data/bar'
        self.put_gcs_multi({present: b'abcd'})

        self.assertEqual(self.fs.exists(present), True)
        self.assertEqual(self.fs.exists(absent), False)

    def test_md5sum(self):
        uri = 'gs://walrus/data/foo'
        payload = b'abcd'
        self.put_gcs_multi({uri: payload})

        self.assertEqual(self.fs.md5sum(uri), md5(payload).hexdigest())

    def test_md5sum_of_missing_blob(self):
        self.put_gcs_multi({'gs://walrus/data/foo': b'abcd'})

        with self.assertRaises(IOError):
            self.fs.md5sum('gs://walrus/data/bar')

    def test_mkdir_creates_buckets(self):
        self.assertNotIn('walrus', self.mock_gcs_fs)

        self.fs.mkdir('gs://walrus/data')

        self.assertIn('walrus', self.mock_gcs_fs)

    def test_mkdir_does_not_create_directories(self):
        bucket_uri = 'gs://walrus/'
        self.fs.create_bucket('walrus')
        self.assertEqual(list(self.fs.ls(bucket_uri)), [])

        self.fs.mkdir('gs://walrus/data')

        # the bucket should still list as empty
        self.assertEqual(list(self.fs.ls(bucket_uri)), [])

    def test_put(self):
        src_path = self.makefile('foo', contents=b'bar')
        dest_uri = 'gs://bar-files/foo'
        self.storage_client().bucket('bar-files').create()

        self.fs.put(src_path, dest_uri)

        self.assertEqual(b''.join(self.fs.cat(dest_uri)), b'bar')

    def test_put_with_part_size(self):
        # part_size given at init should be handed to _blob() as chunk_size
        src_path = self.makefile('foo', contents=b'bar')
        dest_uri = 'gs://bar-files/foo'
        self.storage_client().bucket('bar-files').create()

        part_size_fs = GCSFilesystem(part_size=12345)

        with patch.object(GCSFilesystem, '_blob') as mock_blob:
            part_size_fs.put(src_path, dest_uri)

        mock_blob.assert_called_once_with(dest_uri, chunk_size=12345)

    def test_put_chunk_size(self):
        # chunk_size passed to put() should reach _blob(), and a warning
        # should be logged
        src_path = self.makefile('foo', contents=b'bar')
        dest_uri = 'gs://bar-files/foo'
        self.storage_client().bucket('bar-files').create()

        with patch.object(GCSFilesystem, '_blob') as mock_blob:
            with patch('mrjob.fs.gcs.log') as mock_log:
                self.fs.put(src_path, dest_uri, chunk_size=99999)

                mock_blob.assert_called_once_with(
                    dest_uri, chunk_size=99999)
                self.assertTrue(mock_log.warning.called)

    def test_rm(self):
        uri = 'gs://walrus/foo'
        self.put_gcs_multi({uri: b''})
        self.assertEqual(self.fs.exists(uri), True)

        self.fs.rm(uri)

        self.assertEqual(self.fs.exists(uri), False)

    def test_rm_dir(self):
        blob_uris = ['gs://walrus/data/foo', 'gs://walrus/data/bar/baz']
        self.put_gcs_multi(dict.fromkeys(blob_uris, b''))
        for uri in blob_uris:
            self.assertEqual(self.fs.exists(uri), True)

        self.fs.rm('gs://walrus/data')

        for uri in blob_uris:
            self.assertEqual(self.fs.exists(uri), False)
class GCSFSTestCase(MockGoogleAPITestCase):
    """Tests for cat/ls/du/exists/rm on ``GCSFilesystem`` using the mock
    API client."""

    def setUp(self):
        super(GCSFSTestCase, self).setUp()
        self.fs = GCSFilesystem()

    def test_cat_uncompressed(self):
        uri = "gs://walrus/data/foo"
        self.put_gcs_multi({uri: b"foo\nfoo\n"})

        lines = list(self.fs._cat_file(uri))
        self.assertEqual(lines, [b"foo\n", b"foo\n"])

    def test_cat_bz2(self):
        uri = "gs://walrus/data/foo.bz2"
        self.put_gcs_multi({uri: bz2.compress(b"foo\n" * 1000)})

        lines = list(self.fs._cat_file(uri))
        self.assertEqual(lines, [b"foo\n"] * 1000)

    def test_cat_gz(self):
        uri = "gs://walrus/data/foo.gz"
        self.put_gcs_multi({uri: gzip_compress(b"foo\n" * 10000)})

        lines = list(self.fs._cat_file(uri))
        self.assertEqual(lines, [b"foo\n"] * 10000)

    def test_ls_key(self):
        key_uri = "gs://walrus/data/foo"
        self.put_gcs_multi({key_uri: b""})

        self.assertEqual(list(self.fs.ls(key_uri)), [key_uri])

    def test_ls_recursively(self):
        uris = [
            "gs://walrus/data/bar",
            "gs://walrus/data/bar/baz",
            "gs://walrus/data/foo",
            "gs://walrus/qux",
        ]
        self.put_gcs_multi(dict.fromkeys(uris, b""))

        whole_bucket = set(uris)
        under_data = set(uris[:-1])

        for path in ("gs://walrus/", "gs://walrus/*"):
            self.assertEqual(set(self.fs.ls(path)), whole_bucket)

        for path in ("gs://walrus/data",
                     "gs://walrus/data/",
                     "gs://walrus/data/*"):
            self.assertEqual(set(self.fs.ls(path)), under_data)

    def test_ls_globs(self):
        key_uris = ["gs://w/a", "gs://w/a/b", "gs://w/ab", "gs://w/b"]
        self.put_gcs_multi(dict.fromkeys(key_uris, b""))

        self.assertEqual(set(self.fs.ls("gs://w/")), set(key_uris))
        self.assertEqual(set(self.fs.ls("gs://w/*")), set(key_uris))
        self.assertEqual(list(self.fs.ls("gs://w/*/")), ["gs://w/a/b"])
        self.assertEqual(list(self.fs.ls("gs://w/*/*")), ["gs://w/a/b"])
        self.assertEqual(list(self.fs.ls("gs://w/a?")), ["gs://w/ab"])
        # * can match /
        self.assertEqual(set(self.fs.ls("gs://w/a*")),
                         {"gs://w/a", "gs://w/a/b", "gs://w/ab"})
        self.assertEqual(set(self.fs.ls("gs://w/*b")),
                         {"gs://w/a/b", "gs://w/ab", "gs://w/b"})

    def test_du(self):
        self.put_gcs_multi({
            "gs://walrus/data/foo": b"abcde",
            "gs://walrus/data/bar/baz": b"fgh",
        })

        expected_sizes = [
            ("gs://walrus/", 8),
            ("gs://walrus/data/foo", 5),
            ("gs://walrus/data/bar/baz", 3),
        ]
        for uri, size in expected_sizes:
            self.assertEqual(self.fs.du(uri), size)

    def test_exists(self):
        present = "gs://walrus/data/foo"
        absent = "gs://walrus/data/bar"
        self.put_gcs_multi({present: b"abcd"})

        self.assertEqual(self.fs.exists(present), True)
        self.assertEqual(self.fs.exists(absent), False)

    def test_rm(self):
        uri = "gs://walrus/foo"
        self.put_gcs_multi({uri: b""})
        self.assertEqual(self.fs.exists(uri), True)

        self.fs.rm(uri)

        self.assertEqual(self.fs.exists(uri), False)

    def test_rm_dir(self):
        key_uris = ["gs://walrus/data/foo", "gs://walrus/data/bar/baz"]
        self.put_gcs_multi(dict.fromkeys(key_uris, b""))
        for uri in key_uris:
            self.assertEqual(self.fs.exists(uri), True)

        self.fs.rm("gs://walrus/data")

        for uri in key_uris:
            self.assertEqual(self.fs.exists(uri), False)
def setUp(self):
    """Run the base fixture setup, then give each test a fresh
    ``GCSFilesystem`` as ``self.fs``."""
    super(GCSFSTestCase, self).setUp()

    gcs_fs = GCSFilesystem()
    self.fs = gcs_fs
class GCSFSTestCase(MockGoogleAPITestCase):
    """Tests for ls/du/exists/rm on ``GCSFilesystem`` using the mock API
    client."""

    def setUp(self):
        super(GCSFSTestCase, self).setUp()
        self.fs = GCSFilesystem()

    def test_ls_key(self):
        key_uri = 'gs://walrus/data/foo'
        self.put_gcs_multi({key_uri: b''})

        self.assertEqual(list(self.fs.ls(key_uri)), [key_uri])

    def test_ls_recursively(self):
        uris = [
            'gs://walrus/data/bar',
            'gs://walrus/data/bar/baz',
            'gs://walrus/data/foo',
            'gs://walrus/qux',
        ]
        self.put_gcs_multi(dict.fromkeys(uris, b''))

        whole_bucket = set(uris)
        under_data = set(uris[:-1])

        for path in ('gs://walrus/', 'gs://walrus/*'):
            self.assertEqual(set(self.fs.ls(path)), whole_bucket)

        for path in ('gs://walrus/data',
                     'gs://walrus/data/',
                     'gs://walrus/data/*'):
            self.assertEqual(set(self.fs.ls(path)), under_data)

    def test_ls_globs(self):
        key_uris = ['gs://w/a', 'gs://w/a/b', 'gs://w/ab', 'gs://w/b']
        self.put_gcs_multi(dict.fromkeys(key_uris, b''))

        self.assertEqual(set(self.fs.ls('gs://w/')), set(key_uris))
        self.assertEqual(set(self.fs.ls('gs://w/*')), set(key_uris))
        self.assertEqual(list(self.fs.ls('gs://w/*/')), ['gs://w/a/b'])
        self.assertEqual(list(self.fs.ls('gs://w/*/*')), ['gs://w/a/b'])
        self.assertEqual(list(self.fs.ls('gs://w/a?')), ['gs://w/ab'])
        # * can match /
        self.assertEqual(set(self.fs.ls('gs://w/a*')),
                         {'gs://w/a', 'gs://w/a/b', 'gs://w/ab'})
        self.assertEqual(set(self.fs.ls('gs://w/*b')),
                         {'gs://w/a/b', 'gs://w/ab', 'gs://w/b'})

    def test_du(self):
        self.put_gcs_multi({
            'gs://walrus/data/foo': b'abcde',
            'gs://walrus/data/bar/baz': b'fgh',
        })

        expected_sizes = [
            ('gs://walrus/', 8),
            ('gs://walrus/data/foo', 5),
            ('gs://walrus/data/bar/baz', 3),
        ]
        for uri, size in expected_sizes:
            self.assertEqual(self.fs.du(uri), size)

    def test_exists(self):
        present = 'gs://walrus/data/foo'
        absent = 'gs://walrus/data/bar'
        self.put_gcs_multi({present: b'abcd'})

        self.assertEqual(self.fs.exists(present), True)
        self.assertEqual(self.fs.exists(absent), False)

    def test_rm(self):
        uri = 'gs://walrus/foo'
        self.put_gcs_multi({uri: b''})
        self.assertEqual(self.fs.exists(uri), True)

        self.fs.rm(uri)

        self.assertEqual(self.fs.exists(uri), False)

    def test_rm_dir(self):
        key_uris = ['gs://walrus/data/foo', 'gs://walrus/data/bar/baz']
        self.put_gcs_multi(dict.fromkeys(key_uris, b''))
        for uri in key_uris:
            self.assertEqual(self.fs.exists(uri), True)

        self.fs.rm('gs://walrus/data')

        for uri in key_uris:
            self.assertEqual(self.fs.exists(uri), False)