def _rename_gcs_batch(src_dest_pairs):
  # Prepare batches.
  gcs_batches = []
  gcs_current_batch = []
  for src, dest in src_dest_pairs:
    gcs_current_batch.append((src, dest))
    if len(gcs_current_batch) == gcsio.MAX_BATCH_OPERATION_SIZE:
      gcs_batches.append(gcs_current_batch)
      gcs_current_batch = []
  if gcs_current_batch:
    gcs_batches.append(gcs_current_batch)

  # Execute GCS renames if any and return exceptions.
  exceptions = []
  for batch in gcs_batches:
    copy_statuses = gcsio.GcsIO().copy_batch(batch)
    copy_succeeded = []
    for src, dest, exception in copy_statuses:
      if exception:
        exceptions.append((src, dest, exception))
      else:
        copy_succeeded.append((src, dest))
    delete_batch = [src for src, dest in copy_succeeded]
    delete_statuses = gcsio.GcsIO().delete_batch(delete_batch)
    for i, (src, exception) in enumerate(delete_statuses):
      # copy_succeeded[i] is a (src, dest) pair; report the destination path.
      dest = copy_succeeded[i][1]
      if exception:
        exceptions.append((src, dest, exception))
  return exceptions
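# Hypothetical usage sketch (not part of the original source; the bucket and
# object paths below are made up). It shows how the returned
# (src, dest, exception) triples could be inspected after a batch rename.
failures = _rename_gcs_batch([
    ('gs://bucket/tmp/shard-0', 'gs://bucket/output/shard-0'),
    ('gs://bucket/tmp/shard-1', 'gs://bucket/output/shard-1'),
])
for src, dest, exception in failures:
  logging.error('Failed to rename %s to %s: %s', src, dest, exception)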
def open_local_or_gcs(path, mode):
  """Opens the given path."""
  if path.startswith('gs://'):
    try:
      return gcsio.GcsIO().open(path, mode)
    except Exception as e:  # pylint: disable=broad-except
      # Currently we retry exactly once, to work around flaky gcs calls.
      logging.error('Retrying after exception reading gcs file: %s', e)
      time.sleep(10)
      return gcsio.GcsIO().open(path, mode)
  else:
    return open(path, mode)
def size_of_files_in_glob(path, file_names=None):
  """Returns a map of file names to sizes.

  Args:
    path: a file path pattern that reads the size of all the files.
    file_names: list of file names that we need sizes for; this is added to
      support eventually consistent sources, where two expansions of the glob
      might yield different files.
  """
  if path.startswith('gs://'):
    file_sizes = gcsio.GcsIO().size_of_files_in_glob(path)
    if file_names is None:
      return file_sizes
    else:
      result = {}
      # We need to make sure we fetched the size for all the files as the
      # list API in GCS is eventually consistent so directly call size for
      # any files that may be missing.
      for file_name in file_names:
        if file_name in file_sizes:
          result[file_name] = file_sizes[file_name]
        else:
          result[file_name] = ChannelFactory.size_in_bytes(file_name)
      return result
  else:
    if file_names is None:
      file_names = ChannelFactory.glob(path)
    return {
        file_name: ChannelFactory.size_in_bytes(file_name)
        for file_name in file_names
    }
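# Hypothetical usage sketch (bucket and paths are made up). The helper appears
# to be a ChannelFactory staticmethod, since it calls other ChannelFactory
# helpers; passing an explicit file list re-checks sizes for files a stale,
# eventually consistent GCS listing may have missed.
sizes = ChannelFactory.size_of_files_in_glob(
    'gs://bucket/output/part-*',
    file_names=['gs://bucket/output/part-00000-of-00010'])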
def glob_files(path):
  """Glob the given path."""
  if path.startswith('gs://'):
    return gcsio.GcsIO().glob(path)
  else:
    return glob.glob(path)
def rm(path):
  if path.startswith('gs://'):
    gcsio.GcsIO().delete(path)
  else:
    try:
      os.remove(path)
    except OSError as err:
      raise IOError(err)
def size_in_bytes(path):
  """Returns the size of a file in bytes.

  Args:
    path: a string that gives the path of a single file.
  """
  if path.startswith('gs://'):
    return gcsio.GcsIO().size(path)
  else:
    return os.path.getsize(path)
def rename(src, dest):
  if src.startswith('gs://'):
    if not dest.startswith('gs://'):
      raise ValueError('Destination %r must be GCS path.' % dest)
    gcsio.GcsIO().rename(src, dest)
  else:
    try:
      os.rename(src, dest)
    except OSError as err:
      raise IOError(err)
def setUp(self):
  with open(testdata_util.get_full_file_path('Y.vcf.bgz'),
            mode='rb') as file_to_read:
    data = file_to_read.readlines()
  self._data = b''.join(data)
  self.client = gcsio_test.FakeGcsClient()
  self.gcs = gcsio.GcsIO(self.client)
  self._file_name = 'gs://bucket/test'
  bucket, name = gcsio.parse_gcs_path(self._file_name)
  self.client.objects.add_file(
      gcsio_test.FakeFile(bucket, name, self._data, 1))
def rmdir(path):
  if path.startswith('gs://'):
    gcs = gcsio.GcsIO()
    if not path.endswith('/'):
      path += '/'
    # TODO: Threadpool?
    for entry in gcs.glob(path + '*'):
      gcs.delete(entry)
  else:
    try:
      shutil.rmtree(path)
    except OSError as err:
      raise IOError(err)
def copytree(src, dest):
  if src.startswith('gs://'):
    if not dest.startswith('gs://'):
      raise ValueError('Destination %r must be GCS path.' % dest)
    assert src.endswith('/'), src
    assert dest.endswith('/'), dest
    gcsio.GcsIO().copytree(src, dest)
  else:
    try:
      if os.path.exists(dest):
        shutil.rmtree(dest)
      shutil.copytree(src, dest)
    except OSError as err:
      raise IOError(err)
def open(path,
         mode,
         mime_type='application/octet-stream',
         compression_type=CompressionTypes.AUTO):
  if compression_type == CompressionTypes.AUTO:
    compression_type = CompressionTypes.detect_compression_type(path)
  elif not CompressionTypes.is_valid_compression_type(compression_type):
    raise TypeError('compression_type must be CompressionType object but '
                    'was %s' % type(compression_type))
  if path.startswith('gs://'):
    raw_file = gcsio.GcsIO().open(
        path,
        mode,
        mime_type=CompressionTypes.mime_type(compression_type, mime_type))
  else:
    # Builtin open(); in the original source this is a staticmethod, so the
    # name does not resolve to the method itself.
    raw_file = open(path, mode)

  if compression_type == CompressionTypes.UNCOMPRESSED:
    return raw_file
  else:
    return _CompressedFile(raw_file, compression_type=compression_type)
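# Hypothetical usage sketch (object name is made up; assumes this is
# ChannelFactory.open, as the other snippets suggest). With
# CompressionTypes.AUTO the compression is detected from the file extension,
# so a .gz object comes back wrapped for transparent decompression.
f = ChannelFactory.open('gs://bucket/logs/events.gz', 'rb')
first_line = f.readline()
f.close()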
def setUp(self):
  self.client = FakeGcsClient()
  self.gcs = gcsio.GcsIO(self.client)
def glob(path, limit=None):
  if path.startswith('gs://'):
    return gcsio.GcsIO().glob(path, limit)
  else:
    files = glob.glob(path)
    return files[:limit]
def exists(path):
  if path.startswith('gs://'):
    return gcsio.GcsIO().exists(path)
  else:
    return os.path.exists(path)
def glob_files(path):
  if path.startswith('gs://'):
    return gcsio.GcsIO().glob(path)
  else:
    return glob.glob(path)
def file_exists(path):
  """Returns whether the file exists."""
  if path.startswith('gs://'):
    return gcsio.GcsIO().exists(path)
  else:
    return os.path.exists(path)