def test_get_filesystem(self):
    self.assertTrue(isinstance(FileSystems.get_filesystem('/tmp'),
                               localfilesystem.LocalFileSystem))
    self.assertTrue(isinstance(FileSystems.get_filesystem('c:\\abc\def'),  # pylint: disable=anomalous-backslash-in-string
                               localfilesystem.LocalFileSystem))
    with self.assertRaises(ValueError):
        FileSystems.get_filesystem('error://abc/def')
def test_get_filesystem(self):
    self.assertTrue(
        isinstance(FileSystems.get_filesystem('/tmp'),
                   localfilesystem.LocalFileSystem))
    self.assertTrue(
        isinstance(FileSystems.get_filesystem('c:\\abc\\def'),
                   localfilesystem.LocalFileSystem))
    with self.assertRaises(ValueError):
        FileSystems.get_filesystem('error://abc/def')
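As a quick standalone illustration of the dispatch these tests assert (assuming apache_beam is installed; the paths are illustrative), get_filesystem can be probed directly:

from apache_beam.io.filesystems import FileSystems
from apache_beam.io.localfilesystem import LocalFileSystem

# Local paths resolve to LocalFileSystem.
assert isinstance(FileSystems.get_filesystem('/tmp'), LocalFileSystem)

# Unregistered URI schemes are rejected with ValueError.
try:
    FileSystems.get_filesystem('error://abc/def')
except ValueError as exc:
    print(f"unknown scheme rejected: {exc}")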
def get_file_size(file_path):
    """
    Gets the size of the file, in bytes. If file is not found, raises a FileNotFoundError.
    """
    linked_bundle_path = parse_linked_bundle_url(file_path)
    if linked_bundle_path.uses_beam and linked_bundle_path.is_archive:
        # If no archive subpath is specified for a .tar.gz or .gz file, get the
        # uncompressed size of the entire file, or the compressed size of the
        # entire directory.
        if not linked_bundle_path.archive_subpath:
            if linked_bundle_path.is_archive_dir:
                filesystem = FileSystems.get_filesystem(linked_bundle_path.bundle_path)
                return filesystem.size(linked_bundle_path.bundle_path)
            else:
                with OpenFile(linked_bundle_path.bundle_path, 'rb') as fileobj:
                    fileobj.seek(0, os.SEEK_END)
                    return fileobj.tell()
        # If the archive file is a .tar.gz file on Azure, open the specified archive
        # subpath within the archive. If it is a .gz file on Azure, open the
        # "/contents" entry, which represents the actual gzipped file.
        with OpenIndexedArchiveFile(linked_bundle_path.bundle_path) as tf:
            assert linked_bundle_path.is_archive_dir
            fpath = "/" + linked_bundle_path.archive_subpath
            finfo = tf.getFileInfo(fpath)
            if finfo is None:
                raise FileNotFoundError(fpath)
            return finfo.size
    if not get_path_exists(file_path):
        raise FileNotFoundError(file_path)
    # Local path
    return os.stat(file_path).st_size
def path_exists(path, d_pl_options, is_dir):
    dir_path = path
    fs = FileSystems.get_filesystem(dir_path)
    if type(fs) == GCSFileSystem:
        dir_path = gcs_correct_dir_path_form(
            dir_path, d_pl_options, strip_prefix=False) if is_dir else path
    return FileSystems.exists(dir_path), dir_path
def list_dir(dir_path, d_pl_options, exclude_subdir=False):
    fs = FileSystems.get_filesystem(dir_path)
    if type(fs) == GCSFileSystem:
        return gcsio.GcsIO().list_prefix(
            gcs_correct_dir_path_form(dir_path, d_pl_options, strip_prefix=False))
    else:
        return tf.io.gfile.listdir(dir_path)
def get_file_size(file_path):
    """
    Gets the size of the file, in bytes. If file is not found, raises a FileNotFoundError.
    """
    if not get_path_exists(file_path):
        raise FileNotFoundError(file_path)
    # TODO: add a FileSystems.size() method to Apache Beam to make this less verbose.
    filesystem = FileSystems.get_filesystem(file_path)
    return filesystem.size(file_path)
def make_dirs(path, d_pl_options):
    fs = FileSystems.get_filesystem(path)
    if type(fs) == GCSFileSystem:
        gcs_form_path = gcs_correct_dir_path_form(path, d_pl_options, strip_prefix=True)
        blob_path = get_gcs_bucket(d_pl_options).blob(gcs_form_path)
        # GCS has no real directories, so create a zero-byte placeholder object.
        blob_path_create_result = blob_path.upload_from_string(
            '', content_type='application/x-www-form-urlencoded;charset=UTF-8')
        return blob_path_create_result
    else:
        dir_creation_result = None
        try:
            dir_creation_result = FileSystems.mkdirs(path)
        except Exception:
            # Swallow failures (e.g. the directory already exists) and return None.
            pass
        return dir_creation_result
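The GCS branch above relies on that placeholder-object trick. A minimal standalone sketch of the same idea with the google-cloud-storage client (the bucket and prefix names here are hypothetical):

from google.cloud import storage

def make_gcs_dir_placeholder(bucket_name, dir_prefix):
    # GCS has no directories; an empty object whose name ends in '/' stands in for one.
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(dir_prefix.rstrip('/') + '/')
    blob.upload_from_string('', content_type='application/x-www-form-urlencoded;charset=UTF-8')
    return blob

# make_gcs_dir_placeholder('my-hypothetical-bucket', 'datasets/raw')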
def get_file_size(fpath):
    fs = FileSystems.get_filesystem(fpath)
    if type(fs) == GCSFileSystem:
        return gcsio.GcsIO().size(fpath)
    else:
        return FileIO(fpath, "rb").size()
def delete_file(path, d_pl_options, recursive=False, r_level=0, debug=False):
    fs = FileSystems.get_filesystem(path)
    if type(fs) == GCSFileSystem:
        gcs_client = get_gcs_client()
        if debug:
            print(f"{'-'*(r_level)} delete_file (debug): path: {path}, recursive: {recursive}")
        if recursive:
            child_paths = list_dir(path, d_pl_options, exclude_subdir=False)
            for child_path in child_paths:
                if child_path != path:
                    if debug:
                        print(f"{'-'*(r_level+1)} delete_file (debug): path {path} has child: {child_path}")
                    delete_file(child_path, d_pl_options, recursive=True, r_level=r_level + 1)
            # don't need to recurse further (gcsio deletes all leaves from the root)
        # Try each candidate form of the path in turn and delete the first blob that exists:
        # as given, gcs-corrected, prefix-stripped, and stripped + gcs-corrected.
        candidate_paths = [
            (path, "not stripped, not gcs corrected"),
            (gcs_correct_dir_path_form(path, d_pl_options, strip_prefix=False),
             "not stripped, gcs corrected"),
            (gcs_path_strip_prefix(path, d_pl_options),
             "stripped, not gcs corrected"),
            (gcs_correct_dir_path_form(path, d_pl_options, strip_prefix=True),
             "stripped, gcs corrected"),
        ]
        for candidate_path, label in candidate_paths:
            blob_path = get_gcs_bucket(d_pl_options).blob(candidate_path)
            candidate_exists = blob_path.exists(gcs_client)
            if debug:
                print(f"{'-'*(r_level)} {candidate_path} ({label}): {blob_path}, exists: {candidate_exists}")
            if candidate_exists:
                blob_path_delete_result = blob_path.delete(gcs_client)
                if debug:
                    print(f"{'-'*(r_level)} {candidate_path} ({label}): {blob_path}, exists: {blob_path.exists(gcs_client)} (after delete attempt)")
                return blob_path_delete_result
        if debug:
            print(f"{'-'*(r_level)} out of options trying to delete base path {path}!")
        return False
    else:
        return FileSystems.delete([path])
def open_file_write(fpath):
    fs = FileSystems.get_filesystem(fpath)
    if type(fs) == GCSFileSystem:
        return gcsio.GcsIO().open(fpath, mode='w')
    else:
        return FileSystems.create(fpath)
def open_file_read(fpath):
    fs = FileSystems.get_filesystem(fpath)
    if type(fs) == GCSFileSystem:
        return gcsio.GcsIO().open(fpath)
    else:
        return FileSystems.open(fpath)
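For comparison, a minimal standalone sketch (not part of the helpers above) that leans on FileSystems' own scheme dispatch for simple reads and writes; the /tmp paths are illustrative:

from apache_beam.io.filesystems import FileSystems

def copy_small_file(src_path, dst_path):
    # FileSystems.open/create pick the backend (local, GCS, ...) from the path
    # scheme, so the same call works for '/tmp/...' and 'gs://...' paths.
    with FileSystems.open(src_path) as src, FileSystems.create(dst_path) as dst:
        dst.write(src.read())

if __name__ == '__main__':
    with FileSystems.create('/tmp/fs_demo.txt') as f:
        f.write(b'hello')
    copy_small_file('/tmp/fs_demo.txt', '/tmp/fs_demo_copy.txt')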