def write_processed_image(img_and_metadata, output_dir):
    """Encode the image as a png and write to Google Cloud Storage.

    Creates a function that will read processed images and save them to png
    files in directory output_dir. The output image filename is given by a
    unique index + '_' + randnum to the 3rd decimal place + '_' + label +
    '.png'. The index is also prefix-filled with zeros so that it is at
    least length 6, which maintains consistent numeric and lexicographical
    orderings of filenames by the index field for up to 1 million images.

    Args:
        img_and_metadata: image, index, randnum, and label
        output_dir: output image directory

    Returns:
        [nothing] - just writes to the file destination
    """
    # Construct the image filename.
    img_filename = (
        str(img_and_metadata[INDEX_KEY]).zfill(6) + '_' +
        '{0:.3f}'.format(img_and_metadata[RAND_KEY]).split('.')[1] + '_' +
        str(img_and_metadata[LABEL_KEY]) + '.png')

    # Encode the image to png.
    png_image = cv2.imencode('.png', img_and_metadata[IMAGE_KEY])[1].tobytes()

    # Use the beam.io.filesystems package to create a local or GCS file,
    # and write the image.
    with FileSystems.create(output_dir + '/' + img_filename) as f:
        f.write(png_image)
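# A minimal sketch of how this sink function might be applied from a Beam
# pipeline. The input element and output directory are hypothetical (and the
# directory is assumed to exist); IMAGE_KEY, INDEX_KEY, RAND_KEY, and
# LABEL_KEY are the module-level constants referenced above.
import apache_beam as beam
import numpy as np

processed_images = [{
    IMAGE_KEY: np.zeros((4, 4, 3), dtype=np.uint8),
    INDEX_KEY: 0,
    RAND_KEY: 0.123,
    LABEL_KEY: 'cat',
}]

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(processed_images)
        | beam.Map(write_processed_image, output_dir='/tmp/images')
    )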
def open(self, temp_path):
    """Opens ``temp_path``, returning an opaque file handle object.

    The returned file handle is passed to ``write_[encoded_]record``
    and ``close``.
    """
    return FileSystems.create(
        temp_path, self.mime_type, self.compression_type)
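# A round-trip sketch of the FileSystems API this method wraps (the path is
# hypothetical). FileSystems.create works the same way for local paths and
# for gs://, s3://, azfs://, etc.; compression defaults to
# CompressionTypes.AUTO, inferred from the file extension.
from apache_beam.io.filesystems import FileSystems

path = '/tmp/example.txt'  # hypothetical path
with FileSystems.create(path, mime_type='text/plain') as f:
    f.write(b'hello')
with FileSystems.open(path) as f:
    assert f.read() == b'hello'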
def create_txt_file(self, contents=b"hello world"):
    """Creates a txt file and returns its path."""
    bundle_uuid = str(random.random())
    bundle_path = f"azfs://storageclwsdev0/bundles/{bundle_uuid}/test.txt"
    with FileSystems.create(
        bundle_path, compression_type=CompressionTypes.UNCOMPRESSED
    ) as f:
        f.write(contents)
    return bundle_uuid, bundle_path
def create_directory(self):
    """Creates a directory (stored as a .tar.gz with an index.sqlite index
    file) and returns its path."""
    bundle_uuid = str(random.random())
    bundle_path = f"azfs://storageclwsdev0/bundles/{bundle_uuid}/contents.tar.gz"

    def writestr(tf, name, contents):
        tinfo = tarfile.TarInfo(name)
        tinfo.size = len(contents)
        tf.addfile(tinfo, BytesIO(contents.encode()))

    def writedir(tf, name):
        tinfo = tarfile.TarInfo(name)
        tinfo.type = tarfile.DIRTYPE
        tf.addfile(tinfo, BytesIO())

    # TODO: Unify this code with code in UploadManager.upload_to_bundle_store().
    with FileSystems.create(
        bundle_path, compression_type=CompressionTypes.UNCOMPRESSED
    ) as out, tempfile.NamedTemporaryFile(
        suffix=".tar.gz"
    ) as tmp_tar_file, tempfile.NamedTemporaryFile(
        suffix=".sqlite"
    ) as tmp_index_file:
        with tarfile.open(name=tmp_tar_file.name, mode="w:gz") as tf:
            # We need to create separate entries for each directory, as a
            # regular .tar.gz file would have.
            writestr(tf, "./README.md", "hello world")
            writedir(tf, "./src")
            writestr(tf, "./src/test.sh", "echo hi")
            writedir(tf, "./dist")
            writedir(tf, "./dist/a")
            writedir(tf, "./dist/a/b")
            writestr(tf, "./dist/a/b/test2.sh", "echo two")
        shutil.copyfileobj(tmp_tar_file, out)
        with open(tmp_tar_file.name, "rb") as ttf:
            SQLiteIndexedTar(
                fileObject=ttf,
                tarFileName="contents",
                writeIndex=True,
                clearIndexCache=True,
                indexFilePath=tmp_index_file.name,
            )
        with FileSystems.create(
            parse_linked_bundle_url(bundle_path).index_path,
            compression_type=CompressionTypes.UNCOMPRESSED,
        ) as out_index_file, open(tmp_index_file.name, "rb") as tif:
            shutil.copyfileobj(tif, out_index_file)
    return bundle_uuid, bundle_path
def _get_file_for_element(self, element):
    key_name = element[0]
    if key_name in self.open_files:
        return self.open_files[key_name], None
    file_path = self.get_path_for_key_name(key_name)
    file_handle = FileSystems.create(file_path, mime_type='text/json')
    self.open_files[key_name] = file_handle
    return file_handle, file_path
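# The handles cached in self.open_files must eventually be closed. A minimal
# cleanup sketch, assuming this method lives in a beam.DoFn (finish_bundle is
# the standard Beam hook, but this exact cleanup is an assumption, not code
# from the original project):
def finish_bundle(self):
    for file_handle in self.open_files.values():
        file_handle.close()
    self.open_files = {}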
def process(self, header):
    # type: (VcfHeader) -> None
    with FileSystems.create(self._file_path) as self._file_to_write:
        self._write_headers_by_type(HeaderTypeConstants.INFO, header.infos)
        self._write_headers_by_type(HeaderTypeConstants.FILTER, header.filters)
        self._write_headers_by_type(HeaderTypeConstants.ALT, header.alts)
        self._write_headers_by_type(HeaderTypeConstants.FORMAT, header.formats)
        self._write_headers_by_type(HeaderTypeConstants.CONTIG, header.contigs)
        self._file_to_write.write(self.FINAL_HEADER_LINE)
def write_full(self, file_name, value):
    logging.getLogger(NAME).info('writing to: %s', file_name)
    file_handle = FileSystems.create(
        file_name, self.mime_type, self.compression_type)
    try:
        file_handle.write(value)
    finally:
        if file_handle is not None:
            file_handle.close()
    return file_name, value
def save_pages(output_filename, ext, bytes_by_page):
    mkdirs_if_not_exists(dirname(output_filename))
    with FileSystems.create(output_filename) as f:
        with ZipFile(f, 'w', compression=ZIP_DEFLATED) as zf:
            for i, data in enumerate(bytes_by_page):
                page_filename = 'page-%s%s' % (1 + i, ext)
                get_logger().debug('page_filename: %s', page_filename)
                zf.writestr(page_filename, data)
    return output_filename
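# Hypothetical usage: pack three single-byte "pages" into one zip archive on
# any Beam-supported filesystem.
save_pages('/tmp/out/pages.zip', '.png', [b'a', b'b', b'c'])
# -> /tmp/out/pages.zip containing page-1.png, page-2.png, page-3.png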
def test_should_get_error_line_using_compressed_beam_fs(self, temp_dir: Path):
    xml_file = temp_dir.joinpath('test.xml.gz')
    # The payload is intentionally malformed XML, so parsing fails on the
    # second line.
    with FileSystems.create(str(xml_file)) as fp:
        fp.write(b'<xml>\n/xml>')
    try:
        parse_xml_or_get_error_line(
            lambda: FileSystems.open(str(xml_file)))
        assert False
    except XMLSyntaxErrorWithErrorLine as e:
        assert e.error_line == b'/xml>'
def write_orphaned_file(temp_dir, writer_key):
    # Note: 'dir' here is not the builtin; in the original source this
    # function is nested inside a test that binds 'dir' to the base output
    # directory in the enclosing scope.
    temp_dir_path = FileSystems.join(dir, temp_dir)
    file_prefix_dir = FileSystems.join(
        temp_dir_path, str(abs(hash(writer_key))))
    file_name = '%s_%s' % (file_prefix_dir, uuid.uuid4())
    with FileSystems.create(file_name) as f:
        f.write(b'Hello y\'all')
    return file_name
def un_bz2_file(source, dest_path):
    """Decompresses the source bz2 file object and writes the output to the
    file at dest_path.
    """
    # Note that we don't use bz2.BZ2File or the bunzip2 shell command, since
    # they require the input file-like object to support either tell() or
    # fileno(). Our version requires only read() and close().
    BZ2_BUFFER_SIZE = 100 * 1024 * 1024  # Decompress in chunks of 100MB.
    with FileSystems.create(
        dest_path, compression_type=CompressionTypes.UNCOMPRESSED
    ) as dest:
        decompressor = bz2.BZ2Decompressor()
        for data in iter(lambda: source.read(BZ2_BUFFER_SIZE), b''):
            dest.write(decompressor.decompress(data))
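# Hypothetical usage: stream-decompress a bz2 object held on any supported
# filesystem. The source is opened UNCOMPRESSED so that Beam does not also
# try to decompress it based on the .bz2 extension.
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystems import FileSystems

with FileSystems.open(
    'gs://my-bucket/dump.bz2',  # hypothetical path
    compression_type=CompressionTypes.UNCOMPRESSED,
) as source:
    un_bz2_file(source, 'gs://my-bucket/dump.txt')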
def create_file(self, contents=b"hello world"):
    """Creates a file on Blob (stored as a .gz with an index.sqlite index
    file) and returns its path."""
    bundle_uuid = str(random.random())
    bundle_path = f"azfs://storageclwsdev0/bundles/{bundle_uuid}/contents.gz"
    compressed_file = BytesIO(gzip.compress(contents))

    # TODO: Unify this code with code in BlobStorageUploader.write_fileobj().
    with FileSystems.create(
        bundle_path, compression_type=CompressionTypes.UNCOMPRESSED
    ) as f:
        shutil.copyfileobj(compressed_file, f)
    compressed_file.seek(0)
    with tempfile.NamedTemporaryFile(suffix=".sqlite") as tmp_index_file:
        SQLiteIndexedTar(
            fileObject=compressed_file,
            # If saving a single file as a .gz archive, this file can be
            # accessed by the "/contents" entry in the index.
            tarFileName="contents",
            writeIndex=True,
            clearIndexCache=True,
            indexFilePath=tmp_index_file.name,
        )
        with FileSystems.create(
            parse_linked_bundle_url(bundle_path).index_path,
            compression_type=CompressionTypes.UNCOMPRESSED,
        ) as out_index_file, open(tmp_index_file.name, "rb") as tif:
            shutil.copyfileobj(tif, out_index_file)
    return bundle_uuid, bundle_path
def process(self, header, vcf_version_line=None):
    # type: (VcfHeader, str) -> None
    with FileSystems.create(self._file_path) as self._file_to_write:
        if vcf_version_line:
            self._file_to_write.write(vcf_version_line.encode('utf-8'))
        self._write_headers_by_type(HeaderTypeConstants.INFO, header.infos)
        self._write_headers_by_type(HeaderTypeConstants.FILTER, header.filters)
        self._write_headers_by_type(HeaderTypeConstants.ALT, header.alts)
        self._write_headers_by_type(HeaderTypeConstants.FORMAT, header.formats)
        self._write_headers_by_type(HeaderTypeConstants.CONTIG, header.contigs)
        self._file_to_write.write(self.FINAL_HEADER_LINE)
def write_to_directory(img_and_metadata, dst_dir):
    """Write the serialized image data (png) to dst_dir/filename.

    Filename is the original filename of the image.

    Args:
        img_and_metadata: filename, randnum, and serialized image data (png)
        dst_dir: output directory

    Returns:
        [nothing] - this component serves as a sink
    """
    source = img_and_metadata[FILENAME_KEY]
    # Note: dst_dir is expected to already end with a path separator, since
    # the filename is concatenated onto it directly.
    with FileSystems.create(dst_dir + FileSystems.split(source)[1]) as f:
        f.write(img_and_metadata[IMAGE_KEY])
def open_file_write(fpath):
    fs = FileSystems.get_filesystem(fpath)
    if isinstance(fs, GCSFileSystem):
        return gcsio.GcsIO().open(fpath, mode='w')
    return FileSystems.create(fpath)
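# Hypothetical usage. GCSFileSystem comes from
# apache_beam.io.gcp.gcsfilesystem and gcsio from apache_beam.io.gcp; the
# returned handle must be closed by the caller, since this helper does not
# use a context manager.
f = open_file_write('gs://my-bucket/out.bin')
try:
    f.write(b'payload')
finally:
    f.close()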
def save_plain_file_list(file_list_path, file_list):
    with FileSystems.create(file_list_path) as f:
        f.write('\n'.join(file_list).encode('utf-8'))
def save_file_content(output_filename, data):
    mkdirs_if_not_exists(dirname(output_filename))
    # Note: FileSystems.create transparently handles compression based on
    # the file extension.
    with FileSystems.create(output_filename) as f:
        f.write(data)
    return output_filename
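# Because compression is inferred from the extension, the same helper writes
# a gzip-compressed file when given a .gz path (the path is hypothetical):
save_file_content('/tmp/out/data.json.gz', b'{"ok": true}')
# Reading it back with FileSystems.open('/tmp/out/data.json.gz') yields the
# decompressed bytes.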
def _create_extra_file(element):
    writer = FileSystems.create(FileSystems.join(tempdir, 'extra'))
    writer.close()
    return element.path
def write_fileobj(
    self,
    source_ext: str,
    source_fileobj: IO[bytes],
    bundle_path: str,
    unpack_archive: bool,
    bundle_conn_str=None,
    index_conn_str=None,
    progress_callback=None,
):
    if unpack_archive:
        output_fileobj = zip_util.unpack_to_archive(source_ext, source_fileobj)
    else:
        output_fileobj = GzipStream(source_fileobj)

    # Write the archive file, temporarily switching to the bundle-specific
    # connection string if one was given.
    if bundle_conn_str is not None:
        conn_str = os.environ.get('AZURE_STORAGE_CONNECTION_STRING', '')
        os.environ['AZURE_STORAGE_CONNECTION_STRING'] = bundle_conn_str
    try:
        bytes_uploaded = 0
        CHUNK_SIZE = 16 * 1024
        with FileSystems.create(
            bundle_path, compression_type=CompressionTypes.UNCOMPRESSED
        ) as out:
            while True:
                to_send = output_fileobj.read(CHUNK_SIZE)
                if not to_send:
                    break
                out.write(to_send)
                bytes_uploaded += len(to_send)
                if progress_callback is not None:
                    should_resume = progress_callback(bytes_uploaded)
                    if not should_resume:
                        raise Exception('Upload aborted by client')

        with FileSystems.open(
            bundle_path, compression_type=CompressionTypes.UNCOMPRESSED
        ) as ttf, tempfile.NamedTemporaryFile(
            suffix=".sqlite"
        ) as tmp_index_file:
            SQLiteIndexedTar(
                fileObject=ttf,
                # If saving a single file as a .gz archive, this file can be
                # accessed by the "/contents" entry in the index.
                tarFileName="contents",
                writeIndex=True,
                clearIndexCache=True,
                indexFilePath=tmp_index_file.name,
            )
            if bundle_conn_str is not None:
                os.environ['AZURE_STORAGE_CONNECTION_STRING'] = index_conn_str
            with FileSystems.create(
                parse_linked_bundle_url(bundle_path).index_path,
                compression_type=CompressionTypes.UNCOMPRESSED,
            ) as out_index_file, open(tmp_index_file.name, "rb") as tif:
                while True:
                    to_send = tif.read(CHUNK_SIZE)
                    if not to_send:
                        break
                    out_index_file.write(to_send)
                    bytes_uploaded += len(to_send)
                    if progress_callback is not None:
                        should_resume = progress_callback(bytes_uploaded)
                        if not should_resume:
                            raise Exception('Upload aborted by client')
    finally:
        # Restore the original connection string; if it was previously unset,
        # remove the variable (os.environ entries cannot be set to None).
        if bundle_conn_str is not None:
            if conn_str != '':
                os.environ['AZURE_STORAGE_CONNECTION_STRING'] = conn_str
            else:
                os.environ.pop('AZURE_STORAGE_CONNECTION_STRING', None)
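# The two upload loops above share one pattern: copy a stream in fixed-size
# chunks while reporting cumulative progress. A standalone sketch of that
# pattern (names here are illustrative, not from the original codebase):
def copy_with_progress(src, dst, progress_callback=None, chunk_size=16 * 1024):
    total = 0
    while True:
        chunk = src.read(chunk_size)
        if not chunk:
            break
        dst.write(chunk)
        total += len(chunk)
        # The callback returns False to abort the transfer.
        if progress_callback is not None and not progress_callback(total):
            raise Exception('Upload aborted by client')
    return total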