def write_processed_image(img_and_metadata, output_dir):
  """Encode the image as a png and write to google storage.

  Creates a function that will read processed images and save them to png
  files in directory output_dir.

  The output image filename is given by a unique index + '_' + randnum to
  the 3rd decimal place + '_' + label + '.png'. The index is also prefix-filled
  with zeros such that the index is at least length 6. This allows us to
  maintain consistent numeric and lexigraphical orderings of filenames by
  the index field up to 1 million images.

  Args:
    img_and_metadata: image, index, randnum, and label
    output_dir: output image directory
    cloud: whether to run/save images on cloud.

  Returns:
    [nothing] - just writes to file destination
  """

  # Construct image filename
  img_filename = (str(img_and_metadata[INDEX_KEY]).zfill(6) +
                  '_' +
                  '{0:.3f}'.format(img_and_metadata[RAND_KEY]).split('.')[1] +
                  '_' + str(img_and_metadata[LABEL_KEY]) +
                  '.png')

  # Encode image to png
  png_image = cv2.imencode('.png', img_and_metadata[IMAGE_KEY])[1].tobytes()

  # Use beam.io.filesystems package to create local or gs file, and write image
  with FileSystems.create(output_dir + '/' + img_filename) as f:
    f.write(png_image)
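
For instance, with hypothetical metadata values (index=42, randnum=0.123, label=7), the filename scheme above yields:

  >>> str(42).zfill(6) + '_' + '{0:.3f}'.format(0.123).split('.')[1] + '_' + str(7) + '.png'
  '000042_123_7.png'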
Example #2
  def open(self, temp_path):
    """Opens ``temp_path``, returning an opaque file handle object.

    The returned file handle is passed to ``write_[encoded_]record`` and
    ``close``.
    """
    return FileSystems.create(temp_path, self.mime_type, self.compression_type)
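
This open() comes from Beam's FileBasedSink; the handle it returns is later passed to write_encoded_record and close. A minimal sketch of a subclass that reuses this open() and only defines how each record is written (the class name and newline framing are assumptions, not from the original):

from apache_beam.coders import coders
from apache_beam.io import filebasedsink

class SimpleTextSink(filebasedsink.FileBasedSink):
    """Sketch sink: writes one encoded record per line."""

    def __init__(self, file_path_prefix):
        super().__init__(file_path_prefix,
                         coder=coders.ToBytesCoder(),
                         file_name_suffix='.txt')

    def write_encoded_record(self, file_handle, encoded_value):
        # file_handle is the object returned by open() above.
        file_handle.write(encoded_value + b'\n')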
Example #3
 def create_txt_file(self, contents=b"hello world"):
     """Creates a txt file and returns its path."""
     bundle_uuid = str(random.random())
     bundle_path = f"azfs://storageclwsdev0/bundles/{bundle_uuid}/test.txt"
     with FileSystems.create(bundle_path, compression_type=CompressionTypes.UNCOMPRESSED) as f:
         f.write(contents)
     return bundle_uuid, bundle_path
Example #4
    def create_directory(self):
        """Creates a directory (stored as a .tar.gz with an index.sqlite index file) and returns its path."""
        bundle_uuid = str(random.random())
        bundle_path = f"azfs://storageclwsdev0/bundles/{bundle_uuid}/contents.tar.gz"

        def writestr(tf, name, contents):
            tinfo = tarfile.TarInfo(name)
            tinfo.size = len(contents)
            tf.addfile(tinfo, BytesIO(contents.encode()))

        def writedir(tf, name):
            tinfo = tarfile.TarInfo(name)
            tinfo.type = tarfile.DIRTYPE
            tf.addfile(tinfo, BytesIO())

        # TODO: Unify this code with code in UploadManager.upload_to_bundle_store().
        with FileSystems.create(
                bundle_path, compression_type=CompressionTypes.UNCOMPRESSED
        ) as out, tempfile.NamedTemporaryFile(
                suffix=".tar.gz") as tmp_tar_file, tempfile.NamedTemporaryFile(
                    suffix=".sqlite") as tmp_index_file:
            with tarfile.open(name=tmp_tar_file.name, mode="w:gz") as tf:
                # We need to create separate entries for each directory, as a regular
                # .tar.gz file would have.
                writestr(tf, "./README.md", "hello world")
                writedir(tf, "./src")
                writestr(tf, "./src/test.sh", "echo hi")
                writedir(tf, "./dist")
                writedir(tf, "./dist/a")
                writedir(tf, "./dist/a/b")
                writestr(tf, "./dist/a/b/test2.sh", "echo two")
            shutil.copyfileobj(tmp_tar_file, out)
            with open(tmp_tar_file.name, "rb") as ttf:
                SQLiteIndexedTar(
                    fileObject=ttf,
                    tarFileName="contents",
                    writeIndex=True,
                    clearIndexCache=True,
                    indexFilePath=tmp_index_file.name,
                )
            with FileSystems.create(
                    parse_linked_bundle_url(bundle_path).index_path,
                    compression_type=CompressionTypes.UNCOMPRESSED,
            ) as out_index_file, open(tmp_index_file.name, "rb") as tif:
                shutil.copyfileobj(tif, out_index_file)

        return bundle_uuid, bundle_path
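
A hypothetical usage sketch (test-class context assumed; parse_linked_bundle_url is the helper already used above):

        bundle_uuid, bundle_path = self.create_directory()
        # The SQLite index written above lives next to the archive:
        index_path = parse_linked_bundle_url(bundle_path).index_path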
Example #5
 def _get_file_for_element(self, element):
     key_name = element[0]
     if key_name in self.open_files:
         return self.open_files[key_name], None
     file_path = self.get_path_for_key_name(key_name)
     file_handle = FileSystems.create(file_path, mime_type='text/json')
     self.open_files[key_name] = file_handle
     return file_handle, file_path
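
Because handles are cached across elements, something must eventually close them. A minimal cleanup sketch (the method name is an assumption), e.g. called from the DoFn's finish_bundle:

 def _close_open_files(self):
     # Flush and close every cached handle, then reset the cache.
     for file_handle in self.open_files.values():
         file_handle.close()
     self.open_files = {}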
Example #6
 def process(self, header):
   # type: (VcfHeader) -> None
   with FileSystems.create(self._file_path) as self._file_to_write:
     self._write_headers_by_type(HeaderTypeConstants.INFO, header.infos)
     self._write_headers_by_type(HeaderTypeConstants.FILTER, header.filters)
     self._write_headers_by_type(HeaderTypeConstants.ALT, header.alts)
     self._write_headers_by_type(HeaderTypeConstants.FORMAT, header.formats)
     self._write_headers_by_type(HeaderTypeConstants.CONTIG, header.contigs)
     self._file_to_write.write(self.FINAL_HEADER_LINE)
Example #7
 def write_full(self, file_name, value):
   logging.getLogger(NAME).info('writing to: %s', file_name)
   file_handle = FileSystems.create(file_name, self.mime_type, self.compression_type)
   try:
     file_handle.write(value)
   finally:
     if file_handle is not None:
       file_handle.close()
   return file_name, value
Example #8
def save_pages(output_filename, ext, bytes_by_page):
    mkdirs_if_not_exists(dirname(output_filename))
    with FileSystems.create(output_filename) as f:
        with ZipFile(f, 'w', compression=ZIP_DEFLATED) as zf:
            for i, data in enumerate(bytes_by_page):
                page_filename = 'page-%s%s' % (1 + i, ext)
                get_logger().debug('page_filename: %s', page_filename)
                zf.writestr(page_filename, data)
        return output_filename
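
Hypothetical usage (path and payloads are made up): three pages become page-1.png, page-2.png, and page-3.png inside a single zip:

save_pages('output/pages.zip', '.png', [b'data1', b'data2', b'data3'])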
Example #9
 def test_should_get_error_line_using_compressed_beam_fs(
         self, temp_dir: Path):
     xml_file = temp_dir.joinpath('test.xml.gz')
     with FileSystems.create(str(xml_file)) as fp:
          # Deliberately malformed XML: the closing tag is missing its '<'.
          fp.write(b'<xml>\n/xml>')
     try:
         parse_xml_or_get_error_line(
             lambda: FileSystems.open(str(xml_file)))
         assert False
     except XMLSyntaxErrorWithErrorLine as e:
         assert e.error_line == b'/xml>'
Example #10
        def write_orphaned_file(temp_dir, writer_key):
            temp_dir_path = FileSystems.join(dir, temp_dir)

            file_prefix_dir = FileSystems.join(temp_dir_path,
                                               str(abs(hash(writer_key))))

            file_name = '%s_%s' % (file_prefix_dir, uuid.uuid4())
            with FileSystems.create(file_name) as f:
                f.write(b'Hello y\'all')

            return file_name
Example #11
def un_bz2_file(source, dest_path):
    """
    Unzips the source bz2 file object and writes the output to the file at
    dest_path
    """
    # Note, that we don't use bz2.BZ2File or the bunzip2 shell command since
    # they require the input file-like object to support either tell() or
    # fileno(). Our version requires only read() and close().

    BZ2_BUFFER_SIZE = 100 * 1024 * 1024  # Unzip in chunks of 100MB
    with FileSystems.create(dest_path, compression_type=CompressionTypes.UNCOMPRESSED) as dest:
        decompressor = bz2.BZ2Decompressor()
        for data in iter(lambda: source.read(BZ2_BUFFER_SIZE), b''):
            dest.write(decompressor.decompress(data))
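
A hypothetical usage sketch, pairing it with FileSystems.open so the source only needs to support read() and close() (the paths are made up):

source = FileSystems.open('gs://my-bucket/data.json.bz2',
                          compression_type=CompressionTypes.UNCOMPRESSED)
try:
    un_bz2_file(source, 'gs://my-bucket/data.json')
finally:
    source.close()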
Example #12
 def create_file(self, contents=b"hello world"):
     """Creates a file on Blob (stored as a .gz with an index.sqlite index file) and returns its path."""
     bundle_uuid = str(random.random())
     bundle_path = f"azfs://storageclwsdev0/bundles/{bundle_uuid}/contents.gz"
     compressed_file = BytesIO(gzip.compress(contents))
     # TODO: Unify this code with code in BlobStorageUploader.write_fileobj().
     with FileSystems.create(bundle_path, compression_type=CompressionTypes.UNCOMPRESSED) as f:
         shutil.copyfileobj(compressed_file, f)
     compressed_file.seek(0)
     with tempfile.NamedTemporaryFile(suffix=".sqlite") as tmp_index_file:
         SQLiteIndexedTar(
             fileObject=compressed_file,
             tarFileName="contents",  # If saving a single file as a .gz archive, this file can be accessed by the "/contents" entry in the index.
             writeIndex=True,
             clearIndexCache=True,
             indexFilePath=tmp_index_file.name,
         )
         with FileSystems.create(
             parse_linked_bundle_url(bundle_path).index_path,
             compression_type=CompressionTypes.UNCOMPRESSED,
         ) as out_index_file, open(tmp_index_file.name, "rb") as tif:
             shutil.copyfileobj(tif, out_index_file)
     return bundle_uuid, bundle_path
Example #13
 def process(self, header, vcf_version_line=None):
     # type: (VcfHeader, str) -> None
     with FileSystems.create(self._file_path) as self._file_to_write:
         if vcf_version_line:
             self._file_to_write.write(vcf_version_line.encode('utf-8'))
         self._write_headers_by_type(HeaderTypeConstants.INFO, header.infos)
         self._write_headers_by_type(HeaderTypeConstants.FILTER,
                                     header.filters)
         self._write_headers_by_type(HeaderTypeConstants.ALT, header.alts)
         self._write_headers_by_type(HeaderTypeConstants.FORMAT,
                                     header.formats)
         self._write_headers_by_type(HeaderTypeConstants.CONTIG,
                                     header.contigs)
         self._file_to_write.write(self.FINAL_HEADER_LINE)
Example #14
def write_to_directory(img_and_metadata, dst_dir):
    """Write the serialized image data (png) to dst_dir/filename.

  Filename is the original filename of the image.

  Args:
    img_and_metadata: filename, randnum, and serialized image data (png)
    dst_dir: output directory

  Returns:
    [nothing] - this component serves as a sink
  """
    source = img_and_metadata[FILENAME_KEY]
    with FileSystems.create(dst_dir + FileSystems.split(source)[1]) as f:
        f.write(img_and_metadata[IMAGE_KEY])
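
Hypothetical pipeline usage (assumes import apache_beam as beam and a PCollection named images holding the metadata dicts):

_ = (images
     | 'WriteImages' >> beam.Map(write_to_directory, dst_dir='gs://my-bucket/out/'))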
Example #15
def open_file_write(fpath):
    fs = FileSystems.get_filesystem(fpath)
    if type(fs) == GCSFileSystem:
        return gcsio.GcsIO().open(fpath, mode='w')
    else:
        return FileSystems.create(fpath)
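
Hypothetical usage (the path is made up); the helper returns a writable handle for gs:// and non-GCS paths alike:

f = open_file_write('gs://my-bucket/report.txt')
try:
    f.write(b'hello')
finally:
    f.close()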
Example #16
def save_plain_file_list(file_list_path, file_list):
    with FileSystems.create(file_list_path) as f:
        f.write('\n'.join(file_list).encode('utf-8'))
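
Hypothetical usage (path and names are made up), writing one name per line in UTF-8:

save_plain_file_list('output/files.lst', ['a.txt', 'b.txt'])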
Example #17
def save_file_content(output_filename, data):
    mkdirs_if_not_exists(dirname(output_filename))
    # Note: FileSystems.create transparently handles compression based on the file extension
    with FileSystems.create(output_filename) as f:
        f.write(data)
    return output_filename
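
For instance, because FileSystems.create defaults to CompressionTypes.AUTO, writing to a path ending in .gz (hypothetical here) gzips the payload transparently:

save_file_content('output/data.json.gz', b'{"hello": "world"}')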
Example #18
File: fileio_test.py Project: mahak/beam
 def _create_extra_file(element):
     writer = FileSystems.create(FileSystems.join(tempdir, 'extra'))
     writer.close()
     return element.path
Example #19
    def write_fileobj(
        self,
        source_ext: str,
        source_fileobj: IO[bytes],
        bundle_path: str,
        unpack_archive: bool,
        bundle_conn_str=None,
        index_conn_str=None,
        progress_callback=None,
    ):
        if unpack_archive:
            output_fileobj = zip_util.unpack_to_archive(
                source_ext, source_fileobj)
        else:
            output_fileobj = GzipStream(source_fileobj)

        # Write archive file.
        if bundle_conn_str is not None:
            conn_str = os.environ.get('AZURE_STORAGE_CONNECTION_STRING', '')
            os.environ['AZURE_STORAGE_CONNECTION_STRING'] = bundle_conn_str
        try:
            bytes_uploaded = 0
            CHUNK_SIZE = 16 * 1024
            with FileSystems.create(
                    bundle_path,
                    compression_type=CompressionTypes.UNCOMPRESSED) as out:
                while True:
                    to_send = output_fileobj.read(CHUNK_SIZE)
                    if not to_send:
                        break
                    out.write(to_send)
                    bytes_uploaded += len(to_send)
                    if progress_callback is not None:
                        should_resume = progress_callback(bytes_uploaded)
                        if not should_resume:
                            raise Exception('Upload aborted by client')

            with FileSystems.open(
                    bundle_path, compression_type=CompressionTypes.UNCOMPRESSED
            ) as ttf, tempfile.NamedTemporaryFile(
                    suffix=".sqlite") as tmp_index_file:
                SQLiteIndexedTar(
                    fileObject=ttf,
                    # If saving a single file as a .gz archive, this file can
                    # be accessed by the "/contents" entry in the index.
                    tarFileName="contents",
                    writeIndex=True,
                    clearIndexCache=True,
                    indexFilePath=tmp_index_file.name,
                )
                if bundle_conn_str is not None:
                    os.environ[
                        'AZURE_STORAGE_CONNECTION_STRING'] = index_conn_str
                with FileSystems.create(
                        parse_linked_bundle_url(bundle_path).index_path,
                        compression_type=CompressionTypes.UNCOMPRESSED,
                ) as out_index_file, open(tmp_index_file.name, "rb") as tif:
                    while True:
                        to_send = tif.read(CHUNK_SIZE)
                        if not to_send:
                            break
                        out_index_file.write(to_send)
                        bytes_uploaded += len(to_send)
                        if progress_callback is not None:
                            should_resume = progress_callback(bytes_uploaded)
                            if not should_resume:
                                raise Exception('Upload aborted by client')
        finally:  # restore the original connection string
            if bundle_conn_str is not None:
                if conn_str:
                    os.environ['AZURE_STORAGE_CONNECTION_STRING'] = conn_str
                else:
                    # os.environ values must be strings, so drop the key
                    # instead of assigning None.
                    os.environ.pop('AZURE_STORAGE_CONNECTION_STRING', None)
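
A hypothetical usage sketch; the uploader instance, source file, destination path, and callback are all assumptions:

def on_progress(bytes_uploaded):
    print('%d bytes uploaded' % bytes_uploaded)
    return True  # returning False aborts the upload

with open('bundle.tar.gz', 'rb') as src:
    uploader.write_fileobj('.tar.gz', src,
                           'azfs://storageclwsdev0/bundles/<uuid>/contents.gz',
                           unpack_archive=True,
                           progress_callback=on_progress)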