예제 #1
0
    def __init__(
            self,
            coder=coders.BytesCoder(),
            compression_type=CompressionTypes.AUTO,
            **kwargs):
        """Set up the ``ReadAllFromTFRecord`` transform.

        Args:
          coder: Coder applied to decode every record that is read.
          compression_type: How compressed input files are handled. With the
            default, CompressionTypes.AUTO, the compression is detected from
            the extension of the file path.
          **kwargs: Optional keyword arguments, forwarded unchanged to the
            parent constructor.
        """
        super(ReadAllFromTFRecord, self).__init__(**kwargs)

        # NOTE(review): the helper is spelled "tfrcordio" (no second "e")
        # here -- confirm this matches the module-level definition.
        make_source = partial(
            _create_tfrcordio_source,
            compression_type=compression_type,
            coder=coder)

        # TFRecord files are unsplittable, so neither the desired nor the
        # minimum bundle size has any effect; both are passed as zero.
        read_all_options = dict(
            splittable=False,
            compression_type=compression_type,
            desired_bundle_size=0,
            min_bundle_size=0,
            source_from_file=make_source)
        self._read_all_files = ReadAllFiles(**read_all_options)
예제 #2
0
    def __init__(
            self,
            coder=coders.BytesCoder(),
            compression_type=CompressionTypes.AUTO,
            with_filename=False):
        """Set up the ``ReadAllFromTFRecord`` transform.

        Args:
          coder: Coder applied to decode every record that is read.
          compression_type: How compressed input files are handled. With the
            default, CompressionTypes.AUTO, the compression is detected from
            the extension of the file path.
          with_filename: When True, each output is a key/value pair whose key
            is the file name and whose value is the record data. When False,
            only the data is returned.
        """
        super(ReadAllFromTFRecord, self).__init__()

        # Factory that builds a TFRecord source for a single file, with the
        # coder and compression settings already bound.
        tfrecord_source_factory = partial(
            _create_tfrecordio_source,
            compression_type=compression_type,
            coder=coder)

        # TFRecord files are unsplittable, so neither the desired nor the
        # minimum bundle size has any effect; both are passed as zero.
        self._read_all_files = ReadAllFiles(
            splittable=False,
            compression_type=compression_type,
            desired_bundle_size=0,
            min_bundle_size=0,
            source_from_file=tfrecord_source_factory,
            with_filename=with_filename)
예제 #3
0
파일: textio.py 프로젝트: nielm/beam
    def __init__(
            self,
            min_bundle_size=0,
            desired_bundle_size=DEFAULT_DESIRED_BUNDLE_SIZE,
            compression_type=CompressionTypes.AUTO,
            strip_trailing_newlines=True,
            coder=coders.StrUtf8Coder(),  # type: coders.Coder
            skip_header_lines=0,
            with_filename=False,
            delimiter=None,
            escapechar=None,
            **kwargs):
        """Set up the ``ReadAllFromText`` transform.

        Args:
          min_bundle_size: Smallest bundle size to generate when this source
            is split into bundles. See ``FileBasedSource`` for more details.
          desired_bundle_size: Preferred bundle size to generate when this
            source is split into bundles. See ``FileBasedSource`` for more
            details.
          compression_type: How compressed input files are handled. The
            typical value is ``CompressionTypes.AUTO``, in which case the
            compression is detected from the extension of the underlying
            file path.
          strip_trailing_newlines: Whether the newline character of each line
            is removed before that line is decoded.
          coder: Coder used to decode each line.
          skip_header_lines: Number of header lines to skip; the same number
            is skipped from every source file. Must be 0 or higher. Skipping
            a large number of lines might impact performance.
          with_filename: When True, each output is a key/value pair whose key
            is the file name and whose value is the actual data. When False,
            only the data is returned.
          delimiter (bytes) Optional: delimiter used to split records. Must
            not self-overlap, because self-overlapping delimiters cause
            ambiguous parsing.
          escapechar (bytes) Optional: a single byte that escapes the records
            delimiter; it can also escape itself.
          **kwargs: Additional keyword arguments, forwarded unchanged to the
            parent constructor.
        """
        super().__init__(**kwargs)

        # Keep the bundling and compression settings on the instance.
        self._desired_bundle_size = desired_bundle_size
        self._min_bundle_size = min_bundle_size
        self._compression_type = compression_type

        # Factory that builds a text source for a single file, with all of
        # the per-line reading options already bound.
        text_source_factory = partial(
            _create_text_source,
            min_bundle_size=min_bundle_size,
            compression_type=compression_type,
            strip_trailing_newlines=strip_trailing_newlines,
            coder=coder,
            skip_header_lines=skip_header_lines,
            delimiter=delimiter,
            escapechar=escapechar)

        self._read_all_files = ReadAllFiles(
            True,
            compression_type,
            desired_bundle_size,
            min_bundle_size,
            text_source_factory,
            with_filename)
예제 #4
0
파일: textio.py 프로젝트: stallyNon/beam
  def __init__(
      self,
      min_bundle_size=0,
      desired_bundle_size=DEFAULT_DESIRED_BUNDLE_SIZE,
      compression_type=CompressionTypes.AUTO,
      strip_trailing_newlines=True,
      coder=coders.StrUtf8Coder(),  # type: coders.Coder
      skip_header_lines=0,
      **kwargs):
    """Set up the ``ReadAllFromText`` transform.

    Args:
      min_bundle_size: Smallest bundle size to generate when this source is
        split into bundles. See ``FileBasedSource`` for more details.
      desired_bundle_size: Preferred bundle size to generate when this source
        is split into bundles. See ``FileBasedSource`` for more details.
      compression_type: How compressed input files are handled. The typical
        value is ``CompressionTypes.AUTO``, in which case the compression is
        detected from the extension of the underlying file path.
      strip_trailing_newlines: Whether the newline character of each line is
        removed before that line is decoded.
      coder: Coder used to decode each line.
      skip_header_lines: Number of header lines to skip; the same number is
        skipped from every source file. Must be 0 or higher. Skipping a large
        number of lines might impact performance.
      **kwargs: Additional keyword arguments, forwarded unchanged to the
        parent constructor.
    """
    super(ReadAllFromText, self).__init__(**kwargs)

    # Keep the bundling and compression settings on the instance.
    self._desired_bundle_size = desired_bundle_size
    self._min_bundle_size = min_bundle_size
    self._compression_type = compression_type

    # Factory that builds a text source for a single file, with all of the
    # per-line reading options already bound.
    text_source_factory = partial(
        _create_text_source,
        min_bundle_size=min_bundle_size,
        compression_type=compression_type,
        strip_trailing_newlines=strip_trailing_newlines,
        coder=coder,
        skip_header_lines=skip_header_lines)

    self._read_all_files = ReadAllFiles(
        True, compression_type, desired_bundle_size, min_bundle_size,
        text_source_factory)