예제 #1
0
  def _add_compression(stream, path, mime_type, compression_type):
    if mime_type != 'application/octet-stream':
      logging.warning('Mime types are not supported. Got non-default mime_type:'
                      ' %s', mime_type)
    if compression_type == CompressionTypes.AUTO:
      compression_type = CompressionTypes.detect_compression_type(path)
    if compression_type != CompressionTypes.UNCOMPRESSED:
      return CompressedFile(stream)

    return stream
예제 #2
0
  def _add_compression(stream, path, mime_type, compression_type):
    if mime_type != 'application/octet-stream':
      logging.warning('Mime types are not supported. Got non-default mime_type:'
                      ' %s', mime_type)
    if compression_type == CompressionTypes.AUTO:
      compression_type = CompressionTypes.detect_compression_type(path)
    if compression_type != CompressionTypes.UNCOMPRESSED:
      return CompressedFile(stream)

    return stream
예제 #3
0
 def _open_hdfs(self, path, mode, mime_type, compression_type):
   if mime_type != 'application/octet-stream':
     logging.warning('Mime types are not supported. Got non-default mime_type:'
                     ' %s', mime_type)
   if compression_type == CompressionTypes.AUTO:
     compression_type = CompressionTypes.detect_compression_type(path)
   res = self._hdfs_client.open(path, mode)
   if compression_type != CompressionTypes.UNCOMPRESSED:
     res = CompressedFile(res)
   return res
예제 #4
0
 def _open_hdfs(self, path, mode, mime_type, compression_type):
     if mime_type != 'application/octet-stream':
         logging.warning(
             'Mime types are not supported. Got non-default mime_type:'
             ' %s', mime_type)
     if compression_type == CompressionTypes.AUTO:
         compression_type = CompressionTypes.detect_compression_type(path)
     res = self._hdfs_client.open(path, mode)
     if compression_type != CompressionTypes.UNCOMPRESSED:
         res = CompressedFile(res)
     return res
예제 #5
0
    def _get_concat_source(self):
        if self._concat_source is None:
            pattern = self._pattern.get()

            single_file_sources = []
            if self._file_system is None:
                self._file_system = get_filesystem(pattern)
            match_result = self._file_system.match([pattern])[0]
            files_metadata = match_result.metadata_list

            # We create a reference for FileBasedSource that will be serialized along
            # with each _SingleFileSource. To prevent this FileBasedSource from having
            # a reference to ConcatSource (resulting in quadratic space complexity)
            # we clone it here.
            file_based_source_ref = pickler.loads(pickler.dumps(self))

            for file_metadata in files_metadata:
                file_name = file_metadata.path
                file_size = file_metadata.size_in_bytes
                if file_size == 0:
                    continue  # Ignoring empty file.

                # We determine splittability of this specific file.
                splittable = self.splittable
                if (splittable
                        and self._compression_type == CompressionTypes.AUTO):
                    compression_type = CompressionTypes.detect_compression_type(
                        file_name)
                    if compression_type != CompressionTypes.UNCOMPRESSED:
                        splittable = False

                single_file_source = _SingleFileSource(
                    file_based_source_ref,
                    file_name,
                    0,
                    file_size,
                    min_bundle_size=self._min_bundle_size,
                    splittable=splittable)
                single_file_sources.append(single_file_source)
            self._concat_source = concat_source.ConcatSource(
                single_file_sources)
        return self._concat_source
예제 #6
0
  def _get_concat_source(self):
    if self._concat_source is None:
      pattern = self._pattern.get()

      single_file_sources = []
      if self._file_system is None:
        self._file_system = get_filesystem(pattern)
      match_result = self._file_system.match([pattern])[0]
      files_metadata = match_result.metadata_list

      # We create a reference for FileBasedSource that will be serialized along
      # with each _SingleFileSource. To prevent this FileBasedSource from having
      # a reference to ConcatSource (resulting in quadratic space complexity)
      # we clone it here.
      file_based_source_ref = pickler.loads(pickler.dumps(self))

      for file_metadata in files_metadata:
        file_name = file_metadata.path
        file_size = file_metadata.size_in_bytes
        if file_size == 0:
          continue  # Ignoring empty file.

        # We determine splittability of this specific file.
        splittable = self.splittable
        if (splittable and
            self._compression_type == CompressionTypes.AUTO):
          compression_type = CompressionTypes.detect_compression_type(
              file_name)
          if compression_type != CompressionTypes.UNCOMPRESSED:
            splittable = False

        single_file_source = _SingleFileSource(
            file_based_source_ref, file_name,
            0,
            file_size,
            min_bundle_size=self._min_bundle_size,
            splittable=splittable)
        single_file_sources.append(single_file_source)
      self._concat_source = concat_source.ConcatSource(single_file_sources)
    return self._concat_source
예제 #7
0
def _determine_splittability_from_compression_type(file_path,
                                                   compression_type):
    if compression_type == CompressionTypes.AUTO:
        compression_type = CompressionTypes.detect_compression_type(file_path)

    return compression_type == CompressionTypes.UNCOMPRESSED
예제 #8
0
def _determine_splittability_from_compression_type(
    file_path, compression_type):
  if compression_type == CompressionTypes.AUTO:
    compression_type = CompressionTypes.detect_compression_type(file_path)

  return compression_type == CompressionTypes.UNCOMPRESSED