Example #1
    def _check_state_for_finalize_write(self, writer_results, num_shards):
        """Checks writer output files' states.

    Returns:
      src_files, dst_files: Lists of files to rename. For each i, finalize_write
        should rename(src_files[i], dst_files[i]).
      delete_files: Src files to delete. These could be leftovers from an
        incomplete (non-atomic) rename operation.
      num_skipped: Tally of writer results files already renamed, such as from
        a previous run of finalize_write().
    """
        if not writer_results:
            return [], [], [], 0

        src_glob = FileSystems.join(
            FileSystems.split(writer_results[0])[0], '*')
        dst_glob = self._get_final_name_glob(num_shards)
        src_glob_files = set(file_metadata.path
                             for mr in FileSystems.match([src_glob])
                             for file_metadata in mr.metadata_list)
        dst_glob_files = set(file_metadata.path
                             for mr in FileSystems.match([dst_glob])
                             for file_metadata in mr.metadata_list)

        src_files = []
        dst_files = []
        delete_files = []
        num_skipped = 0
        for shard_num, src in enumerate(writer_results):
            final_name = self._get_final_name(shard_num, num_shards)
            dst = final_name
            src_exists = src in src_glob_files
            dst_exists = dst in dst_glob_files
            if not src_exists and not dst_exists:
                raise BeamIOError(
                    'src and dst files do not exist. src: %s, dst: %s' %
                    (src, dst))
            if not src_exists and dst_exists:
                logging.debug('src: %s -> dst: %s already renamed, skipping',
                              src, dst)
                num_skipped += 1
                continue
            if (src_exists and dst_exists and
                    FileSystems.checksum(src) == FileSystems.checksum(dst)):
                logging.debug('src: %s == dst: %s, deleting src', src, dst)
                delete_files.append(src)
                continue

            src_files.append(src)
            dst_files.append(dst)
        return src_files, dst_files, delete_files, num_skipped
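All four snippets on this page appear to come from Apache Beam's FileBasedSink (apache_beam.io.filebasedsink), so they assume the surrounding class plus FileSystems, BeamIOError, logging, time, and util from that codebase. The four-tuple returned above is meant to be consumed by finalize_write; a minimal sketch of that call pattern, assuming this class context (the helper and attribute names come from these examples, the rest is illustrative):

    def finalize_write(self, init_result, writer_results,
                       unused_pre_finalize_results):
        writer_results = sorted(writer_results)
        num_shards = len(writer_results)
        # Unpack the lists documented in the docstring above.
        src_files, dst_files, delete_files, num_skipped = (
            self._check_state_for_finalize_write(writer_results, num_shards))
        # Leftovers from interrupted renames count as skipped and are removed.
        num_skipped += len(delete_files)
        FileSystems.delete(delete_files)
        # ... then rename src_files[i] -> dst_files[i] as in Examples #3 and #4.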
Example #2
  def _check_state_for_finalize_write(self, writer_results, num_shards):
    """Checks writer output files' states.

    Returns:
      src_files, dst_files: Lists of files to rename. For each i, finalize_write
        should rename(src_files[i], dst_files[i]).
      delete_files: Src files to delete. These could be leftovers from an
        incomplete (non-atomic) rename operation.
      num_skipped: Tally of writer results files already renamed, such as from
        a previous run of finalize_write().
    """
    if not writer_results:
      return [], [], [], 0

    src_glob = FileSystems.join(FileSystems.split(writer_results[0])[0], '*')
    dst_glob = self._get_final_name_glob(num_shards)
    src_glob_files = set(file_metadata.path
                         for mr in FileSystems.match([src_glob])
                         for file_metadata in mr.metadata_list)
    dst_glob_files = set(file_metadata.path
                         for mr in FileSystems.match([dst_glob])
                         for file_metadata in mr.metadata_list)

    src_files = []
    dst_files = []
    delete_files = []
    num_skipped = 0
    for shard_num, src in enumerate(writer_results):
      final_name = self._get_final_name(shard_num, num_shards)
      dst = final_name
      src_exists = src in src_glob_files
      dst_exists = dst in dst_glob_files
      if not src_exists and not dst_exists:
        raise BeamIOError('src and dst files do not exist. src: %s, dst: %s' % (
            src, dst))
      if not src_exists and dst_exists:
        logging.debug('src: %s -> dst: %s already renamed, skipping', src, dst)
        num_skipped += 1
        continue
      if (src_exists and dst_exists and
          FileSystems.checksum(src) == FileSystems.checksum(dst)):
        logging.debug('src: %s == dst: %s, deleting src', src, dst)
        delete_files.append(src)
        continue

      src_files.append(src)
      dst_files.append(dst)
    return src_files, dst_files, delete_files, num_skipped
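Examples #1 and #2 are the same method, differing only in formatting. The core of both is a small decision table over (src exists, dst exists, checksums match). A self-contained sketch of just that table, using plain sets in place of FileSystems (every name below is hypothetical):

def classify_shard(src, dst, src_set, dst_set, same_checksum):
    """Returns 'error', 'skip', 'delete_src', or 'rename' for one shard."""
    src_exists = src in src_set
    dst_exists = dst in dst_set
    if not src_exists and not dst_exists:
        return 'error'       # neither file exists: the shard is lost
    if not src_exists and dst_exists:
        return 'skip'        # already renamed, e.g. by a previous run
    if src_exists and dst_exists and same_checksum:
        return 'delete_src'  # rename finished but the source was left behind
    return 'rename'          # normal case, or dst exists with a stale checksum

assert classify_shard('tmp/s0', 'out-0', {'tmp/s0'}, set(), False) == 'rename'
assert classify_shard('tmp/s1', 'out-1', set(), {'out-1'}, False) == 'skip'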
Example #3
  def finalize_write(self, init_result, writer_results,
                     unused_pre_finalize_results):
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)

    src_files = []
    dst_files = []
    delete_files = []
    chunk_size = FileSystems.get_chunk_size(self.file_path_prefix.get())
    num_skipped = 0
    for shard_num, shard in enumerate(writer_results):
      final_name = self._get_final_name(shard_num, num_shards)
      src = shard
      dst = final_name
      src_exists = FileSystems.exists(src)
      dst_exists = FileSystems.exists(dst)
      if not src_exists and not dst_exists:
        raise BeamIOError('src and dst files do not exist. src: %s, dst: %s' % (
            src, dst))
      if not src_exists and dst_exists:
        logging.debug('src: %s -> dst: %s already renamed, skipping', src, dst)
        num_skipped += 1
        continue
      if (src_exists and dst_exists and
          FileSystems.checksum(src) == FileSystems.checksum(dst)):
        logging.debug('src: %s == dst: %s, deleting src', src, dst)
        delete_files.append(src)
        continue

      src_files.append(src)
      dst_files.append(dst)

    num_skipped += len(delete_files)
    FileSystems.delete(delete_files)
    num_shards_to_finalize = len(src_files)
    min_threads = min(num_shards_to_finalize, FileBasedSink._MAX_RENAME_THREADS)
    num_threads = max(1, min_threads)

    source_file_batch = [src_files[i:i + chunk_size]
                         for i in range(0, len(src_files), chunk_size)]
    destination_file_batch = [dst_files[i:i + chunk_size]
                              for i in range(0, len(dst_files), chunk_size)]

    if num_shards_to_finalize:
      logging.info(
          'Starting finalize_write threads with num_shards: %d (skipped: %d), '
          'batches: %d, num_threads: %d',
          num_shards_to_finalize, num_skipped, len(source_file_batch),
          num_threads)
      start_time = time.time()

      # Use a thread pool for renaming operations.
      def _rename_batch(batch):
        """_rename_batch executes batch rename operations."""
        source_files, destination_files = batch
        exceptions = []
        try:
          FileSystems.rename(source_files, destination_files)
          return exceptions
        except BeamIOError as exp:
          if exp.exception_details is None:
            raise
          for (src, dst), exception in exp.exception_details.items():
            if exception:
              logging.error(('Exception in _rename_batch. src: %s, '
                             'dst: %s, err: %s'), src, dst, exception)
              exceptions.append(exception)
            else:
              logging.debug('Rename successful: %s -> %s', src, dst)
          return exceptions

      exception_batches = util.run_using_threadpool(
          _rename_batch, zip(source_file_batch, destination_file_batch),
          num_threads)

      all_exceptions = [e for exception_batch in exception_batches
                        for e in exception_batch]
      if all_exceptions:
        raise Exception(
            'Encountered exceptions in finalize_write: %s' % all_exceptions)

      for final_name in dst_files:
        yield final_name

      logging.info('Renamed %d shards in %.2f seconds.', num_shards_to_finalize,
                   time.time() - start_time)
    else:
      logging.warning(
          'No shards found to finalize. num_shards: %d, skipped: %d',
          num_shards, num_skipped)

    try:
      FileSystems.delete([init_result])
    except IOError:
      # May have already been removed.
      pass
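finalize_write fans the renames out in fixed-size batches over a thread pool via util.run_using_threadpool. A rough standalone equivalent using only the standard library; rename_many is a hypothetical stand-in for a bulk rename such as FileSystems.rename:

from concurrent.futures import ThreadPoolExecutor

def rename_in_batches(src_files, dst_files, rename_many,
                      chunk_size=100, max_threads=64):
    # Pair up batches the same way as source_file_batch/destination_file_batch.
    batches = [(src_files[i:i + chunk_size], dst_files[i:i + chunk_size])
               for i in range(0, len(src_files), chunk_size)]
    num_threads = max(1, min(len(batches), max_threads))
    failures = []
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        # Each worker renames one batch; exceptions are collected, not raised.
        for batch_failures in pool.map(
                lambda b: _rename_one_batch(b, rename_many), batches):
            failures.extend(batch_failures)
    if failures:
        raise RuntimeError('Encountered exceptions in rename: %s' % failures)

def _rename_one_batch(batch, rename_many):
    srcs, dsts = batch
    try:
        rename_many(srcs, dsts)
        return []
    except Exception as exc:  # collect rather than abort the other batches
        return [exc]

Collecting exceptions per batch instead of raising immediately lets the remaining batches finish, mirroring the _rename_batch helper above.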
Example #4
    def finalize_write(self, init_result, writer_results,
                       unused_pre_finalize_results):
        writer_results = sorted(writer_results)
        num_shards = len(writer_results)

        src_files = []
        dst_files = []
        delete_files = []
        chunk_size = FileSystems.get_chunk_size(self.file_path_prefix.get())
        num_skipped = 0
        for shard_num, shard in enumerate(writer_results):
            final_name = self._get_final_name(shard_num, num_shards)
            src = shard
            dst = final_name
            src_exists = FileSystems.exists(src)
            dst_exists = FileSystems.exists(dst)
            if not src_exists and not dst_exists:
                raise BeamIOError(
                    'src and dst files do not exist. src: %s, dst: %s' %
                    (src, dst))
            if not src_exists and dst_exists:
                logging.debug('src: %s -> dst: %s already renamed, skipping',
                              src, dst)
                num_skipped += 1
                continue
            if (src_exists and dst_exists and
                    FileSystems.checksum(src) == FileSystems.checksum(dst)):
                logging.debug('src: %s == dst: %s, deleting src', src, dst)
                delete_files.append(src)
                continue

            src_files.append(src)
            dst_files.append(dst)

        num_skipped += len(delete_files)
        FileSystems.delete(delete_files)
        num_shards_to_finalize = len(src_files)
        min_threads = min(num_shards_to_finalize,
                          FileBasedSink._MAX_RENAME_THREADS)
        num_threads = max(1, min_threads)

        source_file_batch = [
            src_files[i:i + chunk_size]
            for i in range(0, len(src_files), chunk_size)
        ]
        destination_file_batch = [
            dst_files[i:i + chunk_size]
            for i in range(0, len(dst_files), chunk_size)
        ]

        if num_shards_to_finalize:
            logging.info(
                'Starting finalize_write threads with num_shards: %d (skipped: %d), '
                'batches: %d, num_threads: %d', num_shards_to_finalize,
                num_skipped, len(source_file_batch), num_threads)
            start_time = time.time()

            # Use a thread pool for renaming operations.
            def _rename_batch(batch):
                """_rename_batch executes batch rename operations."""
                source_files, destination_files = batch
                exceptions = []
                try:
                    FileSystems.rename(source_files, destination_files)
                    return exceptions
                except BeamIOError as exp:
                    if exp.exception_details is None:
                        raise
                    for (src, dst), exception in exp.exception_details.items():
                        if exception:
                            logging.error(
                                ('Exception in _rename_batch. src: %s, '
                                 'dst: %s, err: %s'), src, dst, exception)
                            exceptions.append(exception)
                        else:
                            logging.debug('Rename successful: %s -> %s', src,
                                          dst)
                    return exceptions

            exception_batches = util.run_using_threadpool(
                _rename_batch, zip(source_file_batch, destination_file_batch),
                num_threads)

            all_exceptions = [
                e for exception_batch in exception_batches
                for e in exception_batch
            ]
            if all_exceptions:
                raise Exception(
                    'Encountered exceptions in finalize_write: %s' %
                    all_exceptions)

            for final_name in dst_files:
                yield final_name

            logging.info('Renamed %d shards in %.2f seconds.',
                         num_shards_to_finalize,
                         time.time() - start_time)
        else:
            logging.warning(
                'No shards found to finalize. num_shards: %d, skipped: %d',
                num_shards, num_skipped)

        try:
            FileSystems.delete([init_result])
        except IOError:
            # May have already been removed.
            pass
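The except branch in _rename_batch depends on BeamIOError carrying per-file results in exception_details, a mapping of (src, dst) to an exception or None, where None marks a successful rename. A small sketch of that split, with a toy details dict for illustration; note items() rather than the Python 2 iteritems():

import logging

def collect_rename_failures(exception_details):
    """Splits a {(src, dst): exception-or-None} map into a failure list."""
    failures = []
    for (src, dst), exception in exception_details.items():
        if exception:
            logging.error('Rename failed. src: %s, dst: %s, err: %s',
                          src, dst, exception)
            failures.append(exception)
        else:
            logging.debug('Rename successful: %s -> %s', src, dst)
    return failures

# Example: one successful rename, one failure.
details = {('tmp/s0', 'out-0'): None,
           ('tmp/s1', 'out-1'): IOError('permission denied')}
assert len(collect_rename_failures(details)) == 1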