def finalize_write(self, init_result, writer_results, pre_finalize_result):
        file_path_prefix = self.file_path_prefix.get()

        shard_paths = it.chain.from_iterable(writer_results)
        path_pairs = list(self._source_dest_shard_pairs(shard_paths))
        # pp is assumed to be the path module for the target filesystem
        # (e.g. posixpath); this collects each destination's directory.
        unique_dest_dirs = {pp.split(pair[1])[0] for pair in path_pairs}

        num_shards = len(path_pairs)
        min_threads = min(num_shards, FileBasedSink._MAX_RENAME_THREADS)
        num_threads = max(1, min_threads)

        batch_size = FileSystems.get_chunk_size(file_path_prefix)
        batches = [
            path_pairs[i:i + batch_size]
            for i in six.moves.range(0, len(path_pairs), batch_size)
        ]

        logging.info(
            'Starting finalize_write threads with num_shards: %d, '
            'batches: %d, num_threads: %d', num_shards, len(batches),
            num_threads)
        start_time = time.time()

        if unique_dest_dirs:
            # Fix #18: run_using_threadpool raises if passed an empty list of
            # inputs, so skip this step entirely when there is no work to do.
            util.run_using_threadpool(self._create_output_dir,
                                      unique_dest_dirs, num_threads)

            exception_batches = util.run_using_threadpool(
                self._rename_batch, batches, num_threads)

            all_exceptions = [
                e for exception_batch in exception_batches
                for e in exception_batch
            ]

            if all_exceptions:
                raise Exception('Encountered exceptions in finalize_write: %s'
                                % all_exceptions)

        for _, final_name in path_pairs:
            yield final_name

        logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                     time.time() - start_time)

        try:
            FileSystems.delete([init_result])
        except IOError:
            # May have already been removed.
            pass
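
All five examples on this page share one skeleton: split the (source,
destination) pairs into filesystem-sized batches, then rename each batch on a
bounded thread pool and collect per-batch exceptions. Below is a minimal,
self-contained sketch of that pattern, using concurrent.futures as a stand-in
for Beam's util.run_using_threadpool; the chunk size, thread cap, and
rename_batch_fn are placeholder assumptions, not Beam's API.

from concurrent.futures import ThreadPoolExecutor

CHUNK_SIZE = 100          # stand-in for FileSystems.get_chunk_size(...)
MAX_RENAME_THREADS = 64   # stand-in for FileBasedSink._MAX_RENAME_THREADS

def rename_in_batches(path_pairs, rename_batch_fn):
    """Rename (src, dst) pairs in CHUNK_SIZE batches on a thread pool.

    rename_batch_fn takes one batch (a list of (src, dst) pairs) and
    returns a list of exceptions, empty on success, mirroring the
    _rename_batch helpers in the examples.
    """
    batches = [path_pairs[i:i + CHUNK_SIZE]
               for i in range(0, len(path_pairs), CHUNK_SIZE)]
    if not batches:
        return  # nothing to rename; avoid spinning up an idle pool
    num_threads = max(1, min(len(path_pairs), MAX_RENAME_THREADS))
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        exception_batches = list(pool.map(rename_batch_fn, batches))
    all_exceptions = [e for batch in exception_batches for e in batch]
    if all_exceptions:
        raise Exception('Encountered exceptions in finalize_write: %s'
                        % all_exceptions)
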
Example 2
    def finalize_write(self, init_result, writer_results,
                       unused_pre_finalize_results):
        writer_results = sorted(writer_results)
        num_shards = len(writer_results)

        src_files, dst_files, delete_files, num_skipped = (
            self._check_state_for_finalize_write(writer_results, num_shards))
        num_skipped += len(delete_files)
        FileSystems.delete(delete_files)
        num_shards_to_finalize = len(src_files)
        min_threads = min(num_shards_to_finalize,
                          FileBasedSink._MAX_RENAME_THREADS)
        num_threads = max(1, min_threads)

        chunk_size = FileSystems.get_chunk_size(self.file_path_prefix.get())
        source_file_batch = [
            src_files[i:i + chunk_size]
            for i in range(0, len(src_files), chunk_size)
        ]
        destination_file_batch = [
            dst_files[i:i + chunk_size]
            for i in range(0, len(dst_files), chunk_size)
        ]

        if num_shards_to_finalize:
            logging.info(
                'Starting finalize_write threads with num_shards: %d (skipped: %d), '
                'batches: %d, num_threads: %d', num_shards_to_finalize,
                num_skipped, len(source_file_batch), num_threads)
            start_time = time.time()

            # Use a thread pool for renaming operations.
            def _rename_batch(batch):
                """_rename_batch executes batch rename operations."""
                source_files, destination_files = batch
                exceptions = []
                try:
                    FileSystems.rename(source_files, destination_files)
                    return exceptions
                except BeamIOError as exp:
                    if exp.exception_details is None:
                        raise
                    for (src,
                         dst), exception in iteritems(exp.exception_details):
                        if exception:
                            logging.error(
                                ('Exception in _rename_batch. src: %s, '
                                 'dst: %s, err: %s'), src, dst, exception)
                            exceptions.append(exception)
                        else:
                            logging.debug('Rename successful: %s -> %s', src,
                                          dst)
                    return exceptions

            exception_batches = util.run_using_threadpool(
                _rename_batch,
                list(zip(source_file_batch, destination_file_batch)),
                num_threads)

            all_exceptions = [
                e for exception_batch in exception_batches
                for e in exception_batch
            ]
            if all_exceptions:
                raise Exception(
                    'Encountered exceptions in finalize_write: %s' %
                    all_exceptions)

            for final_name in dst_files:
                yield final_name

            logging.info('Renamed %d shards in %.2f seconds.',
                         num_shards_to_finalize,
                         time.time() - start_time)
        else:
            logging.warning(
                'No shards found to finalize. num_shards: %d, skipped: %d',
                num_shards, num_skipped)

        try:
            FileSystems.delete([init_result])
        except IOError:
            # May have already been removed.
            pass
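
Example 2 factors its retry bookkeeping into _check_state_for_finalize_write;
Example 5 at the end of this page shows the same triage written inline. A
sketch of that decision table, assuming only exists() and checksum() on a
FileSystems-like object passed in as fs (the function name and the fs
parameter are illustrative, not Beam's API):

def check_state_for_finalize_write(fs, shard_pairs):
    """Classify (src, dst) pairs before renaming.

    dst only                        -> already renamed, skip
    both exist, checksums match     -> rename done, delete leftover src
    src only (or checksum mismatch) -> rename normally
    neither exists                  -> unrecoverable, raise
    """
    src_files, dst_files, delete_files = [], [], []
    num_skipped = 0
    for src, dst in shard_pairs:
        src_exists = fs.exists(src)
        dst_exists = fs.exists(dst)
        if not src_exists and not dst_exists:
            raise IOError('src and dst files do not exist. '
                          'src: %s, dst: %s' % (src, dst))
        if not src_exists and dst_exists:
            num_skipped += 1  # a previous attempt already renamed this shard
        elif (src_exists and dst_exists
              and fs.checksum(src) == fs.checksum(dst)):
            delete_files.append(src)  # rename finished; only cleanup remains
        else:
            src_files.append(src)
            dst_files.append(dst)
    return src_files, dst_files, delete_files, num_skipped
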
Example 3
    def finalize_write(self, init_result, writer_results):
        file_path_prefix = self.file_path_prefix.get()
        file_name_suffix = self.file_name_suffix.get()
        writer_results = sorted(writer_results)
        num_shards = len(writer_results)
        min_threads = min(num_shards, FileBasedSink._MAX_RENAME_THREADS)
        num_threads = max(1, min_threads)

        source_files = []
        destination_files = []
        chunk_size = FileSystems.get_chunk_size(file_path_prefix)
        for shard_num, shard in enumerate(writer_results):
            final_name = ''.join([
                file_path_prefix, self.shard_name_format %
                dict(shard_num=shard_num, num_shards=num_shards),
                file_name_suffix
            ])
            source_files.append(shard)
            destination_files.append(final_name)

        source_file_batch = [
            source_files[i:i + chunk_size]
            for i in range(0, len(source_files), chunk_size)
        ]
        destination_file_batch = [
            destination_files[i:i + chunk_size]
            for i in range(0, len(destination_files), chunk_size)
        ]

        logging.info(
            'Starting finalize_write threads with num_shards: %d, '
            'batches: %d, num_threads: %d', num_shards, len(source_file_batch),
            num_threads)
        start_time = time.time()

        # Use a thread pool for renaming operations.
        def _rename_batch(batch):
            """_rename_batch executes batch rename operations."""
            source_files, destination_files = batch
            exceptions = []
            try:
                FileSystems.rename(source_files, destination_files)
                return exceptions
            except BeamIOError as exp:
                if exp.exception_details is None:
                    raise
                for (src, dest), exception in exp.exception_details.items():
                    if exception:
                        logging.warning('Rename not successful: %s -> %s, %s',
                                        src, dest, exception)
                        should_report = True
                        if isinstance(exception, IOError):
                            # May have already been copied.
                            try:
                                if FileSystems.exists(dest):
                                    should_report = False
                            except Exception as exists_e:  # pylint: disable=broad-except
                                logging.warning(
                                    'Exception when checking if file %s exists: '
                                    '%s', dest, exists_e)
                        if should_report:
                            logging.warning(
                                ('Exception in _rename_batch. src: %s, '
                                 'dest: %s, err: %s'), src, dest, exception)
                            exceptions.append(exception)
                    else:
                        logging.debug('Rename successful: %s -> %s', src, dest)
                return exceptions

        exception_batches = util.run_using_threadpool(
            _rename_batch, zip(source_file_batch, destination_file_batch),
            num_threads)

        all_exceptions = [
            e for exception_batch in exception_batches for e in exception_batch
        ]
        if all_exceptions:
            raise Exception('Encountered exceptions in finalize_write: %s' %
                            all_exceptions)

        for final_name in destination_files:
            yield final_name

        logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                     time.time() - start_time)

        try:
            FileSystems.delete([init_result])
        except IOError:
            # May have already been removed.
            pass
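
Examples 3 and 4 downgrade a rename failure when the destination already
exists, so a retried finalize_write does not fail on work that a previous
attempt (or a copy-then-delete rename) already completed. The core check,
pulled out as a standalone sketch; exists_fn stands in for FileSystems.exists
and is an assumption of this sketch:

import logging

def should_report_rename_failure(exception, dest, exists_fn):
    """Return False when an IOError can be ignored: dest is already in place."""
    if not isinstance(exception, IOError):
        return True  # only IOError is treated as a possibly benign race
    try:
        # If the destination exists, assume an earlier attempt already
        # completed the copy, and swallow the error.
        return not exists_fn(dest)
    except Exception as exists_e:  # pylint: disable=broad-except
        logging.warning('Exception when checking if file %s exists: %s',
                        dest, exists_e)
        return True
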
Example 4
  def finalize_write(self, init_result, writer_results):
    file_path_prefix = self.file_path_prefix.get()
    file_name_suffix = self.file_name_suffix.get()
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)
    min_threads = min(num_shards, FileSink._MAX_RENAME_THREADS)
    num_threads = max(1, min_threads)

    source_files = []
    destination_files = []
    chunk_size = FileSystems.get_chunk_size(file_path_prefix)
    for shard_num, shard in enumerate(writer_results):
      final_name = ''.join([
          file_path_prefix, self.shard_name_format % dict(
              shard_num=shard_num, num_shards=num_shards), file_name_suffix
      ])
      source_files.append(shard)
      destination_files.append(final_name)

    source_file_batch = [source_files[i:i + chunk_size]
                         for i in range(0, len(source_files), chunk_size)]
    destination_file_batch = [destination_files[i:i + chunk_size]
                              for i in range(0, len(destination_files),
                                             chunk_size)]

    logging.info(
        'Starting finalize_write threads with num_shards: %d, '
        'batches: %d, num_threads: %d',
        num_shards, len(source_file_batch), num_threads)
    start_time = time.time()

    # Use a thread pool for renaming operations.
    def _rename_batch(batch):
      """_rename_batch executes batch rename operations."""
      source_files, destination_files = batch
      exceptions = []
      try:
        FileSystems.rename(source_files, destination_files)
        return exceptions
      except BeamIOError as exp:
        if exp.exception_details is None:
          raise
        for (src, dest), exception in exp.exception_details.items():
          if exception:
            logging.warning('Rename not successful: %s -> %s, %s', src, dest,
                            exception)
            should_report = True
            if isinstance(exception, IOError):
              # May have already been copied.
              try:
                if FileSystems.exists(dest):
                  should_report = False
              except Exception as exists_e:  # pylint: disable=broad-except
                logging.warning('Exception when checking if file %s exists: '
                                '%s', dest, exists_e)
            if should_report:
              logging.warning(('Exception in _rename_batch. src: %s, '
                               'dest: %s, err: %s'), src, dest, exception)
              exceptions.append(exception)
          else:
            logging.debug('Rename successful: %s -> %s', src, dest)
        return exceptions

    exception_batches = util.run_using_threadpool(
        _rename_batch, zip(source_file_batch, destination_file_batch),
        num_threads)

    all_exceptions = [e for exception_batch in exception_batches
                      for e in exception_batch]
    if all_exceptions:
      raise Exception('Encountered exceptions in finalize_write: %s' %
                      all_exceptions)

    for final_name in destination_files:
      yield final_name

    logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                 time.time() - start_time)

    try:
      FileSystems.delete([init_result])
    except IOError:
      # May have already been removed.
      pass
Example 5
  def finalize_write(self, init_result, writer_results,
                     unused_pre_finalize_results):
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)

    src_files = []
    dst_files = []
    delete_files = []
    chunk_size = FileSystems.get_chunk_size(self.file_path_prefix.get())
    num_skipped = 0
    for shard_num, shard in enumerate(writer_results):
      final_name = self._get_final_name(shard_num, num_shards)
      src = shard
      dst = final_name
      src_exists = FileSystems.exists(src)
      dst_exists = FileSystems.exists(dst)
      if not src_exists and not dst_exists:
        raise BeamIOError('src and dst files do not exist. src: %s, dst: %s' % (
            src, dst))
      if not src_exists and dst_exists:
        logging.debug('src: %s -> dst: %s already renamed, skipping', src, dst)
        num_skipped += 1
        continue
      if (src_exists and dst_exists and
          FileSystems.checksum(src) == FileSystems.checksum(dst)):
        logging.debug('src: %s == dst: %s, deleting src', src, dst)
        delete_files.append(src)
        continue

      src_files.append(src)
      dst_files.append(dst)

    num_skipped += len(delete_files)  # duplicates deleted above also count as skipped
    FileSystems.delete(delete_files)
    num_shards_to_finalize = len(src_files)
    min_threads = min(num_shards_to_finalize, FileBasedSink._MAX_RENAME_THREADS)
    num_threads = max(1, min_threads)

    source_file_batch = [src_files[i:i + chunk_size]
                         for i in range(0, len(src_files), chunk_size)]
    destination_file_batch = [dst_files[i:i + chunk_size]
                              for i in range(0, len(dst_files), chunk_size)]

    if num_shards_to_finalize:
      logging.info(
          'Starting finalize_write threads with num_shards: %d (skipped: %d), '
          'batches: %d, num_threads: %d',
          num_shards_to_finalize, num_skipped, len(source_file_batch),
          num_threads)
      start_time = time.time()

      # Use a thread pool for renaming operations.
      def _rename_batch(batch):
        """_rename_batch executes batch rename operations."""
        source_files, destination_files = batch
        exceptions = []
        try:
          FileSystems.rename(source_files, destination_files)
          return exceptions
        except BeamIOError as exp:
          if exp.exception_details is None:
            raise
          for (src, dst), exception in exp.exception_details.items():
            if exception:
              logging.error(('Exception in _rename_batch. src: %s, '
                             'dst: %s, err: %s'), src, dst, exception)
              exceptions.append(exception)
            else:
              logging.debug('Rename successful: %s -> %s', src, dst)
          return exceptions

      exception_batches = util.run_using_threadpool(
          _rename_batch, zip(source_file_batch, destination_file_batch),
          num_threads)

      all_exceptions = [e for exception_batch in exception_batches
                        for e in exception_batch]
      if all_exceptions:
        raise Exception(
            'Encountered exceptions in finalize_write: %s' % all_exceptions)

      for final_name in dst_files:
        yield final_name

      logging.info('Renamed %d shards in %.2f seconds.', num_shards_to_finalize,
                   time.time() - start_time)
    else:
      logging.warning(
          'No shards found to finalize. num_shards: %d, skipped: %d',
          num_shards, num_skipped)

    try:
      FileSystems.delete([init_result])
    except IOError:
      # May have already been removed.
      pass
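
Common to every variant above is the contract with FileSystems.rename: a
BeamIOError carrying exception_details is a partial failure, where
exception_details maps each (src, dst) pair to its per-file exception (None
for the files that did rename), while a BeamIOError without details means the
whole batch failed and is re-raised. A condensed sketch of that contract; the
fs argument is a stand-in for Beam's FileSystems:

from apache_beam.io.filesystem import BeamIOError

def rename_batch(fs, batch):
    """Rename one batch, returning per-file exceptions instead of raising."""
    source_files, destination_files = batch
    try:
        fs.rename(source_files, destination_files)
        return []  # the whole batch succeeded
    except BeamIOError as exp:
        if exp.exception_details is None:
            raise  # total failure; no per-file breakdown to recover from
        # Partial failure: keep only the entries that actually failed.
        return [exception
                for (src, dst), exception in exp.exception_details.items()
                if exception]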