def finalize_write(self, init_result, writer_results, pre_finalize_result):
        file_path_prefix = self.file_path_prefix.get()

        shard_paths = it.chain.from_iterable(writer_results)
        path_pairs = list(self._source_dest_shard_pairs(shard_paths))
        unique_dest_dirs = {FileSystems.split(pair[1])[0] for pair in path_pairs}

        num_shards = len(path_pairs)
        min_threads = min(num_shards, FileBasedSink._MAX_RENAME_THREADS)
        num_threads = max(1, min_threads)

        batch_size = FileSystems.get_chunk_size(file_path_prefix)
        batches = [
            path_pairs[i:i + batch_size]
            for i in six.moves.range(0, len(path_pairs), batch_size)
        ]

        logging.info(
            'Starting finalize_write threads with num_shards: %d, '
            'batches: %d, num_threads: %d', num_shards, len(batches),
            num_threads)
        start_time = time.time()

        if unique_dest_dirs:
            # Fix #18: run_using_threadpool raises if given an empty list of
            # inputs, so skip the thread pool entirely when there is no work.
            util.run_using_threadpool(self._create_output_dir,
                                      unique_dest_dirs, num_threads)

            exception_batches = util.run_using_threadpool(
                self._rename_batch, batches, num_threads)

            all_exceptions = [
                e for exception_batch in exception_batches
                for e in exception_batch
            ]

            if all_exceptions:
                raise Exception('Encountered exceptions in finalize_write: %s'
                                % all_exceptions)

        for _, final_name in path_pairs:
            yield final_name

        logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                     time.time() - start_time)

        try:
            FileSystems.delete([init_result])
        except IOError:
            # May have already been removed.
            pass
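
All of these examples funnel their I/O through util.run_using_threadpool. The helper itself is internal to the SDK, but its assumed behavior is simple enough to sketch: map a function over a list of inputs on a bounded thread pool. The sketch below is an approximation, not the SDK's actual implementation; note how an empty input list would produce ThreadPool(0), which raises, which is exactly why the example above guards the calls and clamps num_threads with max(1, ...).

# Minimal sketch of the assumed behavior of util.run_using_threadpool
# (the real helper lives inside the SDK; this is an approximation).
from multiprocessing.pool import ThreadPool

def run_using_threadpool(fn, inputs, pool_size):
    # min(pool_size, len(inputs)) avoids idle threads, but evaluates to 0
    # for an empty input list, and ThreadPool(0) raises ValueError --
    # hence the empty-input guard in the caller above.
    pool = ThreadPool(min(pool_size, len(inputs)))
    try:
        # Returns one result per input, in input order.
        return pool.map(fn, inputs)
    finally:
        pool.terminate()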
Example #2
 def _estimate_sizes_of_files(file_names, pattern=None):
   """Returns the size of all the files as an ordered list based on the file
   names that are provided here. If the pattern is specified here then we use
   the size_of_files_in_glob method to get the size of files matching the glob
   for performance improvements instead of getting the size one by one.
   """
   if not file_names:
     return []
   elif len(file_names) == 1:
     return [fileio.ChannelFactory.size_in_bytes(file_names[0])]
   else:
     if pattern is None:
       return util.run_using_threadpool(
           fileio.ChannelFactory.size_in_bytes, file_names,
           MAX_NUM_THREADS_FOR_SIZE_ESTIMATION)
     else:
       file_sizes = fileio.ChannelFactory.size_of_files_in_glob(pattern,
                                                                file_names)
       return [file_sizes[f] for f in file_names]
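
A hypothetical call site (bucket and file names invented for illustration); whichever branch is taken, the returned list lines up with file_names.

# Hypothetical usage -- paths are illustrative only.
file_names = ['gs://my-bucket/part-00000', 'gs://my-bucket/part-00001']
sizes = _estimate_sizes_of_files(file_names, pattern='gs://my-bucket/part-*')
total_bytes = sum(sizes)  # sizes[i] corresponds to file_names[i]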
Example #3
    def finalize_write(self, init_result, writer_results,
                       unused_pre_finalize_results):
        writer_results = sorted(writer_results)
        num_shards = len(writer_results)

        src_files, dst_files, delete_files, num_skipped = (
            self._check_state_for_finalize_write(writer_results, num_shards))
        num_skipped += len(delete_files)
        FileSystems.delete(delete_files)
        num_shards_to_finalize = len(src_files)
        min_threads = min(num_shards_to_finalize,
                          FileBasedSink._MAX_RENAME_THREADS)
        num_threads = max(1, min_threads)

        chunk_size = FileSystems.get_chunk_size(self.file_path_prefix.get())
        source_file_batch = [
            src_files[i:i + chunk_size]
            for i in range(0, len(src_files), chunk_size)
        ]
        destination_file_batch = [
            dst_files[i:i + chunk_size]
            for i in range(0, len(dst_files), chunk_size)
        ]

        if num_shards_to_finalize:
            logging.info(
                'Starting finalize_write threads with num_shards: %d (skipped: %d), '
                'batches: %d, num_threads: %d', num_shards_to_finalize,
                num_skipped, len(source_file_batch), num_threads)
            start_time = time.time()

            # Use a thread pool for renaming operations.
            def _rename_batch(batch):
                """_rename_batch executes batch rename operations."""
                source_files, destination_files = batch
                exceptions = []
                try:
                    FileSystems.rename(source_files, destination_files)
                    return exceptions
                except BeamIOError as exp:
                    if exp.exception_details is None:
                        raise
                    for (src,
                         dst), exception in iteritems(exp.exception_details):
                        if exception:
                            logging.error(
                                ('Exception in _rename_batch. src: %s, '
                                 'dst: %s, err: %s'), src, dst, exception)
                            exceptions.append(exception)
                        else:
                            logging.debug('Rename successful: %s -> %s', src,
                                          dst)
                    return exceptions

            exception_batches = util.run_using_threadpool(
                _rename_batch,
                list(zip(source_file_batch, destination_file_batch)),
                num_threads)

            all_exceptions = [
                e for exception_batch in exception_batches
                for e in exception_batch
            ]
            if all_exceptions:
                raise Exception(
                    'Encountered exceptions in finalize_write: %s' %
                    all_exceptions)

            for final_name in dst_files:
                yield final_name

            logging.info('Renamed %d shards in %.2f seconds.',
                         num_shards_to_finalize,
                         time.time() - start_time)
        else:
            logging.warning(
                'No shards found to finalize. num_shards: %d, skipped: %d',
                num_shards, num_skipped)

        try:
            FileSystems.delete([init_result])
        except IOError:
            # May have already been removed.
            pass
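
The list-slicing idiom used to build source_file_batch and destination_file_batch recurs throughout these examples; factored out, it is just a fixed-size chunker (an illustrative helper, not part of the SDK):

# Illustrative helper, equivalent to the inline slicing above.
def _batch(items, size):
    """Splits items into consecutive sublists of at most `size` elements."""
    return [items[i:i + size] for i in range(0, len(items), size)]

# _batch([1, 2, 3, 4, 5], 2) == [[1, 2], [3, 4], [5]]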
Example #4
    def finalize_write(self, init_result, writer_results):
        file_path_prefix = self.file_path_prefix.get()
        file_name_suffix = self.file_name_suffix.get()
        writer_results = sorted(writer_results)
        num_shards = len(writer_results)
        min_threads = min(num_shards, FileBasedSink._MAX_RENAME_THREADS)
        num_threads = max(1, min_threads)

        source_files = []
        destination_files = []
        chunk_size = FileSystems.get_chunk_size(file_path_prefix)
        for shard_num, shard in enumerate(writer_results):
            final_name = ''.join([
                file_path_prefix, self.shard_name_format %
                dict(shard_num=shard_num, num_shards=num_shards),
                file_name_suffix
            ])
            source_files.append(shard)
            destination_files.append(final_name)

        source_file_batch = [
            source_files[i:i + chunk_size]
            for i in range(0, len(source_files), chunk_size)
        ]
        destination_file_batch = [
            destination_files[i:i + chunk_size]
            for i in range(0, len(destination_files), chunk_size)
        ]

        logging.info(
            'Starting finalize_write threads with num_shards: %d, '
            'batches: %d, num_threads: %d', num_shards, len(source_file_batch),
            num_threads)
        start_time = time.time()

        # Use a thread pool for renaming operations.
        def _rename_batch(batch):
            """_rename_batch executes batch rename operations."""
            source_files, destination_files = batch
            exceptions = []
            try:
                FileSystems.rename(source_files, destination_files)
                return exceptions
            except BeamIOError as exp:
                if exp.exception_details is None:
                    raise
                for (src, dest), exception in exp.exception_details.items():
                    if exception:
                        logging.warning('Rename not successful: %s -> %s, %s',
                                        src, dest, exception)
                        should_report = True
                        if isinstance(exception, IOError):
                            # May have already been copied.
                            try:
                                if FileSystems.exists(dest):
                                    should_report = False
                            except Exception as exists_e:  # pylint: disable=broad-except
                                logging.warning(
                                    'Exception when checking if file %s exists: '
                                    '%s', dest, exists_e)
                        if should_report:
                            logging.warning(
                                ('Exception in _rename_batch. src: %s, '
                                 'dest: %s, err: %s'), src, dest, exception)
                            exceptions.append(exception)
                    else:
                        logging.debug('Rename successful: %s -> %s', src, dest)
                return exceptions

        exception_batches = util.run_using_threadpool(
            _rename_batch, zip(source_file_batch, destination_file_batch),
            num_threads)

        all_exceptions = [
            e for exception_batch in exception_batches for e in exception_batch
        ]
        if all_exceptions:
            raise Exception('Encountered exceptions in finalize_write: %s' %
                            all_exceptions)

        for final_name in destination_files:
            yield final_name

        logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                     time.time() - start_time)

        try:
            FileSystems.delete([init_result])
        except IOError:
            # May have already been removed.
            pass
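
The recovery pattern inside _rename_batch above, treating an IOError as benign when the destination already exists, can be distilled into a standalone predicate. This is a sketch of the logic already shown, not SDK code:

# Sketch of the "may have already been copied" check used above.
def _rename_already_done(dest, exception):
    """Returns True if a failed rename can be ignored because dest exists."""
    if not isinstance(exception, IOError):
        return False
    try:
        return FileSystems.exists(dest)
    except Exception:  # pylint: disable=broad-except
        # If the existence check itself fails, report the original error.
        return False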
Example #5
    def finalize_write(self, init_result, writer_results):
        writer_results = sorted(writer_results)
        num_shards = len(writer_results)
        min_threads = min(num_shards, FileSink._MAX_RENAME_THREADS)
        num_threads = max(1, min_threads)

        rename_ops = []
        for shard_num, shard in enumerate(writer_results):
            final_name = ''.join([
                self.file_path_prefix, self.shard_name_format %
                dict(shard_num=shard_num, num_shards=num_shards),
                self.file_name_suffix
            ])
            rename_ops.append((shard, final_name))

        batches = []
        current_batch = []
        for rename_op in rename_ops:
            current_batch.append(rename_op)
            if len(current_batch) == MAX_BATCH_OPERATION_SIZE:
                batches.append(current_batch)
                current_batch = []
        if current_batch:
            batches.append(current_batch)

        logging.info(
            'Starting finalize_write threads with num_shards: %d, '
            'batches: %d, num_threads: %d', num_shards, len(batches),
            num_threads)
        start_time = time.time()

        # Use a thread pool for renaming operations.
        def _rename_batch(batch):
            """_rename_batch executes batch rename operations."""
            exceptions = []
            exception_infos = ChannelFactory.rename_batch(batch)
            for src, dest, exception in exception_infos:
                if exception:
                    logging.warning('Rename not successful: %s -> %s, %s', src,
                                    dest, exception)
                    should_report = True
                    if isinstance(exception, IOError):
                        # May have already been copied.
                        try:
                            if ChannelFactory.exists(dest):
                                should_report = False
                        except Exception as exists_e:  # pylint: disable=broad-except
                            logging.warning(
                                'Exception when checking if file %s exists: '
                                '%s', dest, exists_e)
                    if should_report:
                        logging.warning(
                            ('Exception in _rename_batch. src: %s, '
                             'dest: %s, err: %s'), src, dest, exception)
                        exceptions.append(exception)
                else:
                    logging.debug('Rename successful: %s -> %s', src, dest)
            return exceptions

        exception_batches = util.run_using_threadpool(_rename_batch, batches,
                                                      num_threads)

        all_exceptions = []
        for exceptions in exception_batches:
            if exceptions:
                all_exceptions += exceptions
        if all_exceptions:
            raise Exception('Encountered exceptions in finalize_write: %s'
                            % all_exceptions)

        for _, final_name in rename_ops:
            yield final_name

        logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                     time.time() - start_time)

        try:
            ChannelFactory.rmdir(init_result)
        except IOError:
            # May have already been removed.
            pass
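
The accumulator loop that fills batches in this example is behaviorally identical to the slicing comprehension used in the other examples; either form splits rename_ops into chunks of at most MAX_BATCH_OPERATION_SIZE:

# Equivalent to the accumulator loop above.
batches = [rename_ops[i:i + MAX_BATCH_OPERATION_SIZE]
           for i in range(0, len(rename_ops), MAX_BATCH_OPERATION_SIZE)]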
Example #6
  def finalize_write(self, init_result, writer_results):
    file_path_prefix = self.file_path_prefix.get()
    file_name_suffix = self.file_name_suffix.get()
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)
    min_threads = min(num_shards, FileSink._MAX_RENAME_THREADS)
    num_threads = max(1, min_threads)

    source_files = []
    destination_files = []
    chunk_size = FileSystems.get_chunk_size(file_path_prefix)
    for shard_num, shard in enumerate(writer_results):
      final_name = ''.join([
          file_path_prefix, self.shard_name_format % dict(
              shard_num=shard_num, num_shards=num_shards), file_name_suffix
      ])
      source_files.append(shard)
      destination_files.append(final_name)

    source_file_batch = [source_files[i:i + chunk_size]
                         for i in range(0, len(source_files), chunk_size)]
    destination_file_batch = [destination_files[i:i + chunk_size]
                              for i in range(0, len(destination_files),
                                             chunk_size)]

    logging.info(
        'Starting finalize_write threads with num_shards: %d, '
        'batches: %d, num_threads: %d',
        num_shards, len(source_file_batch), num_threads)
    start_time = time.time()

    # Use a thread pool for renaming operations.
    def _rename_batch(batch):
      """_rename_batch executes batch rename operations."""
      source_files, destination_files = batch
      exceptions = []
      try:
        FileSystems.rename(source_files, destination_files)
        return exceptions
      except BeamIOError as exp:
        if exp.exception_details is None:
          raise
        for (src, dest), exception in exp.exception_details.items():
          if exception:
            logging.warning('Rename not successful: %s -> %s, %s', src, dest,
                            exception)
            should_report = True
            if isinstance(exception, IOError):
              # May have already been copied.
              try:
                if FileSystems.exists(dest):
                  should_report = False
              except Exception as exists_e:  # pylint: disable=broad-except
                logging.warning('Exception when checking if file %s exists: '
                                '%s', dest, exists_e)
            if should_report:
              logging.warning(('Exception in _rename_batch. src: %s, '
                               'dest: %s, err: %s'), src, dest, exception)
              exceptions.append(exception)
          else:
            logging.debug('Rename successful: %s -> %s', src, dest)
        return exceptions

    exception_batches = util.run_using_threadpool(
        _rename_batch, zip(source_file_batch, destination_file_batch),
        num_threads)

    all_exceptions = [e for exception_batch in exception_batches
                      for e in exception_batch]
    if all_exceptions:
      raise Exception('Encountered exceptions in finalize_write: %s'
                      % all_exceptions)

    for final_name in destination_files:
      yield final_name

    logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                 time.time() - start_time)

    try:
      FileSystems.delete([init_result])
    except IOError:
      # May have already been removed.
      pass
Example #7
  def finalize_write(self, init_result, writer_results,
                     unused_pre_finalize_results):
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)

    src_files = []
    dst_files = []
    delete_files = []
    chunk_size = FileSystems.get_chunk_size(self.file_path_prefix.get())
    num_skipped = 0
    for shard_num, shard in enumerate(writer_results):
      final_name = self._get_final_name(shard_num, num_shards)
      src = shard
      dst = final_name
      src_exists = FileSystems.exists(src)
      dst_exists = FileSystems.exists(dst)
      if not src_exists and not dst_exists:
        raise BeamIOError('src and dst files do not exist. src: %s, dst: %s' % (
            src, dst))
      if not src_exists and dst_exists:
        logging.debug('src: %s -> dst: %s already renamed, skipping', src, dst)
        num_skipped += 1
        continue
      if (src_exists and dst_exists and
          FileSystems.checksum(src) == FileSystems.checksum(dst)):
        logging.debug('src: %s == dst: %s, deleting src', src, dst)
        delete_files.append(src)
        continue

      src_files.append(src)
      dst_files.append(dst)

    num_skipped += len(delete_files)
    FileSystems.delete(delete_files)
    num_shards_to_finalize = len(src_files)
    min_threads = min(num_shards_to_finalize, FileBasedSink._MAX_RENAME_THREADS)
    num_threads = max(1, min_threads)

    source_file_batch = [src_files[i:i + chunk_size]
                         for i in range(0, len(src_files), chunk_size)]
    destination_file_batch = [dst_files[i:i + chunk_size]
                              for i in range(0, len(dst_files), chunk_size)]

    if num_shards_to_finalize:
      logging.info(
          'Starting finalize_write threads with num_shards: %d (skipped: %d), '
          'batches: %d, num_threads: %d',
          num_shards_to_finalize, num_skipped, len(source_file_batch),
          num_threads)
      start_time = time.time()

      # Use a thread pool for renaming operations.
      def _rename_batch(batch):
        """_rename_batch executes batch rename operations."""
        source_files, destination_files = batch
        exceptions = []
        try:
          FileSystems.rename(source_files, destination_files)
          return exceptions
        except BeamIOError as exp:
          if exp.exception_details is None:
            raise
          for (src, dst), exception in exp.exception_details.items():
            if exception:
              logging.error(('Exception in _rename_batch. src: %s, '
                             'dst: %s, err: %s'), src, dst, exception)
              exceptions.append(exception)
            else:
              logging.debug('Rename successful: %s -> %s', src, dst)
          return exceptions

      exception_batches = util.run_using_threadpool(
          _rename_batch, zip(source_file_batch, destination_file_batch),
          num_threads)

      all_exceptions = [e for exception_batch in exception_batches
                        for e in exception_batch]
      if all_exceptions:
        raise Exception(
            'Encountered exceptions in finalize_write: %s' % all_exceptions)

      for final_name in dst_files:
        yield final_name

      logging.info('Renamed %d shards in %.2f seconds.', num_shards_to_finalize,
                   time.time() - start_time)
    else:
      logging.warning(
          'No shards found to finalize. num_shards: %d, skipped: %d',
          num_shards, num_skipped)

    try:
      FileSystems.delete([init_result])
    except IOError:
      # May have already been removed.
      pass
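
The per-shard checks at the top of this example implement a small state machine that makes finalization idempotent across retries. Summarized as a standalone helper (illustrative only; FileSystems calls as used above):

# Illustrative summary of the per-shard decision logic above.
def _shard_action(src, dst):
    """Returns 'skip', 'delete_src', or 'rename' for one (src, dst) pair."""
    src_exists = FileSystems.exists(src)
    dst_exists = FileSystems.exists(dst)
    if not src_exists and not dst_exists:
        raise BeamIOError(
            'src and dst files do not exist. src: %s, dst: %s' % (src, dst))
    if not src_exists:
        return 'skip'        # A previous attempt already renamed this shard.
    if dst_exists and FileSystems.checksum(src) == FileSystems.checksum(dst):
        return 'delete_src'  # Rename completed earlier; drop the leftover src.
    return 'rename'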
Example #8
  def finalize_write(self, init_result, writer_results):
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)
    min_threads = min(num_shards, FileSink._MAX_RENAME_THREADS)
    num_threads = max(1, min_threads)

    rename_ops = []
    for shard_num, shard in enumerate(writer_results):
      final_name = ''.join([
          self.file_path_prefix, self.shard_name_format % dict(
              shard_num=shard_num, num_shards=num_shards), self.file_name_suffix
      ])
      rename_ops.append((shard, final_name))

    batches = []
    current_batch = []
    for rename_op in rename_ops:
      current_batch.append(rename_op)
      if len(current_batch) == MAX_BATCH_OPERATION_SIZE:
        batches.append(current_batch)
        current_batch = []
    if current_batch:
      batches.append(current_batch)

    logging.info(
        'Starting finalize_write threads with num_shards: %d, '
        'batches: %d, num_threads: %d',
        num_shards, len(batches), num_threads)
    start_time = time.time()

    # Use a thread pool for renaming operations.
    def _rename_batch(batch):
      """_rename_batch executes batch rename operations."""
      exceptions = []
      exception_infos = ChannelFactory.rename_batch(batch)
      for src, dest, exception in exception_infos:
        if exception:
          logging.warning('Rename not successful: %s -> %s, %s', src, dest,
                          exception)
          should_report = True
          if isinstance(exception, IOError):
            # May have already been copied.
            try:
              if ChannelFactory.exists(dest):
                should_report = False
            except Exception as exists_e:  # pylint: disable=broad-except
              logging.warning('Exception when checking if file %s exists: '
                              '%s', dest, exists_e)
          if should_report:
            logging.warning(('Exception in _rename_batch. src: %s, '
                             'dest: %s, err: %s'), src, dest, exception)
            exceptions.append(exception)
        else:
          logging.debug('Rename successful: %s -> %s', src, dest)
      return exceptions

    exception_batches = util.run_using_threadpool(
        _rename_batch, batches, num_threads)

    all_exceptions = []
    for exceptions in exception_batches:
      if exceptions:
        all_exceptions += exceptions
    if all_exceptions:
      raise Exception('Encountered exceptions in finalize_write: %s'
                      % all_exceptions)

    for _, final_name in rename_ops:
      yield final_name

    logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                 time.time() - start_time)

    try:
      ChannelFactory.rmdir(init_result)
    except IOError:
      # May have already been removed.
      pass