def finalize_write(self, init_result, writer_results, pre_finalize_result):
    """Rename written shards to their destinations and yield the final names.

    This is a generator, so nothing below runs until the result is iterated.

    Args:
        init_result: Path passed to ``FileSystems.delete`` after every final
            name has been yielded.
        writer_results: Iterable of iterables of shard paths; it is flattened
            before being handed to ``self._source_dest_shard_pairs``.
        pre_finalize_result: Not used by this method.

    Yields:
        The second element of every pair produced by
        ``self._source_dest_shard_pairs``, in order.

    Raises:
        Exception: If any call to ``self._rename_batch`` returned exceptions.
    """
    file_path_prefix = self.file_path_prefix.get()

    shard_paths = it.chain.from_iterable(writer_results)
    path_pairs = list(self._source_dest_shard_pairs(shard_paths))
    # Head component (``pp.split(...)[0]``) of every destination path.
    unique_dest_dirs = {pp.split(pair[1])[0] for pair in path_pairs}

    num_shards = len(path_pairs)
    # Always at least one thread, never more than the sink-wide maximum.
    num_threads = max(1, min(num_shards, FileBasedSink._MAX_RENAME_THREADS))

    batch_size = FileSystems.get_chunk_size(file_path_prefix)
    batches = [
        path_pairs[i:i + batch_size]
        for i in six.moves.range(0, num_shards, batch_size)
    ]

    logging.info(
        'Starting finalize_write threads with num_shards: %d, '
        'batches: %d, num_threads: %d',
        num_shards, len(batches), num_threads)
    start_time = time.time()

    if unique_dest_dirs:
        # Fix #18 run_using_threadpool raises if you pass in an empty list of
        # inputs, so if we don't have any work to do, then just skip it.
        # No destination dirs means no path pairs and therefore no batches,
        # so the guard covers the rename pool as well.
        util.run_using_threadpool(
            self._create_output_dir, unique_dest_dirs, num_threads)

        exception_batches = util.run_using_threadpool(
            self._rename_batch, batches, num_threads)

        all_exceptions = [
            e for exception_batch in exception_batches for e in exception_batch
        ]
        if all_exceptions:
            # Interpolate the details into the message; passing them as a
            # second positional argument leaves a raw '%s' in the text.
            raise Exception(
                'Encountered exceptions in finalize_write: %s'
                % (all_exceptions,))

        for _, final_name in path_pairs:
            yield final_name

        logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                     time.time() - start_time)

    try:
        FileSystems.delete([init_result])
    except IOError:
        # May have already been removed.
        pass
def _estimate_sizes_of_files(file_names, pattern=None): """Returns the size of all the files as an ordered list based on the file names that are provided here. If the pattern is specified here then we use the size_of_files_in_glob method to get the size of files matching the glob for performance improvements instead of getting the size one by one. """ if not file_names: return [] elif len(file_names) == 1: return [fileio.ChannelFactory.size_in_bytes(file_names[0])] else: if pattern is None: return util.run_using_threadpool( fileio.ChannelFactory.size_in_bytes, file_names, MAX_NUM_THREADS_FOR_SIZE_ESTIMATION) else: file_sizes = fileio.ChannelFactory.size_of_files_in_glob(pattern, file_names) return [file_sizes[f] for f in file_names]
def finalize_write(self, init_result, writer_results,
                   unused_pre_finalize_results):
    """Rename the shards that still need finalizing and yield their names.

    This is a generator, so nothing below runs until the result is iterated.

    Args:
        init_result: Path passed to ``FileSystems.delete`` at the very end.
        writer_results: Iterable of shard paths; sorted before use.
        unused_pre_finalize_results: Not used by this method.

    Yields:
        Every destination file name reported by
        ``self._check_state_for_finalize_write``.

    Raises:
        Exception: If any rename batch reported per-file exceptions.
        BeamIOError: Re-raised from ``FileSystems.rename`` when it carries no
            ``exception_details``.
    """
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)

    state = self._check_state_for_finalize_write(writer_results, num_shards)
    src_files, dst_files, delete_files, num_skipped = state
    num_skipped += len(delete_files)
    FileSystems.delete(delete_files)

    num_shards_to_finalize = len(src_files)
    # Always at least one thread, never more than the sink-wide maximum.
    num_threads = max(
        1, min(num_shards_to_finalize, FileBasedSink._MAX_RENAME_THREADS))

    chunk_size = FileSystems.get_chunk_size(self.file_path_prefix.get())

    def _chunks(files):
        """Split ``files`` into consecutive slices of ``chunk_size``."""
        return [
            files[start:start + chunk_size]
            for start in range(0, len(files), chunk_size)
        ]

    source_file_batch = _chunks(src_files)
    destination_file_batch = _chunks(dst_files)

    if not num_shards_to_finalize:
        logging.warning(
            'No shards found to finalize. num_shards: %d, skipped: %d',
            num_shards, num_skipped)
    else:
        logging.info(
            'Starting finalize_write threads with num_shards: %d (skipped: %d), '
            'batches: %d, num_threads: %d',
            num_shards_to_finalize, num_skipped, len(source_file_batch),
            num_threads)
        start_time = time.time()

        # Use a thread pool for renaming operations.
        def _rename_batch(batch):
            """_rename_batch executes batch rename operations."""
            batch_sources, batch_destinations = batch
            failures = []
            try:
                FileSystems.rename(batch_sources, batch_destinations)
            except BeamIOError as err:
                if err.exception_details is None:
                    raise
                for (src, dst), failure in iteritems(err.exception_details):
                    if not failure:
                        logging.debug('Rename successful: %s -> %s', src, dst)
                        continue
                    logging.error(
                        ('Exception in _rename_batch. src: %s, '
                         'dst: %s, err: %s'), src, dst, failure)
                    failures.append(failure)
            return failures

        paired_batches = list(zip(source_file_batch, destination_file_batch))
        exception_batches = util.run_using_threadpool(
            _rename_batch, paired_batches, num_threads)

        all_exceptions = []
        for batch_failures in exception_batches:
            for failure in batch_failures:
                all_exceptions.append(failure)

        if all_exceptions:
            raise Exception(
                'Encountered exceptions in finalize_write: %s' % all_exceptions)

        for final_name in dst_files:
            yield final_name

        logging.info('Renamed %d shards in %.2f seconds.',
                     num_shards_to_finalize, time.time() - start_time)

    try:
        FileSystems.delete([init_result])
    except IOError:
        # May have already been removed.
        pass
def finalize_write(self, init_result, writer_results):
    """Rename every temporary shard to its final name and yield those names.

    This is a generator, so nothing below runs until the result is iterated.

    Args:
        init_result: Path passed to ``FileSystems.delete`` after every final
            name has been yielded.
        writer_results: Iterable of temporary shard paths; they are sorted and
            numbered in that order.

    Yields:
        The final name of each shard, in sorted-shard order.

    Raises:
        Exception: If any rename failure had to be reported.
        BeamIOError: Re-raised from ``FileSystems.rename`` when it carries no
            ``exception_details``.
    """
    file_path_prefix = self.file_path_prefix.get()
    file_name_suffix = self.file_name_suffix.get()
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)
    # Always at least one thread, never more than the sink-wide maximum.
    num_threads = max(1, min(num_shards, FileBasedSink._MAX_RENAME_THREADS))

    chunk_size = FileSystems.get_chunk_size(file_path_prefix)
    source_files = list(writer_results)
    destination_files = [
        ''.join([
            file_path_prefix,
            self.shard_name_format
            % dict(shard_num=shard_num, num_shards=num_shards),
            file_name_suffix,
        ])
        for shard_num in range(num_shards)
    ]

    source_file_batch = [
        source_files[i:i + chunk_size]
        for i in range(0, len(source_files), chunk_size)
    ]
    destination_file_batch = [
        destination_files[i:i + chunk_size]
        for i in range(0, len(destination_files), chunk_size)
    ]

    logging.info(
        'Starting finalize_write threads with num_shards: %d, '
        'batches: %d, num_threads: %d',
        num_shards, len(source_file_batch), num_threads)
    start_time = time.time()

    # Use a thread pool for renaming operations.
    def _rename_batch(batch):
        """_rename_batch executes batch rename operations."""
        batch_sources, batch_destinations = batch
        exceptions = []
        try:
            FileSystems.rename(batch_sources, batch_destinations)
            return exceptions
        except BeamIOError as exp:
            if exp.exception_details is None:
                raise
            # .items() works on both Python 2 and 3; .iteritems() is
            # Python-2-only.
            for (src, dest), exception in exp.exception_details.items():
                if not exception:
                    logging.debug('Rename successful: %s -> %s', src, dest)
                    continue
                logging.warning('Rename not successful: %s -> %s, %s',
                                src, dest, exception)
                should_report = True
                if isinstance(exception, IOError):
                    # May have already been copied.
                    try:
                        if FileSystems.exists(dest):
                            should_report = False
                    except Exception as exists_e:  # pylint: disable=broad-except
                        logging.warning(
                            'Exception when checking if file %s exists: '
                            '%s', dest, exists_e)
                if should_report:
                    logging.warning(
                        ('Exception in _rename_batch. src: %s, '
                         'dest: %s, err: %s'), src, dest, exception)
                    exceptions.append(exception)
            return exceptions

    # Materialise the pairs: on Python 3 ``zip`` is a one-shot iterator.
    exception_batches = util.run_using_threadpool(
        _rename_batch,
        list(zip(source_file_batch, destination_file_batch)),
        num_threads)

    all_exceptions = [
        e for exception_batch in exception_batches for e in exception_batch
    ]
    if all_exceptions:
        raise Exception(
            'Encountered exceptions in finalize_write: %s' % (all_exceptions,))

    for final_name in destination_files:
        yield final_name

    logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                 time.time() - start_time)

    try:
        FileSystems.delete([init_result])
    except IOError:
        # May have already been removed.
        pass
def finalize_write(self, init_result, writer_results):
    """Rename every temporary shard to its final name and yield those names.

    This is a generator, so nothing below runs until the result is iterated.

    Args:
        init_result: Passed to ``ChannelFactory.rmdir`` after every final name
            has been yielded.
        writer_results: Iterable of temporary shard paths; they are sorted and
            numbered in that order.

    Yields:
        The final name of each shard, in sorted-shard order.

    Raises:
        Exception: If any rename failure had to be reported.
    """
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)
    # Always at least one thread, never more than the sink-wide maximum.
    num_threads = max(1, min(num_shards, FileSink._MAX_RENAME_THREADS))

    rename_ops = []
    for shard_num, shard in enumerate(writer_results):
        final_name = ''.join([
            self.file_path_prefix,
            self.shard_name_format
            % dict(shard_num=shard_num, num_shards=num_shards),
            self.file_name_suffix,
        ])
        rename_ops.append((shard, final_name))

    # Group the operations into batches of MAX_BATCH_OPERATION_SIZE; the last
    # batch holds whatever is left over.
    batches = []
    current_batch = []
    for rename_op in rename_ops:
        current_batch.append(rename_op)
        if len(current_batch) == MAX_BATCH_OPERATION_SIZE:
            batches.append(current_batch)
            current_batch = []
    if current_batch:
        batches.append(current_batch)

    logging.info(
        'Starting finalize_write threads with num_shards: %d, '
        'batches: %d, num_threads: %d',
        num_shards, len(batches), num_threads)
    start_time = time.time()

    # Use a thread pool for renaming operations.
    def _rename_batch(batch):
        """_rename_batch executes batch rename operations."""
        exceptions = []
        exception_infos = ChannelFactory.rename_batch(batch)
        for src, dest, exception in exception_infos:
            if not exception:
                logging.debug('Rename successful: %s -> %s', src, dest)
                continue
            logging.warning('Rename not successful: %s -> %s, %s',
                            src, dest, exception)
            should_report = True
            if isinstance(exception, IOError):
                # May have already been copied.
                try:
                    if ChannelFactory.exists(dest):
                        should_report = False
                except Exception as exists_e:  # pylint: disable=broad-except
                    logging.warning(
                        'Exception when checking if file %s exists: '
                        '%s', dest, exists_e)
            if should_report:
                logging.warning(
                    ('Exception in _rename_batch. src: %s, '
                     'dest: %s, err: %s'), src, dest, exception)
                exceptions.append(exception)
        return exceptions

    exception_batches = util.run_using_threadpool(
        _rename_batch, batches, num_threads)

    all_exceptions = []
    for exceptions in exception_batches:
        if exceptions:
            all_exceptions.extend(exceptions)
    if all_exceptions:
        # Interpolate the details into the message; passing them as a second
        # positional argument leaves a raw '%s' in the text.
        raise Exception(
            'Encountered exceptions in finalize_write: %s' % (all_exceptions,))

    for _, final_name in rename_ops:
        yield final_name

    logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                 time.time() - start_time)

    try:
        ChannelFactory.rmdir(init_result)
    except IOError:
        # May have already been removed.
        pass
def finalize_write(self, init_result, writer_results):
    """Rename every temporary shard to its final name and yield those names.

    This is a generator, so nothing below runs until the result is iterated.

    Args:
        init_result: Path passed to ``FileSystems.delete`` after every final
            name has been yielded.
        writer_results: Iterable of temporary shard paths; they are sorted and
            numbered in that order.

    Yields:
        The final name of each shard, in sorted-shard order.

    Raises:
        Exception: If any rename failure had to be reported.
        BeamIOError: Re-raised from ``FileSystems.rename`` when it carries no
            ``exception_details``.
    """
    file_path_prefix = self.file_path_prefix.get()
    file_name_suffix = self.file_name_suffix.get()
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)
    # Always at least one thread, never more than the sink-wide maximum.
    num_threads = max(1, min(num_shards, FileSink._MAX_RENAME_THREADS))

    source_files = []
    destination_files = []
    chunk_size = FileSystems.get_chunk_size(file_path_prefix)
    for shard_num, shard in enumerate(writer_results):
        final_name = ''.join([
            file_path_prefix,
            self.shard_name_format
            % dict(shard_num=shard_num, num_shards=num_shards),
            file_name_suffix,
        ])
        source_files.append(shard)
        destination_files.append(final_name)

    # ``range`` exists on Python 2 and 3 alike; ``xrange`` is Python-2-only.
    source_file_batch = [
        source_files[i:i + chunk_size]
        for i in range(0, len(source_files), chunk_size)
    ]
    destination_file_batch = [
        destination_files[i:i + chunk_size]
        for i in range(0, len(destination_files), chunk_size)
    ]

    logging.info(
        'Starting finalize_write threads with num_shards: %d, '
        'batches: %d, num_threads: %d',
        num_shards, len(source_file_batch), num_threads)
    start_time = time.time()

    # Use a thread pool for renaming operations.
    def _rename_batch(batch):
        """_rename_batch executes batch rename operations."""
        batch_sources, batch_destinations = batch
        exceptions = []
        try:
            FileSystems.rename(batch_sources, batch_destinations)
            return exceptions
        except BeamIOError as exp:
            if exp.exception_details is None:
                raise
            # .items() works on both Python 2 and 3; .iteritems() is
            # Python-2-only.
            for (src, dest), exception in exp.exception_details.items():
                if not exception:
                    logging.debug('Rename successful: %s -> %s', src, dest)
                    continue
                logging.warning('Rename not successful: %s -> %s, %s',
                                src, dest, exception)
                should_report = True
                if isinstance(exception, IOError):
                    # May have already been copied.
                    try:
                        if FileSystems.exists(dest):
                            should_report = False
                    except Exception as exists_e:  # pylint: disable=broad-except
                        logging.warning(
                            'Exception when checking if file %s exists: '
                            '%s', dest, exists_e)
                if should_report:
                    logging.warning(
                        ('Exception in _rename_batch. src: %s, '
                         'dest: %s, err: %s'), src, dest, exception)
                    exceptions.append(exception)
            return exceptions

    # Materialise the pairs: on Python 3 ``zip`` is a one-shot iterator.
    exception_batches = util.run_using_threadpool(
        _rename_batch,
        list(zip(source_file_batch, destination_file_batch)),
        num_threads)

    all_exceptions = [
        e for exception_batch in exception_batches for e in exception_batch
    ]
    if all_exceptions:
        # Interpolate the details into the message; passing them as a second
        # positional argument leaves a raw '%s' in the text.
        raise Exception(
            'Encountered exceptions in finalize_write: %s' % (all_exceptions,))

    for final_name in destination_files:
        yield final_name

    logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                 time.time() - start_time)

    try:
        FileSystems.delete([init_result])
    except IOError:
        # May have already been removed.
        pass
def finalize_write(self, init_result, writer_results,
                   unused_pre_finalize_results):
    """Rename the shards that still need finalizing and yield their names.

    This is a generator, so nothing below runs until the result is iterated.

    Each shard is classified by which of its source and destination exist:

    * neither exists: ``BeamIOError`` is raised;
    * only the destination exists: the shard is counted as skipped;
    * both exist with equal checksums: the source is queued for deletion and
      counted as skipped;
    * otherwise: the shard is queued for renaming.

    Args:
        init_result: Path passed to ``FileSystems.delete`` at the very end.
        writer_results: Iterable of temporary shard paths; they are sorted and
            numbered in that order.
        unused_pre_finalize_results: Not used by this method.

    Yields:
        The destination name of every shard that was renamed.

    Raises:
        BeamIOError: If a shard has neither source nor destination, or when
            re-raised from ``FileSystems.rename`` without
            ``exception_details``.
        Exception: If any rename batch reported per-file exceptions.
    """
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)

    src_files = []
    dst_files = []
    delete_files = []
    chunk_size = FileSystems.get_chunk_size(self.file_path_prefix.get())
    num_skipped = 0

    for shard_num, src in enumerate(writer_results):
        dst = self._get_final_name(shard_num, num_shards)
        src_exists = FileSystems.exists(src)
        dst_exists = FileSystems.exists(dst)

        if not src_exists and not dst_exists:
            raise BeamIOError(
                'src and dst files do not exist. src: %s, dst: %s' % (src, dst))
        if not src_exists and dst_exists:
            logging.debug('src: %s -> dst: %s already renamed, skipping',
                          src, dst)
            num_skipped += 1
            continue
        if (src_exists and dst_exists and
                FileSystems.checksum(src) == FileSystems.checksum(dst)):
            logging.debug('src: %s == dst: %s, deleting src', src, dst)
            delete_files.append(src)
            continue

        src_files.append(src)
        dst_files.append(dst)

    # Add to the running count; plain assignment would discard the shards
    # already counted as "already renamed" in the loop above.
    num_skipped += len(delete_files)
    FileSystems.delete(delete_files)

    num_shards_to_finalize = len(src_files)
    # Always at least one thread, never more than the sink-wide maximum.
    num_threads = max(
        1, min(num_shards_to_finalize, FileBasedSink._MAX_RENAME_THREADS))

    source_file_batch = [
        src_files[i:i + chunk_size]
        for i in range(0, len(src_files), chunk_size)
    ]
    destination_file_batch = [
        dst_files[i:i + chunk_size]
        for i in range(0, len(dst_files), chunk_size)
    ]

    if num_shards_to_finalize:
        logging.info(
            'Starting finalize_write threads with num_shards: %d (skipped: %d), '
            'batches: %d, num_threads: %d',
            num_shards_to_finalize, num_skipped, len(source_file_batch),
            num_threads)
        start_time = time.time()

        # Use a thread pool for renaming operations.
        def _rename_batch(batch):
            """_rename_batch executes batch rename operations."""
            batch_sources, batch_destinations = batch
            exceptions = []
            try:
                FileSystems.rename(batch_sources, batch_destinations)
                return exceptions
            except BeamIOError as exp:
                if exp.exception_details is None:
                    raise
                # .items() works on both Python 2 and 3; .iteritems() is
                # Python-2-only.
                for (src, dst), exception in exp.exception_details.items():
                    if exception:
                        logging.error(
                            ('Exception in _rename_batch. src: %s, '
                             'dst: %s, err: %s'), src, dst, exception)
                        exceptions.append(exception)
                    else:
                        logging.debug('Rename successful: %s -> %s', src, dst)
                return exceptions

        # Materialise the pairs: on Python 3 ``zip`` is a one-shot iterator.
        exception_batches = util.run_using_threadpool(
            _rename_batch,
            list(zip(source_file_batch, destination_file_batch)),
            num_threads)

        all_exceptions = [
            e for exception_batch in exception_batches for e in exception_batch
        ]
        if all_exceptions:
            raise Exception(
                'Encountered exceptions in finalize_write: %s'
                % (all_exceptions,))

        for final_name in dst_files:
            yield final_name

        logging.info('Renamed %d shards in %.2f seconds.',
                     num_shards_to_finalize, time.time() - start_time)
    else:
        logging.warning(
            'No shards found to finalize. num_shards: %d, skipped: %d',
            num_shards, num_skipped)

    try:
        FileSystems.delete([init_result])
    except IOError:
        # May have already been removed.
        pass
def finalize_write(self, init_result, writer_results):
    """Rename every temporary shard to its final name and yield those names.

    This is a generator, so nothing below runs until the result is iterated.

    Args:
        init_result: Passed to ``ChannelFactory.rmdir`` after every final name
            has been yielded.
        writer_results: Iterable of temporary shard paths; they are sorted and
            numbered in that order.

    Yields:
        The final name of each shard, in sorted-shard order.

    Raises:
        Exception: If any rename failure had to be reported.
    """
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)
    # Always at least one thread, never more than the sink-wide maximum.
    num_threads = max(1, min(num_shards, FileSink._MAX_RENAME_THREADS))

    rename_ops = []
    for shard_num, shard in enumerate(writer_results):
        final_name = ''.join([
            self.file_path_prefix,
            self.shard_name_format
            % dict(shard_num=shard_num, num_shards=num_shards),
            self.file_name_suffix,
        ])
        rename_ops.append((shard, final_name))

    # Group the operations into batches of MAX_BATCH_OPERATION_SIZE; the last
    # batch holds whatever is left over.
    batches = []
    current_batch = []
    for rename_op in rename_ops:
        current_batch.append(rename_op)
        if len(current_batch) == MAX_BATCH_OPERATION_SIZE:
            batches.append(current_batch)
            current_batch = []
    if current_batch:
        batches.append(current_batch)

    logging.info(
        'Starting finalize_write threads with num_shards: %d, '
        'batches: %d, num_threads: %d',
        num_shards, len(batches), num_threads)
    start_time = time.time()

    # Use a thread pool for renaming operations.
    def _rename_batch(batch):
        """_rename_batch executes batch rename operations."""
        exceptions = []
        exception_infos = ChannelFactory.rename_batch(batch)
        for src, dest, exception in exception_infos:
            if not exception:
                logging.debug('Rename successful: %s -> %s', src, dest)
                continue
            logging.warning('Rename not successful: %s -> %s, %s',
                            src, dest, exception)
            should_report = True
            if isinstance(exception, IOError):
                # May have already been copied.
                try:
                    if ChannelFactory.exists(dest):
                        should_report = False
                except Exception as exists_e:  # pylint: disable=broad-except
                    logging.warning(
                        'Exception when checking if file %s exists: '
                        '%s', dest, exists_e)
            if should_report:
                logging.warning(
                    ('Exception in _rename_batch. src: %s, '
                     'dest: %s, err: %s'), src, dest, exception)
                exceptions.append(exception)
        return exceptions

    exception_batches = util.run_using_threadpool(
        _rename_batch, batches, num_threads)

    all_exceptions = []
    for exceptions in exception_batches:
        if exceptions:
            all_exceptions.extend(exceptions)
    if all_exceptions:
        # Interpolate the details into the message; passing them as a second
        # positional argument leaves a raw '%s' in the text.
        raise Exception(
            'Encountered exceptions in finalize_write: %s' % (all_exceptions,))

    for _, final_name in rename_ops:
        yield final_name

    logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                 time.time() - start_time)

    try:
        ChannelFactory.rmdir(init_result)
    except IOError:
        # May have already been removed.
        pass