def batch_convert_multi_files_to_single(input_path, output_folder, threads, recursive):
    """Split every multi-read fast5 found under input_path into single-read files.

    Conversions run in a worker pool; as each batch completes, its results
    are appended to filename_mapping.txt in output_folder and the progress
    bar advances by one.
    """
    pool = Pool(threads)
    file_list = get_fast5_file_list(input_path, recursive)
    pbar = get_progress_bar(len(file_list))
    mapping_path = os.path.join(output_folder, "filename_mapping.txt")

    def update(results):
        # First entry of the results deque names the file for this batch;
        # each remaining entry is written alongside it in the mapping table.
        output_file = os.path.basename(results.popleft())
        with open(mapping_path, 'a') as output_table:
            for filename in results:
                output_table.write("{}\t{}\n".format(output_file, filename))
        pbar.update(pbar.currval + 1)

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    pending = []
    for batch_num, filename in enumerate(file_list):
        pending.append(
            pool.apply_async(convert_multi_to_single,
                             args=(filename, output_folder, str(batch_num)),
                             callback=update))
    pool.close()
    pool.join()
    pbar.finish()
def batch_convert_single_to_multi(input_path, output_folder, filename_base,
                                  batch_size, threads, recursive,
                                  follow_symlinks, target_compression):
    """Bundle single-read fast5 files into multi-read files of at most batch_size reads.

    Output files are named "<filename_base>_<batch_num>.fast5"; the
    single-read -> multi-read pairing is appended to filename_mapping.txt
    in output_folder as each batch finishes.
    """
    pool = Pool(threads)
    file_list = get_fast5_file_list(input_path, recursive, follow_symlinks)
    # Ceiling division: one progress tick per output batch.
    num_batches = int((len(file_list) + batch_size - 1) / batch_size)
    pbar = get_progress_bar(num_batches)

    def update(result):
        # result is (list of input filenames, output multi-read filename).
        output_file = result[1]
        with open(os.path.join(output_folder, "filename_mapping.txt"), 'a') as mapping:
            for filename in result[0]:
                mapping.write("{}\t{}\n".format(filename, output_file))
        pbar.update(pbar.currval + 1)

    pending = []
    os.makedirs(output_folder, exist_ok=True)
    for batch_num, batch in enumerate(batcher(file_list, batch_size)):
        output_file = os.path.join(output_folder,
                                   "{}_{}.fast5".format(filename_base, batch_num))
        pending.append(
            pool.apply_async(create_multi_read_file,
                             args=(batch, output_file, target_compression),
                             callback=update))
    pool.close()
    pool.join()
    pbar.finish()
def compress_batch(input_folder, output_folder, target_compression,
                   recursive=True, threads=1, follow_symlinks=True):
    """Re-compress every fast5 under input_folder into output_folder.

    The directory layout relative to input_folder is replicated under
    output_folder, which is why input_folder is normalised to an absolute
    path first.

    :raises ValueError: if no input fast5 files are found
    """
    input_folder = os.path.abspath(input_folder)
    file_list = get_fast5_file_list(input_folder, recursive,
                                    follow_symlinks=follow_symlinks)
    if not file_list:
        raise ValueError("No input fast5 files found in '{}'. Recursive={}".format(input_folder, recursive))

    # Never spawn more workers than there are files to process.
    pool = Pool(min(threads, len(file_list)))
    pbar = get_progress_bar(len(file_list))

    def update(result):
        pbar.update(pbar.currval + 1)

    for input_file in file_list:
        src = os.path.join(input_folder, input_file)
        dst = os.path.join(output_folder, os.path.relpath(src, input_folder))
        pool.apply_async(func=compress_file,
                         args=(src, dst, target_compression),
                         callback=update)

    # close() must precede join(); a context manager can't express that order.
    pool.close()
    pool.join()
    pbar.finish()
def batch_convert_single_to_multi(input_path, output_folder, filename_base,
                                  batch_size, threads, recursive):
    """Bundle single-read fast5 files into multi-read fast5 files.

    Reads are grouped into batches of batch_size and converted in a worker
    pool. A filename_mapping.txt table in output_folder records which
    single-read file went into which multi-read output file; the header is
    written once up front and each completed batch appends its rows.
    """
    pool = Pool(threads)
    file_list = get_fast5_file_list(input_path, recursive)
    # Ceiling division: one progress tick per output batch.
    pbar = get_progress_bar(int((len(file_list) + batch_size - 1) / batch_size))

    def update(results):
        # First entry of the results deque names the file for this batch;
        # the remaining entries are paired with it in the mapping table.
        output_file = os.path.basename(results.popleft())
        # BUGFIX: append ('a'), not truncate ('w') -- opening with 'w' here
        # wiped the header and every previously recorded batch each time a
        # batch completed, leaving only the last batch in the mapping file.
        with open(os.path.join(output_folder, "filename_mapping.txt"), 'a') as output_table:
            for filename in results:
                output_table.write("{}\t{}\n".format(filename, output_file))
        pbar.update(pbar.currval + 1)

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Write the header once, truncating any mapping left from a previous run.
    with open(os.path.join(output_folder, "filename_mapping.txt"), 'w') as output_table:
        output_table.write("single_read_file\tmulti_read_file\n")

    for batch_num, batch in enumerate(batcher(file_list, batch_size)):
        output_file = os.path.join(output_folder,
                                   "{}_{}.fast5".format(filename_base, batch_num))
        pool.apply_async(create_multi_read_file,
                         args=(batch, output_file),
                         callback=update)
    pool.close()
    pool.join()
    pbar.finish()
def __init__(self, input_folder, output_folder, read_list_file, filename_base,
             batch_size=4000, threads=1, recursive=False, file_list_file=None):
    """Set up state for subsetting reads from fast5 files into batched outputs.

    :param input_folder: directory containing the input fast5 files
    :param output_folder: directory the multi-read output files go to
        (created if missing)
    :param read_list_file: file naming the reads to extract
    :param filename_base: prefix for the generated output filenames
    :param batch_size: maximum number of reads per output file
    :param threads: upper bound on worker processes
    :param recursive: whether to search input_folder recursively
    :param file_list_file: optional file restricting which input fast5s
        are considered
    :raises ValueError: if the read list or input file list is empty, or
        batch_size/threads is not a positive integer
    """
    assert Path(input_folder).is_dir()
    assert Path(read_list_file).is_file()
    assert isinstance(filename_base, str)
    assert isinstance(batch_size, int)
    assert isinstance(threads, int)
    assert isinstance(recursive, bool)
    self.logger = logging.getLogger(self.__class__.__name__)
    self.read_set = get_filter_reads(read_list_file)
    self.input_f5s = get_fast5_file_list(str(input_folder), recursive)

    # Fail fast on degenerate input: batch_size < 1 would otherwise raise a
    # bare ZeroDivisionError in the ceil() below, and empty read/file sets
    # would silently configure a job that does no work.
    if len(self.read_set) < 1:
        raise ValueError(
            "No reads in read list file {}".format(read_list_file))
    if len(self.input_f5s) < 1:
        raise ValueError(
            "No input fast5 files found in {}. Recursion is set to {}".format(
                str(input_folder), recursive))
    if batch_size < 1:
        raise ValueError(
            "Batch size (--batch_size) must be a positive integer, not {}".format(
                batch_size))
    if threads < 1:
        raise ValueError(
            "Max number of threads (--threads) must be a positive integer, not {}".format(
                threads))

    if file_list_file:
        file_set = get_filter_reads(file_list_file)
        for file in file_set:
            assert Path(file).exists(
            ), "{} from file list doesn't exist".format(file)
        self.input_f5s = list(file_set.intersection(self.input_f5s))

    # determine max number of workers: never more than there are output
    # files or input files
    self.batch_size = batch_size
    num_outputs = int(ceil(len(self.read_set) / batch_size))
    self.num_workers = min(threads, min(num_outputs, len(self.input_f5s)))

    out_basename = Path(output_folder)
    if not out_basename.exists():
        out_basename.mkdir()
    self.filename_mapping_file = out_basename / "filename_mapping.txt"
    if self.filename_mapping_file.exists():
        self.logger.info("overwriting filename mapping file {}".format(
            self.filename_mapping_file))
        self.filename_mapping_file.unlink()

    # dict where key=filename value=read_set
    self.out_files = {}
    out_file_names = []
    for i in range(num_outputs):
        filename = filename_base + str(i) + ".fast5"
        output_file_name = out_basename / filename
        if output_file_name.exists():
            self.logger.info(
                "overwriting multiread file {}".format(output_file_name))
            output_file_name.unlink()
        self.out_files[output_file_name] = set()
        out_file_names.append(output_file_name)

    # reversing so that first item to be popped is lower idx
    self.available_out_files = out_file_names[::-1]

    self.tasks = []
    self.pool = None
    # progressbar total is number of reads in read_set plus number of input
    # files (to see progress while scanning files that don't have any
    # relevant reads)
    self.pbar = get_progress_bar(len(self.read_set) + len(self.input_f5s))
def __init__(self, input_folder, output_folder, read_list_file, filename_base,
             batch_size=4000, threads=1, recursive=False, file_list_file=None):
    """Prepare the state needed to subset reads into multi-read output files.

    Validates the arguments, pre-computes one output filename per batch of
    batch_size reads (deleting stale outputs from earlier runs), and
    initialises the worker-pool placeholders and progress bar.
    """
    assert path.isdir(input_folder)
    assert path.isfile(read_list_file)
    assert isinstance(filename_base, str)
    assert isinstance(batch_size, int)
    assert isinstance(threads, int)
    assert isinstance(recursive, bool)

    self.logger = logging.getLogger(self.__class__.__name__)
    self.read_set = get_filter_reads(read_list_file)
    self.input_f5s = get_fast5_file_list(str(input_folder), recursive)

    # Reject degenerate inputs up front with actionable messages.
    if not self.read_set:
        raise ValueError(
            "No reads in read list file {}".format(read_list_file))
    if not self.input_f5s:
        raise ValueError(
            "No input fast5 files found in {}. Recursion is set to {}".format(
                str(input_folder), recursive))
    if batch_size < 1:
        raise ValueError(
            "Batch size (--batch_size) must be a positive integer, not {}".format(
                batch_size))
    if threads < 1:
        raise ValueError(
            "Max number of threads (--threads) must be a positive integer, not {}".format(
                threads))

    if file_list_file:
        # Restrict the input set to the files named in file_list_file.
        listed_files = get_filter_reads(file_list_file)
        for f in listed_files:
            assert path.exists(f), "{} from file list doesn't exist".format(f)
        self.input_f5s = list(listed_files.intersection(self.input_f5s))

    # Cap the worker count: more workers than output files or input files
    # would sit idle.
    self.batch_size = batch_size
    num_outputs = int(ceil(len(self.read_set) / float(batch_size)))
    self.num_workers = min(threads, min(num_outputs, len(self.input_f5s)))

    if not path.exists(output_folder):
        mkdir(output_folder)

    self.filename_mapping_file = path.join(output_folder,
                                           "filename_mapping.txt")
    if path.exists(self.filename_mapping_file):
        self.logger.info("overwriting filename mapping file {}".format(
            self.filename_mapping_file))
        unlink(self.filename_mapping_file)

    # out_files maps each output filename to the set of reads routed to it.
    self.out_files = {}
    out_file_names = []
    for idx in range(num_outputs):
        out_name = path.join(output_folder,
                             filename_base + str(idx) + ".fast5")
        if path.exists(out_name):
            self.logger.info(
                "overwriting multiread file {}".format(out_name))
            unlink(out_name)
        self.out_files[out_name] = set()
        out_file_names.append(out_name)

    # Reversed so pop() hands out the lowest-index file first.
    self.available_out_files = list(reversed(out_file_names))

    self.tasks = []
    self.pool = None
    # Progress total = reads to extract + input files to scan, so files
    # containing none of the wanted reads still advance the bar.
    self.pbar = get_progress_bar(len(self.read_set) + len(self.input_f5s))