def batch_convert_single_to_multi(input_path, output_folder, filename_base,
                                  batch_size, threads, recursive,
                                  follow_symlinks, target_compression):
    pool = Pool(threads)
    file_list = get_fast5_file_list(input_path, recursive, follow_symlinks)
    pbar = get_progress_bar(int((len(file_list) + batch_size - 1) / batch_size))

    def update(result):
        output_file = result[1]
        with open(os.path.join(output_folder, "filename_mapping.txt"), 'a') as output_table:
            for filename in result[0]:
                output_table.write("{}\t{}\n".format(filename, output_file))
        pbar.update(pbar.currval + 1)

    results_array = []
    os.makedirs(output_folder, exist_ok=True)
    for batch_num, batch in enumerate(batcher(file_list, batch_size)):
        output_file = os.path.join(output_folder,
                                   "{}_{}.fast5".format(filename_base, batch_num))
        results_array.append(
            pool.apply_async(create_multi_read_file,
                             args=(batch, output_file, target_compression),
                             callback=update))
    pool.close()
    pool.join()
    pbar.finish()
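# `batcher` is called above (and again in the later variant of this function)
# but isn't defined in this section. A minimal sketch consistent with its call
# sites -- a list sliced into fixed-size chunks -- is given below; the real
# helper may be implemented differently.
def batcher(iterable, batch_size):
    """Yield successive batch_size-sized chunks of iterable (illustrative sketch)."""
    for start in range(0, len(iterable), batch_size):
        yield iterable[start:start + batch_size]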
def batch_convert_multi_files_to_single(input_path, output_folder, threads, recursive):
    pool = Pool(threads)
    file_list = get_fast5_file_list(input_path, recursive)
    pbar = get_progress_bar(len(file_list))

    def update(results):
        output_file = os.path.basename(results.popleft())
        with open(os.path.join(output_folder, "filename_mapping.txt"), 'a') as output_table:
            for filename in results:
                output_table.write("{}\t{}\n".format(output_file, filename))
        pbar.update(pbar.currval + 1)

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    results_array = []
    for batch_num, filename in enumerate(file_list):
        results_array.append(
            pool.apply_async(convert_multi_to_single,
                             args=(filename, output_folder, str(batch_num)),
                             callback=update))
    pool.close()
    pool.join()
    pbar.finish()
def compress_batch(input_folder, output_folder, target_compression,
                   recursive=True, threads=1, follow_symlinks=True):
    # We require an absolute input path so we can replicate the data structure
    # relative to it later on
    input_folder = os.path.abspath(input_folder)

    file_list = get_fast5_file_list(input_folder, recursive,
                                    follow_symlinks=follow_symlinks)
    if len(file_list) == 0:
        raise ValueError("No input fast5 files found in '{}'. "
                         "Recursive={}".format(input_folder, recursive))

    # Set up the process pool and the progressbar
    pool = Pool(min(threads, len(file_list)))
    pbar = get_progress_bar(len(file_list))

    def update(result):
        pbar.update(pbar.currval + 1)

    for input_file in file_list:
        input_path = os.path.join(input_folder, input_file)
        output_path = os.path.join(output_folder,
                                   os.path.relpath(input_path, input_folder))
        pool.apply_async(func=compress_file,
                         args=(input_path, output_path, target_compression),
                         callback=update)

    # Tear down the process pool and pbar. We can't use contextmanagers since
    # we need to close() then join()
    pool.close()
    pool.join()
    pbar.finish()
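# A hedged usage sketch for compress_batch: re-compress a directory tree of
# fast5 files with VBZ. The paths are illustrative; the VBZ import assumes the
# ont_fast5_api compression targets are in use, as in the test further below.
from ont_fast5_api.compression_settings import VBZ

compress_batch(input_folder="reads/",        # scanned recursively by default
               output_folder="reads_vbz/",   # input tree structure is replicated here
               target_compression=VBZ,
               threads=4)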
def batch_convert_single_to_multi(input_path, output_folder, filename_base,
                                  batch_size, threads, recursive):
    pool = Pool(threads)
    file_list = get_fast5_file_list(input_path, recursive)
    pbar = get_progress_bar(int((len(file_list) + batch_size - 1) / batch_size))

    def update(results):
        output_file = os.path.basename(results.popleft())
        # Append here: opening with 'w' would truncate the header and any rows
        # written by previously completed batches.
        with open(os.path.join(output_folder, "filename_mapping.txt"), 'a') as output_table:
            for filename in results:
                output_table.write("{}\t{}\n".format(filename, output_file))
        pbar.update(pbar.currval + 1)

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    with open(os.path.join(output_folder, "filename_mapping.txt"), 'w') as output_table:
        output_table.write("single_read_file\tmulti_read_file\n")

    for batch_num, batch in enumerate(batcher(file_list, batch_size)):
        output_file = os.path.join(output_folder,
                                   "{}_{}.fast5".format(filename_base, batch_num))
        pool.apply_async(create_multi_read_file, args=(batch, output_file),
                         callback=update)
    pool.close()
    pool.join()
    pbar.finish()
def hdf_to_sam(args):
    """Entry point for converting guppy methylcalled fast5s to sam."""
    sys.stdout.write('\t'.join(('@HD', 'VN:1.5', 'SO:unsorted')))
    sys.stdout.write('\n')
    for name, seq, _ in mappy.fastx_read(args.reference, read_comment=False):
        sys.stdout.write('@SQ\tSN:{}\tLN:{}\n'.format(name, len(seq)))

    fast5s = get_fast5_file_list(args.path, recursive=args.recursive)
    worker = functools.partial(hdf_to_sam_worker, args.reference)
    with ProcessPoolExecutor(max_workers=args.workers) as executor:
        for res in executor.map(worker, fast5s):
            for r in res:
                sys.stdout.write('{}\n'.format(r))
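# hdf_to_sam expects an argparse-style namespace. A minimal illustrative
# driver follows; the attribute names (reference, path, recursive, workers)
# are taken from the usages above, everything else is assumed.
import argparse

parser = argparse.ArgumentParser(description="Convert methylation-called fast5s to SAM.")
parser.add_argument("reference", help="reference fasta used for alignment")
parser.add_argument("path", help="directory containing fast5 files")
parser.add_argument("--recursive", action="store_true")
parser.add_argument("--workers", type=int, default=1)

if __name__ == "__main__":
    hdf_to_sam(parser.parse_args())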
def _run(self):
    """Iterate over input files and store results in the internal queue."""
    fast5s = get_fast5_file_list(self.path, recursive=self.recursive)
    self.total_files = len(fast5s)
    self.logger.info("Found {} files to process.".format(self.total_files))
    # Note: the first positional argument of ProcessPoolExecutor *is*
    # max_workers, so passing self.workers both positionally and by keyword
    # would raise a TypeError.
    with ProcessPoolExecutor(max_workers=self.workers) as executor:
        for fname in fast5s:
            # Simple backpressure: don't submit more work while the results
            # queue is full.
            while True:
                if self.queue.qsize() < self.max_size:
                    future = executor.submit(self.extractor, fname)
                    future.add_done_callback(self._store)
                    break
                else:
                    time.sleep(1)
    # Sentinel marking the end of the stream for consumers.
    self.queue.put(None)
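# The producer above bounds the queue at max_size and ends the stream with a
# None sentinel. A consumer on the other side of self.queue might drain it
# like this (a sketch, assuming self.queue is a standard queue.Queue that
# self._store fills with completed results):
def results(self):
    """Yield extracted results until the None sentinel is seen (illustrative)."""
    while True:
        item = self.queue.get()
        if item is None:  # producer has finished submitting every file
            return
        yield item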
def test_conversion_script_multi(self, mock_pbar):
    input_folder = os.path.join(test_data, 'multi_read')
    compress_batch(input_folder=input_folder, output_folder=self.save_path,
                   target_compression=VBZ)

    count_files = 0
    count_reads = 0
    for out_file in get_fast5_file_list(self.save_path, recursive=True,
                                        follow_symlinks=True):
        count_files += 1
        with get_fast5_file(out_file) as f5:
            self.assertTrue(isinstance(f5, MultiFast5File))
            for read in f5.get_reads():
                self.assertCompressed(read)
                count_reads += 1
    self.assertEqual(1, count_files)
    self.assertEqual(4, count_reads)
def batch_reverter(input_path, output_folder, filename_base, batch_size,
                   threads, recursive,
                   keys=set(('Raw', 'channel_id', 'context_tags', 'tracking_id'))):
    # NB: `threads` is accepted but currently unused by this implementation.
    # make sure the output dir doesn't exist
    if os.path.exists(output_folder):
        sys.stderr.write("Directory exists: %s\n" % output_folder)
        sys.exit(1)
    os.makedirs(output_folder)
    # get files to process - in reverse order, since fail is typically before pass
    file_list = get_fast5_file_list(input_path, recursive)
    file_list = file_list[::-1]
    print("%s files to process..." % len(file_list))
    fi, ri = 0, -1
    output_f5 = None
    for i, input_file in enumerate(file_list, 1):
        with MultiFast5File(input_file, 'r') as input_f5:
            for ri, read in enumerate(input_f5.get_read_ids(), ri + 1):
                if not ri % 100:
                    sys.stderr.write(" %s %s %s %s \r" % (fi, ri, read, input_file))
                if not ri % batch_size:
                    # close the previous batch before starting a new one
                    if output_f5 is not None:
                        output_f5.close()
                    output_f5 = MultiFast5File(
                        os.path.join(output_folder,
                                     "%s_%s.fast5" % (filename_base, fi)), 'w')
                    fi += 1
                # copy group to new file
                read_name = "read_" + read
                group = input_f5.handle[read_name]
                output_f5.handle.copy(group, read_name)
                # and remove additional info; materialise the key view first,
                # since deleting while iterating over it is an error
                reverted_group = output_f5.handle[read_name]
                for k in list(reverted_group.keys()):
                    if k not in keys:
                        del reverted_group[k]
    if output_f5 is not None:
        output_f5.close()
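# A hedged invocation sketch for batch_reverter; the paths and batch size are
# illustrative, not taken from the source.
batch_reverter(input_path="basecalled/", output_folder="reverted/",
               filename_base="batch", batch_size=4000,
               threads=1, recursive=True)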
def __init__(self, input_folder, output_folder, read_list_file, filename_base,
             batch_size=4000, threads=1, recursive=False, file_list_file=None):
    assert Path(input_folder).is_dir()
    assert Path(read_list_file).is_file()
    assert isinstance(filename_base, str)
    assert isinstance(batch_size, int)
    assert isinstance(threads, int)
    assert isinstance(recursive, bool)

    self.logger = logging.getLogger(self.__class__.__name__)
    self.read_set = get_filter_reads(read_list_file)
    self.input_f5s = get_fast5_file_list(str(input_folder), recursive)

    if file_list_file:
        file_set = get_filter_reads(file_list_file)
        for file in file_set:
            assert Path(file).exists(), "{} from file list doesn't exist".format(file)
        self.input_f5s = list(file_set.intersection(self.input_f5s))

    # determine max number of workers
    self.batch_size = batch_size
    num_outputs = int(ceil(len(self.read_set) / batch_size))
    self.num_workers = min(threads, min(num_outputs, len(self.input_f5s)))

    out_basename = Path(output_folder)
    if not out_basename.exists():
        out_basename.mkdir()
    self.filename_mapping_file = out_basename / "filename_mapping.txt"
    if self.filename_mapping_file.exists():
        self.logger.info("overwriting filename mapping file {}".format(
            self.filename_mapping_file))
        self.filename_mapping_file.unlink()

    # dict where key=filename value=read_set
    self.out_files = {}
    out_file_names = []
    for i in range(num_outputs):
        filename = filename_base + str(i) + ".fast5"
        output_file_name = out_basename / filename
        if output_file_name.exists():
            self.logger.info("overwriting multiread file {}".format(output_file_name))
            output_file_name.unlink()
        self.out_files[output_file_name] = set()
        out_file_names.append(output_file_name)

    # reversing so that first item to be popped is lower idx
    self.available_out_files = out_file_names[::-1]

    self.tasks = []
    self.pool = None
    # progressbar total is number of reads in read_set plus number of input files
    # (to see progress while scanning files that don't have any relevant reads)
    self.pbar = get_progress_bar(len(self.read_set) + len(self.input_f5s))
def __init__(self, input_folder, output_folder, read_list_file, filename_base,
             batch_size=4000, threads=1, recursive=False, file_list_file=None):
    assert path.isdir(input_folder)
    assert path.isfile(read_list_file)
    assert isinstance(filename_base, str)
    assert isinstance(batch_size, int)
    assert isinstance(threads, int)
    assert isinstance(recursive, bool)

    self.logger = logging.getLogger(self.__class__.__name__)
    self.read_set = get_filter_reads(read_list_file)
    self.input_f5s = get_fast5_file_list(str(input_folder), recursive)

    if len(self.read_set) < 1:
        raise ValueError("No reads in read list file {}".format(read_list_file))
    if len(self.input_f5s) < 1:
        raise ValueError("No input fast5 files found in {}. Recursion is set "
                         "to {}".format(str(input_folder), recursive))
    if batch_size < 1:
        raise ValueError("Batch size (--batch_size) must be a positive integer, "
                         "not {}".format(batch_size))
    if threads < 1:
        raise ValueError("Max number of threads (--threads) must be a positive "
                         "integer, not {}".format(threads))

    if file_list_file:
        file_set = get_filter_reads(file_list_file)
        for file in file_set:
            assert path.exists(file), "{} from file list doesn't exist".format(file)
        self.input_f5s = list(file_set.intersection(self.input_f5s))

    # determine max number of workers
    self.batch_size = batch_size
    num_outputs = int(ceil(len(self.read_set) / float(batch_size)))
    self.num_workers = min(threads, min(num_outputs, len(self.input_f5s)))

    if not path.exists(output_folder):
        mkdir(output_folder)
    self.filename_mapping_file = path.join(output_folder, "filename_mapping.txt")
    if path.exists(self.filename_mapping_file):
        self.logger.info("overwriting filename mapping file {}".format(
            self.filename_mapping_file))
        unlink(self.filename_mapping_file)

    # dict where key=filename value=read_set
    self.out_files = {}
    out_file_names = []
    for i in range(num_outputs):
        filename = filename_base + str(i) + ".fast5"
        output_file_name = path.join(output_folder, filename)
        if path.exists(output_file_name):
            self.logger.info("overwriting multiread file {}".format(output_file_name))
            unlink(output_file_name)
        self.out_files[output_file_name] = set()
        out_file_names.append(output_file_name)

    # reversing so that first item to be popped is lower idx
    self.available_out_files = out_file_names[::-1]

    self.tasks = []
    self.pool = None
    # progressbar total is number of reads in read_set plus number of input files
    # (to see progress while scanning files that don't have any relevant reads)
    self.pbar = get_progress_bar(len(self.read_set) + len(self.input_f5s))
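# get_filter_reads is used above for both the read list and the file list but
# isn't shown in this section. A minimal sketch consistent with those call
# sites (one identifier per line, returned as a set) follows; the real helper
# may also skip a header line.
def get_filter_reads(list_file):
    """Read whitespace-separated identifiers from list_file into a set (sketch)."""
    with open(list_file) as handle:
        return {token for line in handle for token in line.split()}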
def setUp(self):
    self.files = get_fast5_file_list(self.read_dir, recursive=False)