def build_read_index():
    logging.basicConfig(
        format='[%(asctime)s - %(name)s] %(message)s',
        datefmt='%H:%M:%S', level=logging.INFO)
    logger = logging.getLogger('Index Reads')
    parser = argparse.ArgumentParser(
        description='Build index of reads within .fast5s. Output to stdout.')
    parser.add_argument('input', help='.fast5 directory')
    parser.add_argument('--recursive', action='store_true',
        help='Search recursively under `input` for source files.')
    parser.add_argument('--workers', type=int, default=8,
        help='Number of worker processes.')
    args = parser.parse_args()

    src_files = list(iterate_fast5(
        args.input, paths=True, recursive=args.recursive))
    logger.info("Found {} files.".format(len(src_files)))

    with ProcessPoolExecutor(args.workers) as executor:
        n_reads = 0
        for i, (src, read_ids) in enumerate(zip(
                src_files,
                executor.map(reads_in_multi, src_files, chunksize=10))):
            n_reads += len(read_ids)
            for read in read_ids:
                print('\t'.join((read, os.path.abspath(src))))
            if i % 10 == 0:
                logger.info("Indexed {}/{} files. {} reads".format(
                    i, len(src_files), n_reads))
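
# Illustrative sketch, not part of the original module: `build_read_index`
# writes a two-column TSV of `read_id<TAB>absolute_file_path` to stdout. If
# that output is redirected to a file, it can be loaded back with nothing but
# the standard library. The function name and `index_path` argument are
# hypothetical.
def _example_load_read_index(index_path):
    """Return a dict mapping read_id -> source .fast5 path from an index file."""
    index = dict()
    with open(index_path) as fh:
        for line in fh:
            read_id, src_path = line.rstrip('\n').split('\t')
            index[read_id] = src_path
    return index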
def _fast5_filter(path, channels='all', recursive=False, limit=None,
                  channel_limit=None):
    """Yield .fast5 filehandles, optionally filtered by channel.

    :param path: input path.
    :param channels: one of 'all', 'even', or 'odd'.
    :param recursive: find .fast5 recursively below `path`.
    :param limit: maximum number of filehandles to yield.
    :param channel_limit: maximum number of filehandles to yield per channel.
    """
    allowed_channels = ('all', 'even', 'odd')
    if channels not in allowed_channels:
        raise ValueError(
            "'channels' option should be one of {}.".format(allowed_channels))

    def _odd_even_filter(base):
        for x in base:
            odd = bool(int(x.summary()['channel']) % 2)
            if (channels == 'odd' and odd) or (channels == 'even' and not odd):
                yield x

    def _chan_limit_filter(base):
        counters = Counter()
        for x in base:
            channel = int(x.summary()['channel'])
            if counters[channel] < channel_limit:
                counters[channel] += 1
                yield x

    def _valid_file(base):
        for fname in base:
            try:
                fh = Fast5(fname)
            except Exception as e:
                logger.warning('Could not open {}: {}.'.format(fname, e))
            else:
                yield fh

    gen = _valid_file(iterate_fast5(path, paths=True, recursive=recursive))
    if channels != 'all':
        gen = _odd_even_filter(gen)
    if channel_limit is not None:
        gen = _chan_limit_filter(gen)
    if limit is not None:
        gen = itertools.islice(gen, limit)
    yield from gen
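
# Illustrative sketch, not part of the original module: typical use of
# `_fast5_filter`, taking at most five reads per odd-numbered channel from a
# directory tree. The function name is hypothetical, and closing the yielded
# handles assumes they expose an h5py.File-style close().
def _example_filter_usage(path):
    """Count filehandles per odd channel found under `path`."""
    counts = Counter()
    for fh in _fast5_filter(path, channels='odd', recursive=True,
                            channel_limit=5):
        counts[int(fh.summary()['channel'])] += 1
        fh.close()  # assumption: handle is h5py-backed and closable
    return counts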
def filter_multi_reads():
    logging.basicConfig(
        format='[%(asctime)s - %(name)s] %(message)s',
        datefmt='%H:%M:%S', level=logging.INFO)
    logger = logging.getLogger('Filter')
    parser = argparse.ArgumentParser(
        description='Extract reads from multi-read .fast5 files.')
    parser.add_argument('input',
        help='Path to input multi-read .fast5 files (or list of files).')
    parser.add_argument('output', help='Output folder.')
    parser.add_argument('filter',
        help='A .tsv file with column `read_id` defining required reads. '
             'If a `filename` column is present, this will be used as the '
             'location of the read.')
    parser.add_argument('--tsv_field', default='read_id',
        help='Field name from `filter` file to obtain read IDs.')
    parser.add_argument('--prefix', default="", help='Read file prefix.')
    parser.add_argument('--recursive', action='store_true',
        help='Search recursively under `input` for source files.')
    parser.add_argument('--workers', type=int, default=4,
        help='Number of worker processes.')
    out_format = parser.add_mutually_exclusive_group()
    out_format.add_argument('--multi', action='store_true', default=True,
        help='Output multi-read files.')
    out_format.add_argument('--single', action='store_false', dest='multi',
        help='Output single-read files.')
    #parser.add_argument('--limit', type=int, default=None,
    #    help='Limit reads per channel.')
    args = parser.parse_args()

    if not args.multi:
        raise NotImplementedError(
            'Extraction of reads to single read files is on the TODO list.')

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    else:
        raise IOError('The output directory must not exist.')

    # grab list of source files
    logger.info("Searching for input files.")
    try:
        src_files = list(set(readtsv(args.input)['filename']))
    except Exception as e:
        logger.info(
            'Failed to read `input` as filelist, assuming path to search. '
            '{}'.format(e))
        src_files = list(iterate_fast5(
            args.input, paths=True, recursive=args.recursive))
    n_files = len(src_files)
    logger.info("Found {} source files.".format(n_files))

    logger.info("Reading filter file.")
    read_table = readtsv(args.filter, fields=[args.tsv_field])
    logger.info("Found {} reads in filter.".format(len(read_table)))

    try:
        # try to build index from the filter file with 'filename' column
        if 'filename' not in read_table.dtype.names:
            raise ValueError("'filename' column not present in filter.")
        logger.info("Attempting to build read index from input filter.")
        src_path_files = {os.path.basename(x): x for x in src_files}
        if len(src_path_files) != len(src_files):
            raise ValueError('Found non-uniquely named source files')
        read_index = dict()
        for fname, indices in group_vector(read_table['filename']).items():
            fpath = src_path_files[os.path.basename(fname)]
            read_index[fpath] = read_table[args.tsv_field][indices]
        logger.info("Successfully built read index from input filter.")
    except Exception as e:
        logger.info("Failed to build read index from filter: {}".format(e))
        read_index = None

    if read_index is None:
        # the filter gave no usable file locations, index the source files
        required_reads = set(read_table[args.tsv_field])
        logger.info("Finding reads within {} source files.".format(n_files))
        index_worker = functools.partial(reads_in_multi, filt=required_reads)
        read_index = dict()
        n_reads = 0
        with ProcessPoolExecutor(args.workers) as executor:
            i = 0
            for src_file, read_ids in zip(
                    src_files,
                    executor.map(index_worker, src_files, chunksize=10)):
                i += 1
                n_reads += len(read_ids)
                read_index[src_file] = read_ids
                if i % 10 == 0:
                    logger.info("Indexed {}/{} files. {}/{} reads".format(
                        i, n_files, n_reads, len(required_reads)))

    n_reads = sum(len(x) for x in read_index.values())

    # We don't go via creating Read objects; copying the data verbatim is
    # likely quicker and nothing should need the verification that the APIs
    # provide (garbage in, garbage out).
    logger.info("Extracting {} reads.".format(n_reads))
    if args.prefix != '':
        args.prefix = '{}_'.format(args.prefix)

    with ProcessPoolExecutor(args.workers) as executor:
        # partition source files across workers, aiming for a roughly equal
        # number of reads per worker
        reads_per_process = np.ceil(n_reads / args.workers)
        proc_n_reads = 0
        proc_reads = dict()
        job = 0
        futures = list()
        for src in read_index.keys():
            proc_reads[src] = read_index[src]
            proc_n_reads += len(proc_reads[src])
            if proc_n_reads > reads_per_process:
                proc_prefix = "{}{}_".format(args.prefix, job)
                futures.append(executor.submit(
                    _subset_reads_to_file, proc_reads, args.output,
                    proc_prefix, worker_id=job))
                job += 1
                proc_n_reads = 0
                proc_reads = dict()
        if proc_n_reads > 0:  # process remaining reads
            proc_prefix = "{}{}_".format(args.prefix, job)
            futures.append(executor.submit(
                _subset_reads_to_file, proc_reads, args.output,
                proc_prefix, worker_id=job))

        for fut in as_completed(futures):
            try:
                reads_written, prefix = fut.result()
                logger.info("Written {} reads to {}.".format(
                    reads_written, prefix))
            except Exception as e:
                logger.warning("Error: {}".format(e))

    logger.info("Done.")