Example #1
import argparse
import logging
import os

from concurrent.futures import ProcessPoolExecutor

# `iterate_fast5` and `reads_in_multi` are helpers from the surrounding
# package and are assumed to be imported at module level.


def build_read_index():
    logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s',
                        datefmt='%H:%M:%S',
                        level=logging.INFO)
    logger = logging.getLogger('Index Reads')

    parser = argparse.ArgumentParser(
        description='Build index of reads within .fast5s. Output to stdout.')
    parser.add_argument('input', help='.fast5 directory')
    parser.add_argument(
        '--recursive',
        action='store_true',
        help='Search recursively under `input` for source files.')
    parser.add_argument('--workers',
                        type=int,
                        default=8,
                        help='Number of worker processes.')
    args = parser.parse_args()

    src_files = list(
        iterate_fast5(args.input, paths=True, recursive=args.recursive))
    logger.info("Found {} files.".format(len(src_files)))

    with ProcessPoolExecutor(args.workers) as executor:
        n_reads = 0
        for i, (src, read_ids) in enumerate(
                zip(src_files,
                    executor.map(reads_in_multi, src_files, chunksize=10))):
            n_reads += len(read_ids)
            for read in read_ids:
                print('\t'.join((read, os.path.abspath(src))))
            if i % 10 == 0:
                logger.info("Indexed {}/{} files. {} reads".format(
                    i, len(src_files), n_reads))
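The index is written to stdout as tab-separated `read_id<TAB>absolute_path` lines. A minimal sketch of loading that output back into a dict, assuming it was redirected to a file (the `index_reads.tsv` name is hypothetical):

def load_read_index(path='index_reads.tsv'):
    # Each line of the index is '<read_id>\t<absolute source path>'.
    index = {}
    with open(path) as fh:
        for line in fh:
            read_id, src = line.rstrip('\n').split('\t')
            index[read_id] = src
    return index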
Example #2
import itertools
import logging

from collections import Counter

# `Fast5` and `iterate_fast5` are helpers from the surrounding package;
# `logger` is the module-level logger used below.
logger = logging.getLogger(__name__)


def _fast5_filter(path,
                  channels='all',
                  recursive=False,
                  limit=None,
                  channel_limit=None):
    """Yield .fast5 filehandles, optionally filtered by channel.

    :param path: input path.
    :param channels: one of 'all', 'even', 'odd'.
    :param recursive: find .fast5 recursively below `path`.
    :param limit: maximum number of filehandles to yield.
    :param channel_limit: maximum number of filehandles to yield per channel.
    """
    allowed_channels = ('all', 'even', 'odd')
    if channels not in allowed_channels:
        raise ValueError(
            "'channels' option should be one of {}.".format(allowed_channels))

    def _odd_even_filter(base):
        for x in base:
            odd = bool(int(x.summary()['channel']) % 2)
            if (channels == 'odd' and odd) or (channels == 'even' and not odd):
                yield x

    def _chan_limit_filter(base):
        counters = Counter()
        for x in base:
            channel = int(x.summary()['channel'])
            if counters[channel] < channel_limit:
                counters[channel] += 1
                yield x

    def _valid_file(base):
        for fname in base:
            try:
                fh = Fast5(fname)
            except Exception as e:
                logger.warning('Could not open {}: {}.'.format(fname, e))
            else:
                yield fh

    gen = _valid_file(iterate_fast5(path, paths=True, recursive=recursive))
    if channels != 'all':
        gen = _odd_even_filter(gen)
    if channel_limit is not None:
        gen = _chan_limit_filter(gen)
    if limit is not None:
        gen = itertools.islice(gen, limit)
    yield from gen
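A minimal usage sketch of the generator above, assuming it is called from inside the same module; the './reads' path and the limits are placeholders:

# Take at most 5 reads per odd-numbered channel, and at most 100 overall.
for fh in _fast5_filter('./reads', channels='odd', recursive=True,
                        limit=100, channel_limit=5):
    print(fh.summary()['channel'])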
Example #3
import argparse
import functools
import logging
import os

from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np

# `iterate_fast5`, `readtsv`, `group_vector`, `reads_in_multi` and
# `_subset_reads_to_file` are helpers from the surrounding package.


def filter_multi_reads():
    logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s',
                        datefmt='%H:%M:%S',
                        level=logging.INFO)
    logger = logging.getLogger('Filter')
    parser = argparse.ArgumentParser(
        description='Extract reads from multi-read .fast5 files.')
    parser.add_argument(
        'input',
        help='Path to input multi-read .fast5 files (or list of files).')
    parser.add_argument('output', help='Output folder.')
    parser.add_argument(
        'filter',
        help='A .tsv file with column `read_id` defining required reads. '
        'If a `filename` column is present, this will be used as the '
        'location of the read.')
    parser.add_argument(
        '--tsv_field',
        default='read_id',
        help='Field name from `filter` file to obtain read IDs.')
    parser.add_argument('--prefix', default="", help='Read file prefix.')
    parser.add_argument(
        '--recursive',
        action='store_true',
        help='Search recursively under `input` for source files.')
    parser.add_argument('--workers',
                        type=int,
                        default=4,
                        help='Number of worker processes.')

    out_format = parser.add_mutually_exclusive_group()
    out_format.add_argument('--multi',
                            action='store_true',
                            default=True,
                            help='Output multi-read files.')
    out_format.add_argument('--single',
                            action='store_false',
                            dest='multi',
                            help='Output single-read files.')

    #parser.add_argument('--limit', type=int, default=None, help='Limit reads per channel.')
    args = parser.parse_args()

    if not args.multi:
        raise NotImplementedError(
            'Extraction of reads to single read files is on the TODO list.')

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    else:
        raise IOError('The output directory must not exist.')

    # grab list of source files
    logger.info("Searching for input files.")
    try:
        src_files = list(set(readtsv(args.input)['filename']))
    except Exception as e:
        logger.info(
            'Failed to read `input` as filelist, assuming path to search. {}'.
            format(e))
        src_files = list(
            iterate_fast5(args.input, paths=True, recursive=args.recursive))
    n_files = len(src_files)
    logger.info("Found {} source files.".format(n_files))

    logger.info("Reading filter file.")
    read_table = readtsv(args.filter, fields=[args.tsv_field])
    logger.info("Found {} reads in filter.".format(len(read_table)))

    try:
        # try to build index from the filter file with 'filename' column
        if 'filename' not in read_table.dtype.names:
            raise ValueError("'filename' column not present in filter.")
        logger.info("Attempting to build read index from input filter.")
        src_path_files = {os.path.basename(x): x for x in src_files}
        if len(src_path_files) != len(src_files):
            raise ValueError('Found non-uniquely named source files')
        read_index = dict()
        for fname, indices in group_vector(read_table['filename']).items():
            fpath = src_path_files[os.path.basename(fname)]
            read_index[fpath] = read_table[args.tsv_field][indices]
        logger.info("Successfully build read index from input filter.")
    except Exception as e:
        logger.info("Failed to build read index from summary: {}".format(e))
        read_index = None
        required_reads = set(read_table[args.tsv_field])
        logger.info("Finding reads within {} source files.".format(n_files))
        index_worker = functools.partial(reads_in_multi, filt=required_reads)
        read_index = dict()
        n_reads = 0
        with ProcessPoolExecutor(args.workers) as executor:
            i = 0
            for src_file, read_ids in zip(
                    src_files,
                    executor.map(index_worker, src_files, chunksize=10)):
                i += 1
                n_reads += len(read_ids)
                read_index[src_file] = read_ids
                if i % 10 == 0:
                    logger.info("Indexed {}/{} files. {}/{} reads".format(
                        i, n_files, n_reads, len(required_reads)))

    n_reads = sum(len(x) for x in read_index.values())
    # We don't go via creating Read objects; copying the data verbatim is
    # likely quicker, and nothing should need the verification that the APIs
    # provide (garbage in, garbage out).
    logger.info("Extracting {} reads.".format(n_reads))
    if args.prefix != '':
        args.prefix = '{}_'.format(args.prefix)

    with ProcessPoolExecutor(args.workers) as executor:
        reads_per_process = np.ceil(n_reads / args.workers)
        proc_n_reads = 0
        proc_reads = dict()
        job = 0
        futures = list()
        for src in read_index.keys():
            proc_reads[src] = read_index[src]
            proc_n_reads += len(proc_reads[src])
            if proc_n_reads > reads_per_process:
                proc_prefix = "{}{}_".format(args.prefix, job)
                futures.append(
                    executor.submit(_subset_reads_to_file,
                                    proc_reads,
                                    args.output,
                                    proc_prefix,
                                    worker_id=job))
                job += 1
                proc_n_reads = 0
                proc_reads = dict()
        if proc_n_reads > 0:  # process any remaining reads
            proc_prefix = "{}{}_".format(args.prefix, job)
            futures.append(
                executor.submit(_subset_reads_to_file,
                                proc_reads,
                                args.output,
                                proc_prefix,
                                worker_id=job))

        for fut in as_completed(futures):
            try:
                reads_written, prefix = fut.result()
                logger.info("Written {} reads to {}.".format(
                    reads_written, prefix))
            except Exception as e:
                logger.warning("Error: {}".format(e))
    logger.info("Done.")