Example #1
0
def basecall(model,
             reads,
             chunksize=4000,
             overlap=500,
             batchsize=32,
             reverse=False):
    """
    Build a lazy basecalling pipeline over `reads`.

    Every read is cut into pieces with `split_read` (called with
    `chunksize * batchsize` as its second argument; the piece order is
    flipped when `reverse` is truthy).  Each piece is chunked, scored in
    batches by `compute_scores`, stitched back together, passed through
    `transfer` on a single worker thread and finally regrouped per read.

    Returns a generator of `(read, concat(parts))` tuples, where `parts`
    are the stitched results of consecutive pieces sharing the same read.
    """
    # Slice step applied to the list of pieces of each read.
    step = -1 if reverse else 1

    pieces = (
        piece
        for read in reads
        for piece in split_read(read, chunksize * batchsize)[::step]
    )
    chunked = (
        ((read, lo, hi),
         chunk(torch.from_numpy(read.signal[lo:hi]), chunksize, overlap))
        for read, lo, hi in pieces
    )
    scored = (
        (key, compute_scores(model, batch, reverse=reverse))
        for key, batch in batchify(chunked, batchsize=batchsize)
    )
    stitched = (
        (read,
         stitch(piece_scores, chunksize, overlap, hi - lo, model.stride,
                reverse=reverse))
        for (read, lo, hi), piece_scores in unbatchify(scored)
    )
    moved = thread_map(transfer, stitched, n_thread=1)

    def read_of(item):
        # Group consecutive (read, part) tuples by their read.
        return item[0]

    return (
        (read, concat([part for _, part in group]))
        for read, group in groupby(moved, read_of)
    )
Example #2
0
def basecall(model,
             reads,
             chunksize=4000,
             overlap=100,
             batchsize=32,
             reverse=False):
    """
    Basecalls a set of reads.

    The work is expressed as a chain of lazy stages, each wrapped in
    `thread_iter`: chunk every read's signal, group chunks into batches,
    score each batch with `compute_scores`, stitch the per-read scores with
    `stitch_results`, and finally run `apply_stride_to_moves` on the
    stitched attributes.

    Returns the `thread_iter` over `(read, attrs)` pairs of the last stage.
    """
    # Each item is keyed by (read, start, end); whole reads span 0..len(signal).
    chunked = thread_iter(
        ((read, 0, len(read.signal)),
         chunk(torch.from_numpy(read.signal), chunksize, overlap))
        for read in reads
    )

    batched = thread_iter(batchify(chunked, batchsize=batchsize))

    scored = thread_iter(
        (key, compute_scores(model, batch, reverse=reverse))
        for key, batch in batched
    )

    stitched = thread_iter(
        (read,
         stitch_results(read_scores, stop - begin, chunksize, overlap,
                        model.stride, reverse))
        for (read, begin, stop), read_scores in unbatchify(scored)
    )

    return thread_iter(
        (read, apply_stride_to_moves(model, attrs))
        for read, attrs in stitched
    )
Example #3
0
def main(args):
    """
    Load a model, basecall every read from `args.reads_directory` and report
    throughput statistics on stderr.

    Reads are pulled from `reader.queue` until a `None` sentinel arrives.
    Reads whose signal is longer than 4e6 samples are skipped with a message;
    every other read is chunked, run through the model, stitched and handed
    to the writer queue.
    """

    def log(message):
        # All progress/status output goes to stderr.
        sys.stderr.write(message)

    log("> loading model\n")

    model = load_model(
        args.model_directory, args.device, weights=int(args.weights),
        half=args.half, chunksize=args.chunksize, use_rt=args.cudart,
    )

    total_samples = 0
    called_reads = 0
    max_read_size = 4e6
    if args.half:
        dtype = np.float16
    else:
        dtype = np.float32
    reader = PreprocessReader(args.reads_directory)
    writer = DecoderWriterPool(
        model, beamsize=args.beamsize, fastq=args.fastq,
        reference=args.reference,
    )

    started = time.perf_counter()
    log("> calling\n")

    with writer, reader, torch.no_grad():

        read = reader.queue.get()
        while read is not None:
            n_samples = len(read.signal)

            if n_samples > max_read_size:
                log("> skipping long read %s (%s samples)\n" %
                    (read.read_id, n_samples))
            else:
                called_reads += 1
                total_samples += n_samples

                raw_data = torch.tensor(read.signal.astype(dtype))
                chunks = chunk(raw_data, args.chunksize, args.overlap)

                on_device = chunks.to(args.device)
                posteriors = model(on_device).cpu().numpy()
                # Trim half of the (stride-scaled) overlap when stitching.
                posteriors = stitch(posteriors,
                                    args.overlap // model.stride // 2)

                writer.queue.put((read, posteriors[:raw_data.shape[0]]))

            read = reader.queue.get()

    duration = time.perf_counter() - started

    log("> completed reads: %s\n" % called_reads)
    log("> duration: %s\n" % timedelta(seconds=np.round(duration)))
    log("> samples per second %.1E\n" % (total_samples / duration))
    log("> done\n")
Example #4
0
def basecall(model, reads, beamsize=5, chunksize=0, overlap=0, batchsize=1, qscores=False, reverse=None):
    """
    Basecalls a set of reads.

    Each read's signal is chunked, scored in batches with `compute_scores`,
    stitched into a `{'scores': ...}` dict and decoded through `process_map`
    using 4 processes.  The `reverse` argument is accepted but not used by
    this implementation.

    Returns whatever `process_map` yields for the decoded reads.
    """
    chunked = (
        (read, chunk(torch.tensor(read.signal), chunksize, overlap))
        for read in reads
    )
    batched = batchify(chunked, batchsize)
    batch_scores = (
        (key, compute_scores(model, batch)) for key, batch in batched
    )
    per_read = unbatchify(batch_scores)
    stitched = (
        (read, {'scores': stitch(raw, chunksize, overlap, len(read.signal), model.stride)})
        for read, raw in per_read
    )
    decoder = partial(
        decode,
        decode=model.decode,
        beamsize=beamsize,
        qscores=qscores,
        stride=model.stride,
    )
    return process_map(decoder, stitched, n_proc=4)
Example #5
0
def basecall(model,
             reads,
             aligner=None,
             beamsize=40,
             chunksize=4000,
             overlap=500,
             batchsize=32,
             qscores=False,
             reverse=False):
    """
    Basecalls a set of reads.

    Reads are cut into pieces by `split_read` (piece order flipped when
    `reverse` is truthy), chunked, scored in batches, quantised with
    `quantise_int8`, stitched, transferred on one thread and decoded with
    `decode_int8` on eight threads.  Decoded pieces are then grouped by
    their `parent` attribute (or by the piece itself when it has none) and
    their sequences are joined.

    Returns a generator of `(read, result_dict)` pairs, or the result of
    `align_map(aligner, ...)` over that generator when `aligner` is truthy.
    """
    decoder = partial(decode_int8, seqdist=model.seqdist, beamsize=beamsize)
    step = -1 if reverse else 1

    pieces = (
        piece
        for read in reads
        for piece in split_read(read)[::step]
    )
    chunked = (
        ((read, lo, hi),
         chunk(torch.from_numpy(read.signal[lo:hi]), chunksize, overlap))
        for read, lo, hi in pieces
    )
    scored = (
        (key, quantise_int8(compute_scores(model, batch, reverse=reverse)))
        for key, batch in thread_iter(batchify(chunked, batchsize=batchsize))
    )
    stitched = (
        (read,
         stitch(piece_scores, chunksize, overlap, hi - lo, model.stride,
                reverse=reverse))
        for (read, lo, hi), piece_scores in unbatchify(scored)
    )

    moved = thread_map(transfer, stitched, n_thread=1)
    decoded = thread_map(decoder, moved, n_thread=8)

    def owner(item):
        # Pieces of a split read are grouped under their parent read.
        piece = item[0]
        if hasattr(piece, 'parent'):
            return piece.parent
        return piece

    sequences = (
        (read, ''.join(seq for _, seq in group))
        for read, group in groupby(decoded, owner)
    )
    results = (
        (read, {
            'sequence': seq,
            # Placeholder quality string: one '?' per base when requested.
            'qstring': ('?' * len(seq)) if qscores else '*',
            'mean_qscore': 0.0,
        })
        for read, seq in sequences
    )

    if aligner:
        return align_map(aligner, results)
    return results
Example #6
0
def basecall(model, reads, aligner=None, beamsize=40, chunksize=4000, overlap=500, batchsize=32, qscores=False,
             split_read_length=400000):
    """
    Basecalls a set of reads.

    Args:
        model: model object; `model.stride` and `model.seqdist` are read here.
        reads: iterable of reads exposing a numpy `signal` attribute.
        aligner: when truthy, results are passed through `align_map`.
        beamsize: forwarded to `decode_int8`.
        chunksize: chunk length passed to `chunk`.
        overlap: chunk overlap passed to `chunk`; also used to derive the
            stitch window.
        batchsize: forwarded to `batchify`.
        qscores: when truthy the placeholder qstring is one '?' per base,
            otherwise a single '*'.
        split_read_length: size passed to `torch.split` when cutting each
            read's signal into segments (default 400000, the value that was
            previously hard-coded).

    Returns:
        A generator of `(read, result_dict)` pairs, or the result of
        `align_map(aligner, ...)` over that generator when `aligner` is
        truthy.
    """
    # Keep only the centre of every chunk: drop half an overlap (in model
    # output steps) from each side.
    _stitch = partial(
        stitch,
        start=overlap // 2 // model.stride,
        end=(chunksize - overlap // 2) // model.stride,
    )
    _decode = partial(decode_int8, seqdist=model.seqdist, beamsize=beamsize)

    # Key every signal segment by (read, segment index) so the segments can
    # be regrouped under their read after decoding.
    segments = (
        ((read, i), x) for read in reads
        for (i, x) in enumerate(torch.split(torch.from_numpy(read.signal), split_read_length))
    )
    chunks = (
        (key, chunk(signal, chunksize, overlap, pad_start=True)) for (key, signal) in segments
    )
    batches = (
        (key, quantise_int8(compute_scores(model, batch)))
        for key, batch in thread_iter(batchify(chunks, batchsize=batchsize))
    )
    stitched = ((key, _stitch(x)) for (key, x) in unbatchify(batches))
    transferred = thread_map(transfer, stitched, n_thread=1)
    basecalls = thread_map(_decode, transferred, n_thread=8)

    # x[0] is the (read, segment index) key, so x[0][0] is the read itself.
    basecalls = (
        (read, ''.join(seq for k, seq in parts)) for read, parts in groupby(basecalls, lambda x: x[0][0])
    )
    basecalls = (
        (read, {'sequence': seq, 'qstring': '?' * len(seq) if qscores else '*', 'mean_qscore': 0.0})
        for read, seq in basecalls
    )

    if aligner:
        return align_map(aligner, basecalls)
    return basecalls
Example #7
0
def main(args):
    """
    Basecall all reads in `args.reads_directory`, optionally saving CTC
    training data, and report throughput statistics on stderr.

    Exits with status 1 when `args.save_ctc` is set without
    `args.reference`, or when the aligner index cannot be loaded/built.
    When `args.save_ctc` is set, `args.overlap` and `args.chunksize` are
    overridden with 900 and 3600.
    """

    if args.save_ctc and not args.reference:
        sys.stderr.write("> a reference is needed to output ctc training data\n")
        # Use sys.exit: the bare `exit` builtin is injected by the `site`
        # module and is not guaranteed to exist (e.g. under `python -S`).
        sys.exit(1)

    if args.save_ctc:
        args.overlap = 900
        args.chunksize = 3600

    sys.stderr.write("> loading model\n")

    model = load_model(
        args.model_directory, args.device, weights=int(args.weights),
        half=args.half, chunksize=args.chunksize, use_rt=args.cudart,
    )

    if args.reference:
        sys.stderr.write("> loading reference\n")
        aligner = Aligner(args.reference, preset='ont-map')
        if not aligner:
            sys.stderr.write("> failed to load/build index\n")
            sys.exit(1)
    else:
        aligner = None

    samples = 0
    num_reads = 0
    max_read_size = 4e6
    dtype = np.float16 if args.half else np.float32
    ctc_writer = CTCWriter(model, aligner)
    reader = PreprocessReader(args.reads_directory)
    writer = DecoderWriterPool(model, beamsize=args.beamsize, fastq=args.fastq, aligner=aligner)

    t0 = time.perf_counter()
    sys.stderr.write("> calling\n")

    with writer, ctc_writer, reader, torch.no_grad():

        while True:

            # A None on the reader queue marks the end of the input.
            read = reader.queue.get()
            if read is None:
                break

            if len(read.signal) > max_read_size:
                sys.stderr.write("> skipping long read %s (%s samples)\n" % (read.read_id, len(read.signal)))
                continue

            num_reads += 1
            samples += len(read.signal)

            raw_data = torch.tensor(read.signal.astype(dtype))
            chunks = chunk(raw_data, args.chunksize, args.overlap)

            # Keep the unstitched posteriors: the CTC writer consumes them
            # together with the raw chunks.
            posteriors_ = model(chunks.to(args.device)).cpu().numpy()
            posteriors = stitch(posteriors_, args.overlap // model.stride // 2)

            writer.queue.put((read, posteriors[:raw_data.shape[0]]))
            # Only reads longer than a single chunk are sent to the CTC writer.
            if args.save_ctc and len(raw_data) > args.chunksize:
                ctc_writer.queue.put((chunks.numpy(), posteriors_))

    duration = time.perf_counter() - t0

    sys.stderr.write("> completed reads: %s\n" % num_reads)
    sys.stderr.write("> duration: %s\n" % timedelta(seconds=np.round(duration)))
    sys.stderr.write("> samples per second %.1E\n" % (samples / duration))
    sys.stderr.write("> done\n")
Example #8
0
def main(args):
    """
    Basecall the reads in `args.reads_directory`, optionally writing
    basecalls and CTC training data, dump the stitched posteriors to
    `args.post_file` and report throughput statistics on stderr.

    Exits with status 1 when `args.save_ctc` is set without
    `args.reference`, or when the aligner index cannot be loaded/built.
    When `args.save_ctc` is set, `args.overlap` and `args.chunksize` are
    overridden with 900 and 3600.
    """
    if args.save_ctc and not args.reference:
        sys.stderr.write(
            "> a reference is needed to output ctc training data\n")
        # Use sys.exit: the bare `exit` builtin is injected by the `site`
        # module and is not guaranteed to exist (e.g. under `python -S`).
        sys.exit(1)

    if args.save_ctc:
        args.overlap = 900
        args.chunksize = 3600

    sys.stderr.write("> loading model\n")

    model = load_model(
        args.model_directory,
        args.device,
        weights=int(args.weights),
        half=args.half,
        chunksize=args.chunksize,
        use_rt=args.cudart,
    )

    if args.reference:
        sys.stderr.write("> loading reference\n")
        aligner = Aligner(args.reference, preset='ont-map')
        if not aligner:
            sys.stderr.write("> failed to load/build index\n")
            sys.exit(1)
        write_sam_header(aligner)
    else:
        aligner = None

    # NOTE: writing the summary-file header is currently disabled:
    #     with open(summary_file(), 'w') as summary:
    #         write_summary_header(summary, alignment=aligner)

    samples = 0
    num_reads = 0
    max_read_size = 4e6
    read_ids = column_to_set(args.read_ids)
    dtype = np.float16 if args.half else np.float32
    reader = ProcessIterator(get_reads(args.reads_directory,
                                       read_ids=read_ids,
                                       skip=args.skip),
                             progress=True)
    writer = ProcessPool(DecoderWriter,
                         model=model,
                         aligner=aligner,
                         beamsize=args.beamsize,
                         fastq=args.fastq)
    ctc_writer = CTCWriter(model,
                           aligner,
                           min_coverage=args.ctc_min_coverage,
                           min_accuracy=args.ctc_min_accuracy)

    t0 = time.perf_counter()
    sys.stderr.write("> calling\n")

    with writer, ctc_writer, reader, torch.no_grad():

        while True:

            # A None on the reader queue marks the end of the input.
            read = reader.queue.get()
            if read is None:
                break

            if len(read.signal) > max_read_size:
                sys.stderr.write("> skipping long read %s (%s samples)\n" %
                                 (read.read_id, len(read.signal)))
                continue

            num_reads += 1
            samples += len(read.signal)

            raw_data = torch.tensor(read.signal.astype(dtype))
            # NOTE(review): debug output goes to stdout while every other
            # status message uses stderr - confirm this is intended.
            print('bonito: raw_data.shape: ', raw_data.shape)
            chunks = chunk(raw_data, args.chunksize, args.overlap)

            # Keep the unstitched posteriors: the CTC writer consumes them
            # together with the raw chunks.
            posteriors_ = model(chunks.to(args.device)).cpu().numpy()
            posteriors = stitch(posteriors_, args.overlap // model.stride // 2)
            if args.write_basecall:
                writer.queue.put((read, posteriors[:raw_data.shape[0]]))
            # Only reads longer than a single chunk are sent to the CTC writer.
            if args.save_ctc and len(raw_data) > args.chunksize:
                ctc_writer.queue.put((chunks.numpy(), posteriors_))
            print('bonito: posteriors.shape', posteriors.shape)
            # NOTE(review): this runs once per read with the same
            # args.post_file; if that is a path, each read presumably
            # overwrites the previous dump - confirm intent.
            posteriors.tofile(args.post_file)

    duration = time.perf_counter() - t0

    sys.stderr.write("> completed reads: %s\n" % num_reads)
    sys.stderr.write("> duration: %s\n" %
                     timedelta(seconds=np.round(duration)))
    sys.stderr.write("> samples per second %.1E\n" % (samples / duration))
    sys.stderr.write("> done\n")