Ejemplo n.º 1
0
 def test_strand_list_no_read_id_multiread(self):
     """Iterate multi-read files using a strand list that gives filenames
     but no read ids, and check the read ids that are found."""
     strand_list_path = os.path.join(
         self.STRAND_LIST_DIR, "strand_list_no_read_id.txt")
     found_reads = iterate_fast5_reads(
         self.MULTIREAD_DIR, strand_list=strand_list_path)
     self._check_found_read_ids(found_reads)
Ejemplo n.º 2
0
 def test_strand_list_no_filename_single_reads(self):
     """Iterate single-read files in a directory using the strand list
     "strand_list_no_filename.txt" and check the read ids that are found."""
     strand_list_path = os.path.join(
         self.STRAND_LIST_DIR, "strand_list_no_filename.txt")
     found_reads = iterate_fast5_reads(
         self.READ_DIR, strand_list=strand_list_path)
     self._check_found_read_ids(found_reads)
Ejemplo n.º 3
0
def main():
    """Run squiggle matching over fast5 reads and write the results.

    Reads found under ``args.read_dir`` are handed to
    ``squiggle_match.worker`` through ``imap_mp``. For every read that
    yields a result, a ``#read_id score`` header line is written to the
    handle opened for ``args.output``, followed by one tab-separated row
    per (signal sample, path position) pair.
    """
    args = get_parser().parse_args()

    kwarg_names = ['back_prob', 'localpen', 'minscore', 'trim']
    model = helpers.load_model(args.model)
    reads = fast5utils.iterate_fast5_reads(
        args.read_dir,
        limit=args.limit,
        strand_list=args.input_strand_list,
        recursive=args.recursive)

    # One row per signal sample: read id, sample index, sample value, path
    # position, base at that position and three squiggle columns.
    row_template = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'

    with helpers.open_file_or_stdout(args.output) as out_fh:
        matches = imap_mp(
            squiggle_match.worker,
            reads,
            threads=args.jobs,
            fix_kwargs=helpers.get_kwargs(args, kwarg_names),
            unordered=True,
            init=squiggle_match.init_worker,
            initargs=[model, args.references])
        for match in matches:
            if match is None:
                # Nothing was returned for this read; skip it.
                continue
            read_id, signal, score, path, squiggle, base_bytes = match
            bases = base_bytes.decode('ascii')
            out_fh.write('#{} {}\n'.format(read_id, score))
            for idx, (sample, pos) in enumerate(zip(signal, path)):
                out_fh.write(row_template.format(
                    read_id, idx, sample, pos, bases[pos],
                    squiggle[pos, 0], squiggle[pos, 1], squiggle[pos, 2]))
Ejemplo n.º 4
0
def main(argv):
    """Remap every selected read and write the combined mapping output.

    Each read from ``args.input_folder`` is passed to
    ``prepare_mapping_funcs.oneread_remap`` via ``imap_mp``; the results are
    handed to ``prepare_mapping_funcs.generate_output_from_results``.

    Exits with status 1 if ``args.output`` already exists and
    ``args.overwrite`` is not set.

    NOTE(review): ``argv`` is accepted but never used in this function;
    ``parser.parse_args()`` is called without arguments.
    """
    args = parser.parse_args()
    print("Running prepare_mapping using flip-flop remapping")

    if not args.overwrite and os.path.exists(args.output):
        print("Cowardly refusing to overwrite {}".format(args.output))
        sys.exit(1)

    # Iterator over all the reads we are interested in.
    reads = fast5utils.iterate_fast5_reads(
        args.input_folder,
        limit=args.limit,
        strand_list=args.input_strand_list)

    # Keyword arguments shared by every call of the per-read worker.
    worker_kwargs = helpers.get_kwargs(
        args, ['alphabet', 'collapse_alphabet', 'device'])
    per_read_params = prepare_mapping_funcs.get_per_read_params_dict_from_tsv(
        args.input_per_read_params)
    worker_kwargs['per_read_params_dict'] = per_read_params
    worker_kwargs['references'] = helpers.fasta_file_to_dict(args.references)
    worker_kwargs['model'] = helpers.load_model(args.model)

    # Remaps a single read using the flip-flop network.
    remap_one_read = prepare_mapping_funcs.oneread_remap

    # Iterable of dicts; each dict holds the return values for one read.
    results = imap_mp(
        remap_one_read,
        reads,
        threads=args.jobs,
        fix_kwargs=worker_kwargs,
        unordered=True)

    prepare_mapping_funcs.generate_output_from_results(results, args)
Ejemplo n.º 5
0
 def test_strand_list_invalid(self):
     """Iterating with a strand list that has no header line must raise."""
     headerless_list = os.path.join(
         self.STRAND_LIST_DIR, "invalid_strand_list_no_header.txt")
     with self.assertRaises(Exception):
         reads = iterate_fast5_reads(
             self.MULTIREAD_DIR, strand_list=headerless_list)
         # Drain the iterator so that any lazily raised error surfaces.
         for filename, read_id in reads:
             print("Filename=", filename, "read_id=", read_id)
Ejemplo n.º 6
0
def main():
    """Write a TSV of per-read shift and scale parameters.

    Every read found under ``args.input_folder`` is passed to
    ``one_read_shift_scale`` via ``imap_mp``. One row is written per result
    whose elements are all truthy, holding the read id, the two trim values
    from ``args.trim`` and the read's shift and scale.
    """
    args = parser.parse_args()
    trim_start, trim_end = args.trim

    reads = fast5utils.iterate_fast5_reads(
        args.input_folder,
        limit=args.limit,
        strand_list=args.input_strand_list,
        recursive=args.recursive)

    with open_file_or_stdout(args.output) as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
        # UUID is 32 hex digits and four dashes,
        # e.g. '43f6a05c-0856-4edc-8cd2-4866d9d60eaa'
        header = ['UUID', 'trim_start', 'trim_end', 'shift', 'scale']
        tsv_writer.writerow(header)

        for result in imap_mp(one_read_shift_scale, reads,
                              threads=args.jobs):
            # NOTE(review): all() tests truthiness, so a result with a zero
            # shift or scale is skipped too - confirm this is intended.
            if not all(result):
                continue
            read_id, shift, scale = result
            tsv_writer.writerow(
                [read_id, trim_start, trim_end, shift, scale])
Ejemplo n.º 7
0
 def test_strand_list_no_read_id_multiread(self):
     """Pass reads iterated from MULTIREAD_DIR, using the strand list file
     "strand_list_no_read_id.txt", to _check_found_read_ids."""
     list_path = os.path.join(
         self.STRAND_LIST_DIR, "strand_list_no_read_id.txt")
     reads = iterate_fast5_reads(self.MULTIREAD_DIR, strand_list=list_path)
     self._check_found_read_ids(reads)
Ejemplo n.º 8
0
 def test_sequencing_summary_multiread(self):
     """Pass reads iterated from MULTIREAD_DIR, with SEQUENCING_SUMMARY
     given as the strand list, to _check_found_read_ids."""
     reads = iterate_fast5_reads(
         self.MULTIREAD_DIR, strand_list=self.SEQUENCING_SUMMARY)
     self._check_found_read_ids(reads)
Ejemplo n.º 9
0
 def test_strand_list_no_filename_single_reads(self):
     """Pass reads iterated from READ_DIR, using the strand list file
     "strand_list_no_filename.txt", to _check_found_read_ids."""
     list_path = os.path.join(
         self.STRAND_LIST_DIR, "strand_list_no_filename.txt")
     reads = iterate_fast5_reads(self.READ_DIR, strand_list=list_path)
     self._check_found_read_ids(reads)
Ejemplo n.º 10
0
 def test_no_strand_list_multiread(self):
     """Pass reads iterated from MULTIREAD_DIR, with no strand list, to
     _check_found_read_ids."""
     reads = iterate_fast5_reads(self.MULTIREAD_DIR)
     self._check_found_read_ids(reads)
Ejemplo n.º 11
0
 def test_no_strand_list_single_reads(self):
     """Pass reads iterated from READ_DIR, with no strand list, to
     _check_found_read_ids."""
     reads = iterate_fast5_reads(self.READ_DIR)
     self._check_found_read_ids(reads)
Ejemplo n.º 12
0
 def test_no_strand_list_multiread(self):
     """Check the read ids found in the multi-read directory when no
     strand list is supplied."""
     found_reads = iterate_fast5_reads(self.MULTIREAD_DIR)
     self._check_found_read_ids(found_reads)
Ejemplo n.º 13
0
def main():
    """Basecall fast5 reads with a pool of workers and write FASTA/FASTQ.

    Reads are located under ``args.input_folder``. When ``args.scaling`` is
    given, per-read scaling parameters are loaded from it and only reads
    that have an entry there are called. Each read is processed by
    ``worker`` in a ``Pool`` of ``args.jobs`` processes initialised with
    ``worker_init``; calls are written to ``args.output`` (sequence, and
    quality string when ``args.fastq`` is set, reversed when
    ``args.reverse`` is set). Progress and throughput statistics are
    written to stderr.
    """
    args = get_parser().parse_args()

    # TODO convert to logging

    sys.stderr.write("* Initializing reads file search.\n")
    fast5_reads = fast5utils.iterate_fast5_reads(
        args.input_folder, limit=args.limit,
        strand_list=args.input_strand_list, recursive=args.recursive)

    if args.scaling is not None:
        sys.stderr.write(
            "* Loading read scaling parameters from {}.\n".format(
                args.scaling))
        all_read_params = get_per_read_params_dict_from_tsv(args.scaling)
        # The reads are traversed twice below (once to count, once to
        # filter). Materialise them first: a one-shot iterator would be
        # exhausted by the first pass, leaving nothing to basecall.
        fast5_reads = list(fast5_reads)
        input_read_ids = frozenset(rec[1] for rec in fast5_reads)
        scaling_read_ids = frozenset(all_read_params.keys())
        sys.stderr.write("* {} / {} reads have scaling information.\n".format(
            len(input_read_ids & scaling_read_ids), len(input_read_ids)))
        fast5_reads = [
            rec for rec in fast5_reads if rec[1] in scaling_read_ids]
    else:
        all_read_params = {}

    sys.stderr.write("* Calling reads.\n")
    nbase, ncalled, nread, nsample = 0, 0, 0, 0
    t0 = time.time()
    progress = Progress(quiet=args.quiet)
    startcharacter = '@' if args.fastq else '>'
    initargs = [args.device, args.model, args.chunk_size, args.overlap,
                all_read_params, args.alphabet,
                args.max_concurrent_chunks, args.fastq, args.qscore_scale,
                args.qscore_offset, args.beam, args.posterior,
                args.temperature]
    # The context manager shuts the worker processes down once every result
    # has been consumed, or if an error interrupts the loop.
    with Pool(args.jobs, initializer=worker_init, initargs=initargs) as pool:
        with open_file_or_stdout(args.output) as fh:
            for read_id, basecall, qstring, read_nsample in \
                    pool.imap_unordered(worker, fast5_reads):
                # A missing or empty basecall counts as a failed read.
                if basecall is not None and len(basecall) > 0:
                    fh.write("{}{}\n{}\n".format(
                        startcharacter, read_id,
                        basecall[::-1] if args.reverse else basecall))
                    nbase += len(basecall)
                    ncalled += 1
                    if args.fastq:
                        fh.write("+\n{}\n".format(
                            qstring[::-1] if args.reverse else qstring))

                nread += 1
                nsample += read_nsample
                progress.step()
        total_time = time.time() - t0

    # Report the elapsed time unrounded so the two decimals are meaningful.
    sys.stderr.write(
        "* Called {} reads in {:.2f}s\n".format(nread, total_time))
    sys.stderr.write(
        "* {:7.2f} kbase / s\n".format(nbase / total_time / 1000.0))
    sys.stderr.write(
        "* {:7.2f} ksample / s\n".format(nsample / total_time / 1000.0))
    sys.stderr.write("* {} reads failed.\n".format(nread - ncalled))
Ejemplo n.º 14
0
def main():
    """Basecall fast5 reads with a flip-flop model and write FASTA/FASTQ.

    Loads the model onto the requested device, finds the reads under
    ``args.input_folder`` (optionally restricted to those with an entry in
    the ``args.scaling`` table), calls ``process_read`` on each and writes
    the calls to ``args.output``. When ``args.modified_base_output`` is
    given, an HDF5 file is opened there and passed to ``process_read``.
    Progress and throughput statistics go to stderr.

    Exits with status 1 if modified-base output is requested but the
    model's last sublayer is not a ``GlobalNormFlipFlopCatMod``.
    """
    args = parser.parse_args()

    device = helpers.set_torch_device(args.device)
    # TODO convert to logging
    sys.stderr.write("* Loading model.\n")
    model = load_model(args.model).to(device)
    is_cat_mod = isinstance(model.sublayers[-1],
                            layers.GlobalNormFlipFlopCatMod)
    do_output_mods = args.modified_base_output is not None
    if do_output_mods and not is_cat_mod:
        # This is an error: terminate the message line and exit non-zero so
        # that calling scripts can detect the failure.
        sys.stderr.write(
            "Cannot output modified bases from canonical base only model.\n")
        sys.exit(1)
    n_can_states = nstate_flipflop(model.sublayers[-1].nbase)
    stride = guess_model_stride(model)
    # Chunk size and overlap are multiplied by the model stride.
    chunk_size = args.chunk_size * stride
    chunk_overlap = args.overlap * stride

    sys.stderr.write("* Initializing reads file search.\n")
    # Materialised as a list: the reads are counted, possibly filtered and
    # then iterated again for basecalling.
    fast5_reads = list(
        fast5utils.iterate_fast5_reads(args.input_folder,
                                       limit=args.limit,
                                       strand_list=args.input_strand_list,
                                       recursive=args.recursive))
    sys.stderr.write("* Found {} reads.\n".format(len(fast5_reads)))

    if args.scaling is not None:
        sys.stderr.write("* Loading read scaling parameters from {}.\n".format(
            args.scaling))
        all_read_params = get_per_read_params_dict_from_tsv(args.scaling)
        input_read_ids = frozenset(rec[1] for rec in fast5_reads)
        scaling_read_ids = frozenset(all_read_params.keys())
        sys.stderr.write("* {} / {} reads have scaling information.\n".format(
            len(input_read_ids & scaling_read_ids), len(input_read_ids)))
        fast5_reads = [
            rec for rec in fast5_reads if rec[1] in scaling_read_ids
        ]
    else:
        all_read_params = {}

    mods_fp = None
    try:
        if do_output_mods:
            # Open with an explicit mode: h5py 3 changed the default from
            # 'a' to read-only, which would make the writes below fail.
            # 'a' keeps the behaviour of the previous default.
            mods_fp = h5py.File(args.modified_base_output, 'a')
            mods_fp.create_group('Reads')
            mod_long_names = model.sublayers[-1].ordered_mod_long_names
            sys.stderr.write("* Preparing modified base output: {}.\n".format(
                ', '.join(map(str, mod_long_names))))
            mods_fp.create_dataset('mod_long_names',
                                   data=np.array(mod_long_names, dtype='S'),
                                   dtype=h5py.special_dtype(vlen=str))

        sys.stderr.write("* Calling reads.\n")
        nbase, ncalled, nread, nsample = 0, 0, 0, 0
        t0 = time.time()
        progress = Progress(quiet=args.quiet)
        startcharacter = '@' if args.fastq else '>'
        with open_file_or_stdout(args.output) as fh:
            for read_filename, read_id in fast5_reads:
                # None when no scaling parameters exist for this read.
                read_params = all_read_params.get(read_id)
                basecall, qstring, read_nsample = process_read(
                    read_filename, read_id, model, chunk_size, chunk_overlap,
                    read_params, n_can_states, stride, args.alphabet,
                    is_cat_mod, mods_fp, args.max_concurrent_chunks,
                    args.fastq, args.qscore_scale, args.qscore_offset)
                if basecall is not None:
                    fh.write("{}{}\n{}\n".format(
                        startcharacter, read_id,
                        basecall[::-1] if args.reverse else basecall))
                    nbase += len(basecall)
                    ncalled += 1
                    if args.fastq:
                        fh.write("+\n{}\n".format(
                            qstring[::-1] if args.reverse else qstring))
                nread += 1
                nsample += read_nsample
                progress.step()
    finally:
        # Close the HDF5 file even if setup or basecalling failed part-way.
        if mods_fp is not None:
            mods_fp.close()
    total_time = time.time() - t0

    # Report the elapsed time unrounded so the two decimals are meaningful.
    sys.stderr.write("* Called {} reads in {:.2f}s\n".format(
        nread, total_time))
    sys.stderr.write("* {:7.2f} kbase / s\n".format(nbase / total_time /
                                                    1000.0))
    sys.stderr.write("* {:7.2f} ksample / s\n".format(nsample / total_time /
                                                      1000.0))
    sys.stderr.write("* {} reads failed.\n".format(nread - ncalled))
Ejemplo n.º 15
0
 def test_no_strand_list_single_reads(self):
     """Check the read ids found in the single-read directory when no
     strand list is supplied."""
     found_reads = iterate_fast5_reads(self.READ_DIR)
     self._check_found_read_ids(found_reads)
Ejemplo n.º 16
0
                    metavar=('beginning', 'end'),
                    help='Number of samples to trim off start and end')
# Arguments registered without a leading dash: model file, reference fasta
# and fast5 read directory.
# NOTE(review): FileExists is defined elsewhere; presumably an argparse
# action that checks the given path exists - confirm.
parser.add_argument('model', action=FileExists, help='Model file')
parser.add_argument('references', action=FileExists, help='Fasta file')
parser.add_argument('read_dir',
                    action=FileExists,
                    help='Directory for fast5 reads')

if __name__ == '__main__':
    args = parser.parse_args()

    worker_kwarg_names = ['back_prob', 'localpen', 'minscore', 'trim']

    model = helpers.load_model(args.model)

    fast5_reads = fast5utils.iterate_fast5_reads(
        args.read_dir, limit=args.limit, strand_list=args.input_strand_list)

    for res in imap_mp(squiggle_match.worker,
                       fast5_reads,
                       threads=args.jobs,
                       fix_kwargs=helpers.get_kwargs(args, worker_kwarg_names),
                       unordered=True,
                       init=squiggle_match.init_worker,
                       initargs=[model, args.references]):
        if res is None:
            continue
        read_id, sig, score, path, squiggle, bases = res
        bases = bases.decode('ascii')
        print('#{} {}'.format(read_id, score))
        for i, (s, p) in enumerate(zip(sig, path)):
            print('{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(read_id, i, s, p,
Ejemplo n.º 17
0
 def test_sequencing_summary_multiread(self):
     """Check the read ids found in the multi-read directory when a
     sequencing-summary style file is used as the strand list."""
     found_reads = iterate_fast5_reads(
         self.MULTIREAD_DIR, strand_list=self.SEQUENCING_SUMMARY)
     self._check_found_read_ids(found_reads)
Ejemplo n.º 18
0
 def test_strand_list_multiread(self):
     """Check the read ids found in multi-read fast5 files when iterating
     with the strand list "strand_list.txt"."""
     strand_list_path = os.path.join(self.STRAND_LIST_DIR, "strand_list.txt")
     found_reads = iterate_fast5_reads(
         self.MULTIREAD_DIR, strand_list=strand_list_path)
     self._check_found_read_ids(found_reads)
Ejemplo n.º 19
0
def main():
    """Remap each read with a flip-flop model and write the mapping output.

    Builds an ``AlphabetInfo`` from ``args.alphabet`` and the entries of
    ``args.mod`` (modified base, canonical base, modification name), pairs
    each fast5 read with its reference (``None`` when the read id has no
    reference), runs ``prepare_mapping_funcs.oneread_remap`` on the pairs
    via ``imap_mp`` and passes the results to
    ``prepare_mapping_funcs.generate_output_from_results``.

    Exits with status 1 if ``args.output`` exists and ``args.overwrite`` is
    not set. Raises AssertionError if a modified or canonical base
    specification is inconsistent with ``args.alphabet``.
    """
    args = parser.parse_args()
    print("Running prepare_mapping using flip-flop remapping")

    if not args.overwrite and os.path.exists(args.output):
        print("Cowardly refusing to overwrite {}".format(args.output))
        sys.exit(1)

    # Create alphabet and check for consistency
    modified_bases = [mod_spec[0] for mod_spec in args.mod]
    canonical_bases = [mod_spec[1] for mod_spec in args.mod]
    for mod_base in modified_bases:
        assert len(mod_base) == 1, (
            "Modified bases must be a single character, got {}".format(
                mod_base))
        assert mod_base not in args.alphabet, (
            "Modified base must not be a canonical base, got {}".format(
                mod_base))
    for can_base in canonical_bases:
        assert len(can_base) == 1, (
            "Canonical coding for modified bases must be a single character, got {}".format(
                can_base))
        assert can_base in args.alphabet, (
            "Canonical coding for modified base must be a canonical base, got {}".format(
                can_base))
    full_alphabet = args.alphabet + ''.join(modified_bases)
    flat_alphabet = args.alphabet + ''.join(canonical_bases)
    modification_names = [mod_spec[2] for mod_spec in args.mod]

    alphabet_info = alphabet.AlphabetInfo(
        full_alphabet, flat_alphabet, modification_names, do_reorder=True)

    print("Converting references to labels using {}".format(
        str(alphabet_info)))

    # Iterator over all the reads we are interested in.
    fast5_reads = fast5utils.iterate_fast5_reads(
        args.input_folder,
        limit=args.limit,
        strand_list=args.input_strand_list,
        recursive=args.recursive)

    # Keyword arguments shared by every call of the per-read worker.
    worker_kwargs = {
        'per_read_params_dict':
            prepare_mapping_funcs.get_per_read_params_dict_from_tsv(
                args.input_per_read_params),
        'model': helpers.load_model(args.model),
        'alphabet_info': alphabet_info,
        'max_read_length': args.max_read_length,
        'localpen': args.localpen,
    }

    # Remaps a single read using the flip-flop network.
    remap_one_read = prepare_mapping_funcs.oneread_remap

    def iter_jobs():
        # The references are only loaded once the first job is requested.
        references = bio.fasta_file_to_dict(args.references,
                                            alphabet=full_alphabet)
        for filename, read_id in fast5_reads:
            yield filename, read_id, references.get(read_id, None)

    # Chunk size is limit // (2 * jobs), clipped to the range [1, 50];
    # 50 when no limit is set.
    if args.limit is None:
        chunksize = 50
    else:
        chunksize = int(np.clip(args.limit // (2 * args.jobs), 1, 50))

    # Iterable of dicts; each dict holds the return values for one read.
    results = imap_mp(
        remap_one_read,
        iter_jobs(),
        threads=args.jobs,
        fix_kwargs=worker_kwargs,
        unordered=True,
        chunksize=chunksize)

    prepare_mapping_funcs.generate_output_from_results(
        results, args.output, alphabet_info)
Ejemplo n.º 20
0
def main():
    """Basecall fast5 reads with a flip-flop model on a GPU, writing FASTA.

    Loads the model onto ``args.device``, iterates the reads found under
    ``args.input_folder``, calls ``process_read`` on each and writes one
    FASTA record per successful call to ``args.output``. When
    ``args.modified_base_output`` is given, an HDF5 file is opened there
    and passed to ``process_read``. Progress and throughput statistics go
    to stderr.

    Exits with status 1 if modified-base output is requested but the
    model's last sublayer is not a ``GlobalNormFlipFlopCatMod``. Raises
    AssertionError if ``args.device`` is ``'cpu'``.
    """
    args = parser.parse_args()

    assert args.device != 'cpu', "Flipflop basecalling in taiyaki requires a GPU and for cupy to be installed"
    device = torch.device(args.device)
    # TODO convert to logging
    sys.stderr.write("* Loading model.\n")
    model = load_model(args.model).to(device)
    is_cat_mod = isinstance(model.sublayers[-1], layers.GlobalNormFlipFlopCatMod)
    do_output_mods = args.modified_base_output is not None
    if do_output_mods and not is_cat_mod:
        # This is an error: terminate the message line and exit non-zero so
        # that calling scripts can detect the failure.
        sys.stderr.write(
            "Cannot output modified bases from canonical base only model.\n")
        sys.exit(1)
    n_can_states = nstate_flipflop(model.sublayers[-1].nbase)
    stride = guess_model_stride(model, device=device)
    chunk_size, chunk_overlap = basecall_helpers.round_chunk_values(
        args.chunk_size, args.overlap, stride)

    sys.stderr.write("* Initializing reads file search.\n")
    fast5_reads = fast5utils.iterate_fast5_reads(
        args.input_folder, limit=args.limit, strand_list=args.input_strand_list,
        recursive=args.recursive)

    mods_fp = None
    try:
        if do_output_mods:
            # Open with an explicit mode: h5py 3 changed the default from
            # 'a' to read-only, which would make the writes below fail.
            # 'a' keeps the behaviour of the previous default.
            mods_fp = h5py.File(args.modified_base_output, 'a')
            mods_fp.create_group('Reads')
            mod_long_names = model.sublayers[-1].ordered_mod_long_names
            sys.stderr.write("* Preparing modified base output: {}.\n".format(
                ', '.join(map(str, mod_long_names))))
            mods_fp.create_dataset(
                'mod_long_names', data=np.array(mod_long_names, dtype='S'),
                dtype=h5py.special_dtype(vlen=str))

        sys.stderr.write("* Calling reads.\n")
        nbase, ncalled, nread, nsample = 0, 0, 0, 0
        t0 = time.time()
        progress = Progress(quiet=args.quiet)
        with open_file_or_stdout(args.output) as fh:
            for read_filename, read_id in fast5_reads:
                basecall, read_nsample = process_read(
                    read_filename, read_id, model, chunk_size,
                    chunk_overlap, device, n_can_states, stride, args.alphabet,
                    is_cat_mod, mods_fp)
                # A basecall of None counts as a failed read.
                if basecall is not None:
                    fh.write(">{}\n{}\n".format(read_id, basecall))
                    nbase += len(basecall)
                    ncalled += 1
                nread += 1
                nsample += read_nsample
                progress.step()
    finally:
        # Close the HDF5 file even if setup or basecalling failed part-way.
        if mods_fp is not None:
            mods_fp.close()
    total_time = time.time() - t0

    sys.stderr.write("* Called {} reads in {}s\n".format(nread, int(total_time)))
    sys.stderr.write("* {:7.2f} kbase / s\n".format(nbase / total_time / 1000.0))
    sys.stderr.write("* {:7.2f} ksample / s\n".format(nsample / total_time / 1000.0))
    sys.stderr.write("* {} reads failed.\n".format(nread - ncalled))