Example #1
def raw_chunkify_with_identity_main(args):
    """Main function for `chunkify.py raw_identity`: produce a batch file for model training."""
    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)

    fast5_files = fast5.iterate_fast5(args.input_folder,
                                      paths=True,
                                      limit=args.limit,
                                      strand_list=args.input_strand_list)

    print('* Processing data using', args.jobs, 'threads')

    kwarg_names = [
        'chunk_len', 'kmer_len', 'min_length', 'trim', 'normalisation',
        'downsample_factor', 'interpolation'
    ]
    i = 0
    bad_list = []
    chunk_list = []
    label_list = []
    for res in imap_mp(raw_chunk_worker,
                       fast5_files,
                       threads=args.jobs,
                       unordered=True,
                       fix_kwargs=util.get_kwargs(args, kwarg_names),
                       init=batch.init_chunk_identity_worker,
                       initargs=[args.kmer_len, args.alphabet]):
        if res is not None:
            i = util.progress_report(i)

            (chunks, labels, bad_ev) = res

            chunk_list.append(chunks)
            label_list.append(labels)
            bad_list.append(bad_ev)

    if not chunk_list:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)
    else:
        print('\n* Writing out to HDF5')
        hdf5_attributes = {
            'chunk': args.chunk_len,
            'downsample_factor': args.downsample_factor,
            'input_type': 'raw',
            'interpolation': args.interpolation,
            'kmer': args.kmer_len,
            'normalisation': args.normalisation,
            'section': 'template',
            'trim': args.trim,
            'alphabet': args.alphabet,
        }
        blanks_per_chunk = np.concatenate([(l == 0).mean(1)
                                           for l in label_list])
        blanks = np.percentile(blanks_per_chunk, args.blanks_percentile)
        util.create_labelled_chunks_hdf5(args.output, blanks, hdf5_attributes,
                                         chunk_list, label_list, bad_list)
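
The last three lines turn the collected labels into a blank-filtering threshold: label 0 evidently marks a blank position, `(l == 0).mean(1)` gives the fraction of blanks in each chunk, and `np.percentile` picks the cut-off at `args.blanks_percentile`. A self-contained illustration with made-up label arrays (shapes and values are hypothetical):

import numpy as np

# Two illustrative label batches of shape (chunks, positions); label 0 is a blank.
label_list = [np.array([[0, 1, 2, 0],
                        [3, 0, 0, 0]]),
              np.array([[1, 2, 3, 4]])]

blanks_per_chunk = np.concatenate([(l == 0).mean(1) for l in label_list])
# blanks_per_chunk == [0.5, 0.75, 0.0]

blanks = np.percentile(blanks_per_chunk, 50)  # as if blanks_percentile were 50
assert blanks == 0.5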
Example #2
def main(argv):
    """Extract reference sequences from fast5 files and write them out as FASTA."""
    args = parser.parse_args(argv[1:])

    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)

    fast5_files = fast5.iterate_fast5(args.input_folder,
                                      paths=True,
                                      limit=args.limit,
                                      strand_list=args.input_strand_list)

    print('* Processing data using', args.jobs, 'threads')

    i = 0
    kwarg_names = ['section']
    with open(args.output, 'w') as file_handle:
        for res in imap_mp(reference_extraction_worker,
                           fast5_files,
                           threads=args.jobs,
                           unordered=True,
                           fix_kwargs=util.get_kwargs(args, kwarg_names)):
            if res is not None:
                i = util.progress_report(i)
                file_name, reference = res
                header = '>{}\n'.format(
                    os.path.basename(os.path.splitext(file_name)[0]))
                file_handle.write(header)
                file_handle.write(reference + '\n')
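
All of these drivers build their workers' fixed keyword arguments with `util.get_kwargs(args, kwarg_names)`. From the call sites it evidently copies the named attributes of the parsed-argument namespace into a dict; a minimal sketch of such a helper (an assumption, not the library's actual code):

def get_kwargs(args, names):
    """Collect the named attributes of an argparse Namespace into a dict,
    e.g. get_kwargs(args, ['section']) -> {'section': args.section}."""
    return {name: getattr(args, name) for name in names}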
Example #3
def chunkify_with_identity_main(args):
    """Main function for `chunkify.py identity`: produce a batch file for model training."""
    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)

    fast5_files = fast5.iterate_fast5(args.input_folder, paths=True,
                                      limit=args.limit,
                                      strand_list=args.input_strand_list)

    print('* Processing data using', args.jobs, 'threads')

    kwarg_names = ['section', 'chunk_len', 'kmer_len', 'min_length', 'trim',
                   'use_scaled', 'normalisation']
    i = 0
    bad_list = []
    chunk_list = []
    label_list = []
    for res in imap_mp(batch.chunk_worker, fast5_files, threads=args.jobs,
                       unordered=True, fix_kwargs=util.get_kwargs(args, kwarg_names),
                       init=batch.init_chunk_identity_worker, initargs=[args.kmer_len, args.alphabet]):
        if res is not None:
            i = util.progress_report(i)

            (chunks, labels, bad_ev) = res

            chunk_list.append(chunks)
            label_list.append(labels)
            bad_list.append(bad_ev)

    if not chunk_list:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)
    else:
        print('\n* Writing out to HDF5')
        hdf5_attributes = {
            'chunk': args.chunk_len,
            'input_type': 'events',
            'kmer': args.kmer_len,
            'normalisation': args.normalisation,
            'scaled': args.use_scaled,
            'section': args.section,
            'trim': args.trim,
            'alphabet': args.alphabet,
        }
        util.create_labelled_chunks_hdf5(args.output, args.blanks,
                                         hdf5_attributes, chunk_list,
                                         label_list, bad_list)
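
Every example fans its per-file work out through `imap_mp`, which from its usage behaves like `multiprocessing.Pool.imap`/`imap_unordered` with fixed keyword arguments and a per-process initialiser. A minimal sketch of a wrapper with that interface, assuming the worker and its fixed kwargs are picklable (an illustration, not the library's implementation):

import functools
import multiprocessing


def imap_mp_sketch(worker, items, threads=1, unordered=False,
                   fix_kwargs=None, init=None, initargs=()):
    """Yield worker(item, **fix_kwargs) for each item, spread over
    `threads` processes; run serially (after calling init) if threads <= 1."""
    bound = functools.partial(worker, **(fix_kwargs or {}))
    if threads <= 1:
        if init is not None:
            init(*initargs)
        for item in items:
            yield bound(item)
        return
    with multiprocessing.Pool(threads, initializer=init,
                              initargs=tuple(initargs)) as pool:
        mapper = pool.imap_unordered if unordered else pool.imap
        for res in mapper(bound, items):
            yield res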
Example #4
def chunkify_with_remap_main(args):
    """Main function for `chunkify.py remap`: produce a batch file for model training."""
    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)
        if os.path.exists(args.output_strand_list):
            print("Cowardly refusing to overwrite {}".format(
                args.output_strand_list))
            sys.exit(2)

    fast5_files = fast5.iterate_fast5(args.input_folder,
                                      paths=True,
                                      limit=args.limit,
                                      strand_list=args.input_strand_list)

    references = util.fasta_file_to_dict(args.references)

    print('* Processing data using', args.jobs, 'threads')

    kwarg_names = [
        'trim', 'min_prob', 'kmer_len', 'min_length', 'prior', 'slip',
        'chunk_len', 'use_scaled', 'normalisation', 'section', 'segmentation'
    ]
    kwargs = util.get_kwargs(args, kwarg_names)
    kwargs['references'] = references

    i = 0
    compiled_file = helpers.compile_model(args.model, args.compile)
    bad_list = []
    chunk_list = []
    label_list = []

    if not os.path.isfile(args.output_strand_list):
        header_line = '\t'.join(['filename', 'nev', 'score', 'nstay',
                                 'seqlen', 'start', 'end']) + '\n'
        with open(args.output_strand_list, 'wt') as slfh:
            slfh.write(header_line)

    for res in imap_mp(batch.chunk_remap_worker,
                       fast5_files,
                       threads=args.jobs,
                       fix_kwargs=kwargs,
                       unordered=True,
                       init=batch.init_chunk_remap_worker,
                       initargs=[compiled_file, args.kmer_len, args.alphabet]):
        if res is not None:
            i = util.progress_report(i)

            read, score, nev, path, seq, chunks, labels, bad_ev = res

            chunk_list.append(chunks)
            label_list.append(labels)
            bad_list.append(bad_ev)
            strand_data = [
                read, nev, -score / nev,
                np.sum(np.ediff1d(path, to_begin=1) == 0),
                len(seq),
                min(path),
                max(path)
            ]

            data_line = '\t'.join([str(x) for x in strand_data]) + '\n'
            with open(args.output_strand_list, 'at') as slfh:
                slfh.write(data_line)

    if compiled_file != args.compile:
        os.remove(compiled_file)

    if not chunk_list:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)
    else:
        print('\n* Creating HDF5 file')
        hdf5_attributes = {
            'chunk': args.chunk_len,
            'input_type': 'events',
            'kmer': args.kmer_len,
            'normalisation': args.normalisation,
            'scaled': args.use_scaled,
            'section': args.section,
            'trim': args.trim,
            'alphabet': args.alphabet,
        }
        util.create_labelled_chunks_hdf5(args.output, args.blanks,
                                         hdf5_attributes, chunk_list,
                                         label_list, bad_list)
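
One of the per-read strand statistics is a stay count: `np.ediff1d(path, to_begin=1) == 0` flags every step at which the remapped path did not advance. A small worked example with a hypothetical path:

import numpy as np

# Hypothetical remapping path: the read sits at position 2 for three
# consecutive events, contributing two zero differences ("stays").
path = np.array([0, 1, 2, 2, 2, 3])
nstay = np.sum(np.ediff1d(path, to_begin=1) == 0)
assert nstay == 2

The `to_begin=1` prepends a non-zero difference, so the first event can never be counted as a stay.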
Example #5
def raw_chunkify_with_remap_main(args):
    """Main function for `chunkify.py raw_remap`: produce a batch file for model training."""
    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)
        if os.path.exists(args.output_strand_list):
            print("Cowardly refusing to overwrite {}".format(args.output_strand_list))
            sys.exit(2)

    fast5_files = iterate_fast5(args.input_folder, paths=True, limit=args.limit,
                                strand_list=args.input_strand_list)

    references = util.fasta_file_to_dict(args.references)

    print('* Processing data using', args.jobs, 'threads')

    kwarg_names = ['trim', 'min_prob', 'kmer_len', 'min_length',
                   'prior', 'slip', 'chunk_len', 'normalisation', 'downsample_factor',
                   'interpolation', 'open_pore_fraction']
    kwargs = util.get_kwargs(args, kwarg_names)
    kwargs['references'] = references

    i = 0
    compiled_file = helpers.compile_model(args.model, args.compile)
    bad_list = []
    chunk_list = []
    label_list = []

    if not os.path.isfile(args.output_strand_list):
        header_line = '\t'.join(['filename', 'nblocks', 'score', 'nstay',
                                 'seqlen', 'start', 'end']) + '\n'
        with open(args.output_strand_list, 'wt') as slfh:
            slfh.write(header_line)

    for res in imap_mp(raw_chunk_remap_worker, fast5_files, threads=args.jobs,
                       fix_kwargs=kwargs, unordered=True,
                       init=batch.init_chunk_remap_worker,
                       initargs=[compiled_file, args.kmer_len, args.alphabet]):
        if res is not None:
            i = util.progress_report(i)

            read, score, nblocks, path, seq, chunks, labels, bad_ev = res

            chunk_list.append(chunks)
            label_list.append(labels)
            bad_list.append(bad_ev)
            strand_data = [read, nblocks, -score / nblocks,
                           np.sum(np.ediff1d(path, to_begin=1) == 0),
                           len(seq), min(path), max(path)]

            data_line = '\t'.join([str(x) for x in strand_data]) + '\n'
            with open(args.output_strand_list, 'at') as slfh:
                slfh.write(data_line)

    if compiled_file != args.compile:
        os.remove(compiled_file)

    if not chunk_list:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)
    else:
        print('\n* Writing out to HDF5')
        hdf5_attributes = {
            'chunk': args.chunk_len,
            'downsample_factor': args.downsample_factor,
            'input_type': 'raw',
            'interpolation': args.interpolation,
            'kmer': args.kmer_len,
            'normalisation': args.normalisation,
            'section': 'template',
            'trim': args.trim,
            'alphabet': args.alphabet,
        }
        blanks_per_chunk = np.concatenate([(l == 0).mean(1) for l in label_list])
        blanks = np.percentile(blanks_per_chunk, args.blanks_percentile)
        util.create_labelled_chunks_hdf5(args.output, blanks, hdf5_attributes,
                                         chunk_list, label_list, bad_list)
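
Both remap examples load their references with `util.fasta_file_to_dict(args.references)`, which by its name and use maps reference names to sequences. A plausible stand-in, assuming plain FASTA input (the real helper may parse names differently):

def fasta_file_to_dict_sketch(path):
    """Read a FASTA file into {name: sequence}; the name is taken as the
    first whitespace-separated token after '>'."""
    refs, name, parts = {}, None, []
    with open(path) as fh:
        for line in fh:
            line = line.strip()
            if line.startswith('>'):
                if name is not None:
                    refs[name] = ''.join(parts)
                name, parts = line[1:].split()[0], []
            elif line:
                parts.append(line)
    if name is not None:
        refs[name] = ''.join(parts)
    return refs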
Example #6
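    # Excerpt from the body of a basecall main function; the enclosing
    # `def` and the argument parsing that defines `args` are omitted here.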
    basecall_worker = getattr(basecall, args.command + "_worker")
    if args.command == "events":
        kwarg_names = ['section', 'segmentation', 'trim', 'kmer_len',
                       'transducer', 'bad', 'min_prob', 'skip', 'trans',
                       'alphabet']
    else:
        kwarg_names = ['trim', 'open_pore_fraction', 'kmer_len', 'transducer',
                       'bad', 'min_prob', 'skip', 'trans', 'alphabet']

    compiled_file = helpers.compile_model(args.model, args.compile)

    seq_printer = basecall.SeqPrinter(args.kmer_len, datatype=args.datatype,
                                      transducer=args.transducer, alphabet=args.alphabet.decode('ascii'))

    files = fast5.iterate_fast5(args.input_folder, paths=True, limit=args.limit,
                                strand_list=args.input_strand_list)
    nbases = nevents = 0
    t0 = time.time()
    for res in imap_mp(basecall_worker, files, threads=args.jobs,
                       fix_kwargs=util.get_kwargs(args, kwarg_names),
                       unordered=True, init=basecall.init_worker,
                       initargs=[compiled_file]):
        if res is None:
            continue
        read, score, call, nev = res
        seq_len = seq_printer.write(read, score, call, nev)
        nbases += seq_len
        nevents += nev

    dt = time.time() - t0
    t = 'Called {} bases in {:.1f} s ({:.1f} bases/s or {:.1f} {}/s)\n'
    sys.stderr.write(t.format(nbases, dt, nbases / dt, nevents / dt, args.datatype))

    if compiled_file != args.compile:
        os.remove(compiled_file)
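
This example resolves its worker dynamically: `getattr(basecall, args.command + "_worker")` picks `events_worker` or `raw_worker` out of the `basecall` module by command name. The same dispatch pattern in miniature, with stand-in workers (hypothetical, not the real API):

import types

# A stand-in 'module' exposing one worker per command name.
basecall_demo = types.SimpleNamespace(
    events_worker=lambda path: ('events', path),
    raw_worker=lambda path: ('raw', path),
)

command = 'raw'
worker = getattr(basecall_demo, command + '_worker')
assert worker('read.fast5') == ('raw', 'read.fast5')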
Example #7
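    # Excerpt from the body of a basecall main function; `basecall_worker`,
    # `kwarg_names` and `compiled_file` are set up in the omitted preamble
    # (compare Example #6).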
    seq_printer = basecall.SeqPrinter(args.kmer_len,
                                      datatype=args.datatype,
                                      transducer=args.transducer,
                                      alphabet=args.alphabet.decode('ascii'))

    files = iterate_fast5(args.input_folder,
                          paths=True,
                          limit=args.limit,
                          strand_list=args.input_strand_list)
    nbases = nevents = 0
    t0 = time.time()
    for res in imap_mp(basecall_worker,
                       files,
                       threads=args.jobs,
                       fix_kwargs=util.get_kwargs(args, kwarg_names),
                       unordered=True,
                       init=basecall.init_worker,
                       initargs=[compiled_file]):
        if res is None:
            continue
        read, score, call, nev = res
        seq_len = seq_printer.write(read, score, call, nev)
        nbases += seq_len
        nevents += nev

    dt = time.time() - t0
    t = 'Called {} bases in {:.1f} s ({:.1f} bases/s or {:.1f} {}/s)\n'
    sys.stderr.write(
        t.format(nbases, dt, nbases / dt, nevents / dt, args.datatype))