Exemple #1
0
def raw_chunkify_with_identity_main(args):
    """ Main function for `chunkify.py raw_identity` producing batch file for model training

    Lists the fast5 files under `args.input_folder`, maps `raw_chunk_worker`
    over them with `imap_mp` and hands the collected chunks, labels and
    `bad_ev` values to `util.create_labelled_chunks_hdf5`, which is given
    `args.output` as its first argument.

    The blanks value passed to the writer is the `args.blanks_percentile`-th
    percentile of the per-row fraction of labels equal to zero.

    Exits with status 1 if `args.output` exists while `args.overwrite` is
    false, or if no worker returned a result.

    :param args: namespace of parsed command-line options
    """
    # Guard: never clobber an existing output unless explicitly allowed
    if not args.overwrite and os.path.exists(args.output):
        print("Cowardly refusing to overwrite {}".format(args.output))
        sys.exit(1)

    read_paths = fast5.iterate_fast5(args.input_folder,
                                     paths=True,
                                     limit=args.limit,
                                     strand_list=args.input_strand_list)

    print('* Processing data using', args.jobs, 'threads')

    # Options forwarded unchanged from the command line to every worker call
    worker_kwargs = util.get_kwargs(args, [
        'chunk_len', 'kmer_len', 'min_length', 'trim', 'normalisation',
        'downsample_factor', 'interpolation'
    ])
    results = imap_mp(raw_chunk_worker,
                      read_paths,
                      threads=args.jobs,
                      unordered=True,
                      fix_kwargs=worker_kwargs,
                      init=batch.init_chunk_identity_worker,
                      initargs=[args.kmer_len, args.alphabet])

    n_reported = 0
    chunk_list, label_list, bad_list = [], [], []
    for res in results:
        # Workers signal "nothing usable" by returning None
        if res is None:
            continue
        n_reported = util.progress_report(n_reported)

        read_chunks, read_labels, read_bad = res
        chunk_list.append(read_chunks)
        label_list.append(read_labels)
        bad_list.append(read_bad)

    if not chunk_list:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)

    print('\n* Writing out to HDF5')
    hdf5_attributes = {
        'chunk': args.chunk_len,
        'downsample_factor': args.downsample_factor,
        'input_type': 'raw',
        'interpolation': args.interpolation,
        'kmer': args.kmer_len,
        'normalisation': args.normalisation,
        'section': 'template',
        'trim': args.trim,
        'alphabet': args.alphabet,
    }
    # Fraction of zero labels in each row, pooled over all reads
    zero_fractions = [(lab == 0).mean(1) for lab in label_list]
    blanks = np.percentile(np.concatenate(zero_fractions),
                           args.blanks_percentile)
    util.create_labelled_chunks_hdf5(args.output, blanks, hdf5_attributes,
                                     chunk_list, label_list, bad_list)
def chunkify_with_identity_main(args):
    """ Main function producing an event-based batch file via `batch.chunk_worker`

    Lists the fast5 files under `args.input_folder`, maps `batch.chunk_worker`
    over them with `imap_mp` and hands the collected chunks, labels and
    `bad_ev` values, together with `args.blanks`, to
    `util.create_labelled_chunks_hdf5`, which is given `args.output` as its
    first argument.

    Exits with status 1 if `args.output` exists while `args.overwrite` is
    false, or if no worker returned a result.

    :param args: namespace of parsed command-line options
    """
    # Guard: never clobber an existing output unless explicitly allowed
    if not args.overwrite and os.path.exists(args.output):
        print("Cowardly refusing to overwrite {}".format(args.output))
        sys.exit(1)

    read_paths = fast5.iterate_fast5(args.input_folder, paths=True,
                                     limit=args.limit,
                                     strand_list=args.input_strand_list)

    print('* Processing data using', args.jobs, 'threads')

    # Options forwarded unchanged from the command line to every worker call
    worker_kwargs = util.get_kwargs(args, [
        'section', 'chunk_len', 'kmer_len', 'min_length', 'trim',
        'use_scaled', 'normalisation',
    ])
    results = imap_mp(batch.chunk_worker, read_paths, threads=args.jobs,
                      unordered=True, fix_kwargs=worker_kwargs,
                      init=batch.init_chunk_identity_worker,
                      initargs=[args.kmer_len, args.alphabet])

    n_reported = 0
    chunk_list, label_list, bad_list = [], [], []
    for res in results:
        # Workers signal "nothing usable" by returning None
        if res is None:
            continue
        n_reported = util.progress_report(n_reported)

        read_chunks, read_labels, read_bad = res
        chunk_list.append(read_chunks)
        label_list.append(read_labels)
        bad_list.append(read_bad)

    if not chunk_list:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)

    print('\n* Writing out to HDF5')
    hdf5_attributes = {
        'chunk': args.chunk_len,
        'input_type': 'events',
        'kmer': args.kmer_len,
        'normalisation': args.normalisation,
        'scaled': args.use_scaled,
        'section': args.section,
        'trim': args.trim,
        'alphabet': args.alphabet,
    }
    util.create_labelled_chunks_hdf5(args.output, args.blanks,
                                     hdf5_attributes, chunk_list,
                                     label_list, bad_list)
Exemple #3
0
def chunkify_with_remap_main(args):
    """ Main function remapping event data and producing batch file for model training

    Lists the fast5 files under `args.input_folder`, maps
    `batch.chunk_remap_worker` over them with `imap_mp`, appends one
    tab-separated summary row per returned result to
    `args.output_strand_list`, and finally hands the collected chunks, labels
    and `bad_ev` values, together with `args.blanks`, to
    `util.create_labelled_chunks_hdf5`, which is given `args.output` as its
    first argument.

    The strand list is always started afresh (header line only) before any
    row is written, so running with `args.overwrite` set replaces a previous
    strand list rather than appending to it.

    The model file returned by `helpers.compile_model` is removed once
    processing finishes, even if processing raised, unless it is the very
    path given as `args.compile`.

    Exit statuses: 1 if `args.output` exists and `args.overwrite` is false,
    2 if `args.output_strand_list` exists and `args.overwrite` is false,
    1 if no worker returned a result.

    :param args: namespace of parsed command-line options
    """
    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)
        if os.path.exists(args.output_strand_list):
            print("Cowardly refusing to overwrite {}".format(
                args.output_strand_list))
            sys.exit(2)

    fast5_files = fast5.iterate_fast5(args.input_folder,
                                      paths=True,
                                      limit=args.limit,
                                      strand_list=args.input_strand_list)

    references = util.fasta_file_to_dict(args.references)

    print('* Processing data using', args.jobs, 'threads')

    kwarg_names = [
        'trim', 'min_prob', 'kmer_len', 'min_length', 'prior', 'slip',
        'chunk_len', 'use_scaled', 'normalisation', 'section', 'segmentation'
    ]
    kwargs = util.get_kwargs(args, kwarg_names)
    kwargs['references'] = references

    i = 0
    bad_list = []
    chunk_list = []
    label_list = []

    compiled_file = helpers.compile_model(args.model, args.compile)
    try:
        # Getting here means the strand list either does not exist or may be
        # overwritten, so truncate it unconditionally.  Guarding this on the
        # file being absent would make an --overwrite run append header-less
        # rows to stale data.
        header_line = '\t'.join([
            'filename', 'nev', 'score', 'nstay', 'seqlen', 'start', 'end'
        ]) + u'\n'
        with open(args.output_strand_list, 'wt') as slfh:
            slfh.write(header_line)

        for res in imap_mp(batch.chunk_remap_worker,
                           fast5_files,
                           threads=args.jobs,
                           fix_kwargs=kwargs,
                           unordered=True,
                           init=batch.init_chunk_remap_worker,
                           initargs=[compiled_file, args.kmer_len,
                                     args.alphabet]):
            # Workers signal "nothing usable" by returning None
            if res is None:
                continue
            i = util.progress_report(i)

            read, score, nev, path, seq, chunks, labels, bad_ev = res

            chunk_list.append(chunks)
            label_list.append(labels)
            bad_list.append(bad_ev)
            # Columns match the header: the score is negated and divided by
            # nev, and 'nstay' counts the positions where path does not
            # change from its predecessor.
            strand_data = [
                read, nev, -score / nev,
                np.sum(np.ediff1d(path, to_begin=1) == 0),
                len(seq),
                min(path),
                max(path)
            ]

            data_line = '\t'.join([str(x) for x in strand_data]) + '\n'
            # Re-opened per row so that completed rows are on disk even if a
            # later read makes the run fail.
            with open(args.output_strand_list, 'at') as slfh:
                slfh.write(data_line)
    finally:
        # Do not leave the compiled model behind on failure; never delete the
        # path the caller supplied via args.compile.
        if compiled_file != args.compile:
            os.remove(compiled_file)

    if not chunk_list:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)

    print('\n* Creating HDF5 file')
    hdf5_attributes = {
        'chunk': args.chunk_len,
        'input_type': 'events',
        'kmer': args.kmer_len,
        'normalisation': args.normalisation,
        'scaled': args.use_scaled,
        'section': args.section,
        'trim': args.trim,
        'alphabet': args.alphabet,
    }
    util.create_labelled_chunks_hdf5(args.output, args.blanks,
                                     hdf5_attributes, chunk_list,
                                     label_list, bad_list)
Exemple #4
0
def raw_chunkify_with_remap_main(args):
    """ Main function for `chunkify.py raw_remap` producing batch file for model training

    Lists the fast5 files under `args.input_folder`, maps
    `raw_chunk_remap_worker` over them with `imap_mp`, appends one
    tab-separated summary row per returned result to
    `args.output_strand_list`, and finally hands the collected chunks, labels
    and `bad_ev` values to `util.create_labelled_chunks_hdf5`, which is given
    `args.output` as its first argument.

    The blanks value passed to the writer is the `args.blanks_percentile`-th
    percentile of the per-row fraction of labels equal to zero.

    The strand list is always started afresh (header line only) before any
    row is written, so running with `args.overwrite` set replaces a previous
    strand list rather than appending to it.

    The model file returned by `helpers.compile_model` is removed once
    processing finishes, even if processing raised, unless it is the very
    path given as `args.compile`.

    Exit statuses: 1 if `args.output` exists and `args.overwrite` is false,
    2 if `args.output_strand_list` exists and `args.overwrite` is false,
    1 if no worker returned a result.

    :param args: namespace of parsed command-line options
    """
    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)
        if os.path.exists(args.output_strand_list):
            print("Cowardly refusing to overwrite {}".format(args.output_strand_list))
            sys.exit(2)

    fast5_files = iterate_fast5(args.input_folder, paths=True, limit=args.limit,
                                strand_list=args.input_strand_list)

    references = util.fasta_file_to_dict(args.references)

    print('* Processing data using', args.jobs, 'threads')

    kwarg_names = ['trim', 'min_prob', 'kmer_len', 'min_length',
                   'prior', 'slip', 'chunk_len', 'normalisation', 'downsample_factor',
                   'interpolation', 'open_pore_fraction']
    kwargs = util.get_kwargs(args, kwarg_names)
    kwargs['references'] = references

    i = 0
    bad_list = []
    chunk_list = []
    label_list = []

    compiled_file = helpers.compile_model(args.model, args.compile)
    try:
        # Getting here means the strand list either does not exist or may be
        # overwritten, so truncate it unconditionally.  Guarding this on the
        # file being absent would make an --overwrite run append header-less
        # rows to stale data.
        header_line = '\t'.join(['filename', 'nblocks', 'score', 'nstay', 'seqlen', 'start', 'end']) + '\n'
        with open(args.output_strand_list, 'wt') as slfh:
            slfh.write(header_line)

        for res in imap_mp(raw_chunk_remap_worker, fast5_files, threads=args.jobs,
                           fix_kwargs=kwargs, unordered=True, init=batch.init_chunk_remap_worker,
                           initargs=[compiled_file, args.kmer_len, args.alphabet]):
            # Workers signal "nothing usable" by returning None
            if res is None:
                continue
            i = util.progress_report(i)

            read, score, nblocks, path, seq, chunks, labels, bad_ev = res

            chunk_list.append(chunks)
            label_list.append(labels)
            bad_list.append(bad_ev)
            # Columns match the header: the score is negated and divided by
            # nblocks, and 'nstay' counts the positions where path does not
            # change from its predecessor.
            strand_data = [read, nblocks, -score / nblocks,
                           np.sum(np.ediff1d(path, to_begin=1) == 0),
                           len(seq), min(path), max(path)]

            data_line = '\t'.join([str(x) for x in strand_data]) + '\n'
            # Re-opened per row so that completed rows are on disk even if a
            # later read makes the run fail.
            with open(args.output_strand_list, 'at') as slfh:
                slfh.write(data_line)
    finally:
        # Do not leave the compiled model behind on failure; never delete the
        # path the caller supplied via args.compile.
        if compiled_file != args.compile:
            os.remove(compiled_file)

    if not chunk_list:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)

    print('\n* Writing out to HDF5')
    hdf5_attributes = {
        'chunk': args.chunk_len,
        'downsample_factor': args.downsample_factor,
        'input_type': 'raw',
        'interpolation': args.interpolation,
        'kmer': args.kmer_len,
        'normalisation': args.normalisation,
        'section': 'template',
        'trim': args.trim,
        'alphabet': args.alphabet,
    }
    # Fraction of zero labels in each row, pooled over all reads
    blanks_per_chunk = np.concatenate([(lab == 0).mean(1) for lab in label_list])
    blanks = np.percentile(blanks_per_chunk, args.blanks_percentile)
    util.create_labelled_chunks_hdf5(args.output, blanks, hdf5_attributes, chunk_list, label_list, bad_list)