def test_002_from_file(self):
     # Build a strand list naming every file that a non-recursive scan of
     # self.path yields, then check that iterate_fast5 restricted to that
     # strand list returns three paths.
     tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
     try:
         with open(tmp_file, 'w') as fh:
             # Two tab-separated columns: 'filename' and a filler column.
             fh.write('filename\tjunk\n')
             for i, fname in enumerate(iterate_fast5(self.path, paths=True)):
                 fh.write('{}\t{}\n'.format(os.path.basename(fname), i))
         # Materialise the iterator while the strand list still exists.
         fnames = list(
             iterate_fast5(self.path, paths=True, strand_list=tmp_file))
     finally:
         # Do not leave the scratch strand list behind in the temp directory,
         # whether the test body succeeded or raised.
         if os.path.exists(tmp_file):
             os.remove(tmp_file)
     self.assertEqual(len(fnames), 3)
Example #2
0
def raw_chunkify_with_identity_main(args):
    """Entry point for `chunkify.py raw_identity`: build a batch file for model training.

    Every fast5 path yielded by ``iterate_fast5`` for ``args.input_folder`` is
    handed to ``raw_chunk_worker`` through ``imap_mp``.  Each non-``None``
    result is a ``(chunks, labels, bad_ev)`` triple; the triples are collected
    and written to ``args.output`` by ``util.create_labelled_chunks_hdf5``.

    Exits with status 1 when ``args.output`` already exists and
    ``args.overwrite`` is false, or when no worker returned a result.
    """
    if not args.overwrite and os.path.exists(args.output):
        print("Cowardly refusing to overwrite {}".format(args.output))
        sys.exit(1)

    reads = iterate_fast5(args.input_folder, paths=True, limit=args.limit,
                          strand_list=args.input_strand_list)

    print('* Processing data using', args.jobs, 'threads')

    # Attributes of `args` forwarded to every worker call as keyword arguments.
    worker_arg_names = ['chunk_len', 'kmer_len', 'min_length', 'trim',
                        'normalisation', 'downsample_factor', 'interpolation']
    results = imap_mp(raw_chunk_worker, reads, threads=args.jobs,
                      unordered=True,
                      fix_kwargs=util.get_kwargs(args, worker_arg_names),
                      init=batch.init_chunk_identity_worker,
                      initargs=[args.kmer_len, args.alphabet])

    n_done = 0
    all_chunks, all_labels, all_bad = [], [], []
    for result in results:
        if result is None:
            # Worker produced nothing for this read; skip it.
            continue
        n_done = util.progress_report(n_done)

        chunks, labels, bad_ev = result
        all_chunks.append(chunks)
        all_labels.append(labels)
        all_bad.append(bad_ev)

    if not all_chunks:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)

    print('\n* Writing out to HDF5')
    hdf5_attributes = {
        'chunk': args.chunk_len,
        'downsample_factor': args.downsample_factor,
        'input_type': 'raw',
        'interpolation': args.interpolation,
        'kmer': args.kmer_len,
        'normalisation': args.normalisation,
        'section': 'template',
        'trim': args.trim,
        'alphabet': args.alphabet,
    }
    # Fraction of zero-valued labels in each chunk (mean over axis 1), then the
    # requested percentile of those fractions is passed on as `blanks`.
    zero_fraction = np.concatenate(
        [(label_block == 0).mean(1) for label_block in all_labels])
    blanks = np.percentile(zero_fraction, args.blanks_percentile)
    util.create_labelled_chunks_hdf5(args.output, blanks, hdf5_attributes,
                                     all_chunks, all_labels, all_bad)
Example #3
0
def chunkify_with_identity_main(args):
    """Build a labelled-chunk HDF5 file with 'events' input type.

    Every fast5 path yielded by ``iterate_fast5`` for ``args.input_folder`` is
    handed to ``batch.chunk_worker`` through ``imap_mp``.  Each non-``None``
    result is a ``(chunks, labels, bad_ev)`` triple; the triples are collected
    and written to ``args.output`` by ``util.create_labelled_chunks_hdf5``,
    using ``args.blanks`` as given.

    Exits with status 1 when ``args.output`` already exists and
    ``args.overwrite`` is false, or when no worker returned a result.
    """
    if not args.overwrite and os.path.exists(args.output):
        print("Cowardly refusing to overwrite {}".format(args.output))
        sys.exit(1)

    reads = iterate_fast5(args.input_folder, paths=True, limit=args.limit,
                          strand_list=args.input_strand_list)

    print('* Processing data using', args.jobs, 'threads')

    # Attributes of `args` forwarded to every worker call as keyword arguments.
    worker_arg_names = ['section', 'chunk_len', 'kmer_len', 'min_length',
                        'trim', 'use_scaled', 'normalisation']
    results = imap_mp(batch.chunk_worker, reads, threads=args.jobs,
                      unordered=True,
                      fix_kwargs=util.get_kwargs(args, worker_arg_names),
                      init=batch.init_chunk_identity_worker,
                      initargs=[args.kmer_len, args.alphabet])

    n_done = 0
    all_chunks, all_labels, all_bad = [], [], []
    for result in results:
        if result is None:
            # Worker produced nothing for this read; skip it.
            continue
        n_done = util.progress_report(n_done)

        chunks, labels, bad_ev = result
        all_chunks.append(chunks)
        all_labels.append(labels)
        all_bad.append(bad_ev)

    if not all_chunks:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)

    print('\n* Writing out to HDF5')
    hdf5_attributes = {
        'chunk': args.chunk_len,
        'input_type': 'events',
        'kmer': args.kmer_len,
        'normalisation': args.normalisation,
        'scaled': args.use_scaled,
        'section': args.section,
        'trim': args.trim,
        'alphabet': args.alphabet,
    }
    util.create_labelled_chunks_hdf5(args.output, args.blanks, hdf5_attributes,
                                     all_chunks, all_labels, all_bad)
Example #4
0
def main(argv):
    """Extract a reference per read and write the results to ``args.output``.

    ``argv[1:]`` is parsed with the module-level ``parser``.  Each fast5 path
    is handed to ``reference_extraction_worker`` through ``imap_mp``; for every
    non-``None`` ``(file_name, reference)`` result a ``>``-prefixed header
    line (the file's base name without its extension) and the reference line
    are written to the output file.

    Exits with status 1 when ``args.output`` already exists and
    ``args.overwrite`` is false.
    """
    args = parser.parse_args(argv[1:])

    if not args.overwrite and os.path.exists(args.output):
        print("Cowardly refusing to overwrite {}".format(args.output))
        sys.exit(1)

    reads = iterate_fast5(args.input_folder, paths=True, limit=args.limit,
                          strand_list=args.input_strand_list)

    print('* Processing data using', args.jobs, 'threads')

    n_done = 0
    # Attributes of `args` forwarded to every worker call as keyword arguments.
    worker_arg_names = ['section']
    with open(args.output, 'w') as out_fh:
        results = imap_mp(reference_extraction_worker, reads,
                          threads=args.jobs, unordered=True,
                          fix_kwargs=util.get_kwargs(args, worker_arg_names))
        for result in results:
            if result is None:
                # Worker produced nothing for this read; skip it.
                continue
            n_done = util.progress_report(n_done)
            file_name, reference = result
            read_name = os.path.basename(os.path.splitext(file_name)[0])
            out_fh.write('>{}\n'.format(read_name))
            out_fh.write(reference + '\n')
Example #5
0
 def test_iterate_works_with_strandlist(self):
     # Iterating self.readdir restricted by self.strand_list must yield
     # exactly the paths held in self.strands.
     fast5_files = set(
         iterate_fast5(self.readdir,
                       paths=True,
                       strand_list=self.strand_list))
     # assertEqual (rather than assertTrue on ==) reports the differing
     # elements when the comparison fails.
     self.assertEqual(self.strands, fast5_files)
Example #6
0
 def test_iterate_respects_limits(self):
     # With limit=_LIMIT the iterator must yield exactly _LIMIT distinct paths.
     _LIMIT = 2
     fast5_files = set(iterate_fast5(self.readdir, paths=True,
                                     limit=_LIMIT))
     # assertEqual shows both the actual and the expected count on failure.
     self.assertEqual(len(fast5_files), _LIMIT)
Example #7
0
 def test_iterate_returns_all(self):
     # Without a limit or strand list, the iterator must yield the same
     # paths as globbing '*.fast5' directly inside self.readdir.
     fast5_files = set(iterate_fast5(self.readdir, paths=True))
     dir_list = set(glob.glob(os.path.join(self.readdir, '*.fast5')))
     # assertEqual reports which paths are missing or unexpected on failure.
     self.assertEqual(fast5_files, dir_list)
Example #8
0
def raw_chunkify_with_remap_main(args):
    """ Main function for `chunkify.py raw_remap` producing batch file for model training

    Every fast5 path yielded by ``iterate_fast5`` for ``args.input_folder`` is
    handed to ``raw_chunk_remap_worker`` through ``imap_mp``.  For each
    non-``None`` result one tab-separated row is appended to
    ``args.output_strand_list`` and the chunks, labels and ``bad_ev`` values
    are collected; the collected data is finally written to ``args.output`` by
    ``util.create_labelled_chunks_hdf5``.

    Exit statuses: 1 if ``args.output`` exists and ``args.overwrite`` is
    false, 2 if ``args.output_strand_list`` exists and ``args.overwrite`` is
    false, 1 if no worker returned a result.
    """
    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)
        if os.path.exists(args.output_strand_list):
            print("Cowardly refusing to overwrite {}".format(args.output_strand_list))
            sys.exit(2)

    fast5_files = iterate_fast5(args.input_folder, paths=True, limit=args.limit,
                                strand_list=args.input_strand_list)

    references = util.fasta_file_to_dict(args.references)

    print('* Processing data using', args.jobs, 'threads')

    # Attributes of `args` forwarded to every worker call as keyword arguments,
    # together with the loaded references.
    kwarg_names = ['trim', 'min_prob', 'kmer_len', 'min_length',
                   'prior', 'slip', 'chunk_len', 'normalisation', 'downsample_factor',
                   'interpolation', 'open_pore_fraction']
    kwargs = util.get_kwargs(args, kwarg_names)
    kwargs['references'] = references

    i = 0
    compiled_file = helpers.compile_model(args.model, args.compile)
    bad_list = []
    chunk_list = []
    label_list = []

    try:
        # NOTE(review): when the strand list already exists and overwrite is
        # set, rows are appended to the old file and no header is written --
        # confirm this is intended.
        if not os.path.isfile(args.output_strand_list):
            header_line = '\t'.join(['filename', 'nblocks', 'score', 'nstay', 'seqlen', 'start', 'end']) + '\n'
            with open(args.output_strand_list, 'wt') as slfh:
                slfh.write(header_line)

        for res in imap_mp(raw_chunk_remap_worker, fast5_files, threads=args.jobs,
                           fix_kwargs=kwargs, unordered=True, init=batch.init_chunk_remap_worker,
                           initargs=[compiled_file, args.kmer_len, args.alphabet]):
            if res is None:
                continue
            i = util.progress_report(i)

            read, score, nblocks, path, seq, chunks, labels, bad_ev = res

            chunk_list.append(chunks)
            label_list.append(labels)
            bad_list.append(bad_ev)
            # Columns follow the header: filename, nblocks, score (negated and
            # divided by nblocks), nstay (count of zero first differences in
            # path, with 1 prepended), seqlen, start, end.
            strand_data = [read, nblocks, -score / nblocks,
                           np.sum(np.ediff1d(path, to_begin=1) == 0),
                           len(seq), min(path), max(path)]

            data_line = '\t'.join([str(x) for x in strand_data]) + '\n'
            # Reopened in append mode for every row, so each row is written
            # out as soon as its read has been processed.
            with open(args.output_strand_list, 'at') as slfh:
                slfh.write(data_line)
    finally:
        # Remove the compiled model unless it is the path supplied in
        # args.compile.  Done in `finally` so the file is not left behind when
        # processing raises part-way through.
        if compiled_file != args.compile:
            os.remove(compiled_file)

    if not chunk_list:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)

    print('\n* Writing out to HDF5')
    hdf5_attributes = {
        'chunk': args.chunk_len,
        'downsample_factor': args.downsample_factor,
        'input_type': 'raw',
        'interpolation': args.interpolation,
        'kmer': args.kmer_len,
        'normalisation': args.normalisation,
        'section': 'template',
        'trim': args.trim,
        'alphabet': args.alphabet,
    }
    # Fraction of zero-valued labels in each chunk (mean over axis 1), then the
    # requested percentile of those fractions is passed on as `blanks`.
    blanks_per_chunk = np.concatenate([(l == 0).mean(1) for l in label_list])
    blanks = np.percentile(blanks_per_chunk, args.blanks_percentile)
    util.create_labelled_chunks_hdf5(args.output, blanks, hdf5_attributes, chunk_list, label_list, bad_list)
 def test_001_recursive(self):
     # A recursive scan of self.path is expected to yield five paths.
     found = iterate_fast5(self.path, paths=True, recursive=True)
     n_found = len(list(found))
     self.assertEqual(n_found, 5)
 def test_000_single_layer(self):
     # A scan of self.path without recursive=True is expected to yield three paths.
     found = iterate_fast5(self.path, paths=True)
     n_found = len(list(found))
     self.assertEqual(n_found, 3)
Example #11
0
        ]
    else:
        kwarg_names = [
            'trim', 'open_pore_fraction', 'kmer_len', 'transducer', 'bad',
            'min_prob', 'skip', 'trans', 'alphabet'
        ]

    compiled_file = helpers.compile_model(args.model, args.compile)

    seq_printer = basecall.SeqPrinter(args.kmer_len,
                                      datatype=args.datatype,
                                      transducer=args.transducer,
                                      alphabet=args.alphabet.decode('ascii'))

    files = iterate_fast5(args.input_folder,
                          paths=True,
                          limit=args.limit,
                          strand_list=args.input_strand_list)
    nbases = nevents = 0
    t0 = time.time()
    for res in imap_mp(basecall_worker,
                       files,
                       threads=args.jobs,
                       fix_kwargs=util.get_kwargs(args, kwarg_names),
                       unordered=True,
                       init=basecall.init_worker,
                       initargs=[compiled_file]):
        if res is None:
            continue
        read, score, call, nev = res
        seq_len = seq_printer.write(read, score, call, nev)
        nbases += seq_len