def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file, reading QVs
    from the input ccs.h5, ccs.bam or ccs FOFN.

    Parameters:
      in_fa -- input FASTA, read through ContigSetReaderWrapper.
      ccs_fofn -- a ccs.h5 file, a ccs.bam file, or a FOFN of such files.
      out_fq -- output FASTQ file, written through FastqWriter.

    Raises:
      IOError -- if the format of ccs_fofn is neither H5 nor BAM, or if a
                 read of in_fa can not be located in the H5 input.
      ValueError -- if a read's sequence and its QVs differ in length.
    """
    ccs_fns = get_files_from_file_or_fofn(ccs_fofn)
    fmt = guess_file_format(ccs_fns)

    # One open BasH5Reader per bas.h5 file; only populated for H5 input.
    bas_handlers = {}
    if fmt == FILE_FORMATS.H5:
        qver = basQVcacher()
        for ccs_fn in ccs_fns:
            qver.add_bash5(ccs_fn)
    elif fmt == FILE_FORMATS.BAM:
        qver = BamCollection(*ccs_fns)
    else:
        raise IOError("ice_fa2fq does not support input %s." % ccs_fofn)

    # try/finally guarantees the QV sources are closed even when a read is
    # missing or a sequence/QV length mismatch aborts the conversion.
    try:
        with ContigSetReaderWrapper(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                logging.debug("Getting QVs for {name} ...".format(name=r.name))
                # The read id is the first space-delimited token of the name.
                seqid = r.name.split(' ')[0]
                parsed_read_name = _Parsed_Read_Name(seqid)
                if fmt == FILE_FORMATS.H5:
                    try:
                        bas_file = qver.bas_files[parsed_read_name.movie][seqid]
                        if bas_file not in bas_handlers:
                            bas_handlers[bas_file] = BasH5Reader(bas_file)
                    except KeyError:
                        raise IOError("Could not read {s} from {f}.".format(
                            s=seqid, f=ccs_fofn))
                    qvs = get_qv_from_bas_handler(
                        bas_handler=bas_handlers[bas_file],
                        parsed_read_name=parsed_read_name,
                        qv_name="QualityValue")
                else:
                    # Only BAM can reach here; other formats were rejected
                    # before the reader was opened.
                    qvs = get_qvs_from_bam(reader=qver,
                                           parsed_read_name=parsed_read_name,
                                           qv_name="QualityValue")

                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".format(
                            r=r.name))
                writer.writeRecord(r.name, r.sequence[:], qvs)
    finally:
        if fmt == FILE_FORMATS.H5:
            # .items() (not .iteritems()) works on both Python 2 and 3.
            for bas_file, bas_handler in bas_handlers.items():
                logging.debug(
                    "Closing {bas_file} ...".format(bas_file=bas_file))
                bas_handler.close()
        elif fmt == FILE_FORMATS.BAM:
            qver.close()
def test_bam(self):
    """Load CCS_BAM into a basQVcacher, precache every read id, and check
    the InsertionQV count and the first DeletionQV value of READ_ID."""
    # NOTE(review): the test is named test_bam but loads its input through
    # add_bash5 -- confirm this is intended.
    cacher = basQVcacher()
    cacher.add_bash5(CCS_BAM)

    read_ids = list(_get_read_ids())
    cacher.precache(read_ids)

    # Fetch the InsertionQV of every precached read, in read-id order.
    insertion_qvs = [cacher.get(read_id, "InsertionQV")
                     for read_id in read_ids]

    deletion_qv = cacher.get(READ_ID, "DeletionQV")
    first_dqv = "%.5f" % deletion_qv[0]
    self.assertEqual(first_dqv, "0.01995")
    self.assertEqual(len(insertion_qvs), 251)
def test_bam(self):
    """Exercise basQVcacher on CCS_BAM: precache all read ids, collect the
    InsertionQV of each one, and verify one DeletionQV value plus the
    number of reads collected."""
    # NOTE(review): named test_bam, yet the input is added via add_bash5 --
    # confirm this is intended.
    qv_cache = basQVcacher()
    qv_cache.add_bash5(CCS_BAM)

    all_ids = list(_get_read_ids())
    qv_cache.precache(all_ids)

    # One InsertionQV lookup per read id, preserving the id order.
    ins_qvs = [qv_cache.get(rid, "InsertionQV") for rid in all_ids]

    del_qv = qv_cache.get(READ_ID, "DeletionQV")
    formatted = "%.5f" % del_qv[0]
    self.assertEqual(formatted, "0.01995")
    self.assertEqual(len(ins_qvs), 251)
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file, reading QVs
    from the input ccs.h5, ccs.bam or ccs FOFN.

    Parameters:
      in_fa -- input FASTA, read through ContigSetReaderWrapper.
      ccs_fofn -- a ccs.h5 file, a ccs.bam file, or a FOFN of such files.
      out_fq -- output FASTQ file, written through FastqWriter.

    Raises:
      IOError -- if the format of ccs_fofn is neither H5 nor BAM, or if a
                 read of in_fa can not be located in the H5 input.
      ValueError -- if a read's sequence and its QVs differ in length.
    """
    ccs_fns = get_files_from_file_or_fofn(ccs_fofn)
    fmt = guess_file_format(ccs_fns)

    # One open BasH5Reader per bas.h5 file; only populated for H5 input.
    bas_handlers = {}
    if fmt == FILE_FORMATS.H5:
        qver = basQVcacher()
        for ccs_fn in ccs_fns:
            qver.add_bash5(ccs_fn)
    elif fmt == FILE_FORMATS.BAM:
        qver = BamCollection(*ccs_fns)
    else:
        raise IOError("ice_fa2fq does not support input %s." % ccs_fofn)

    # try/finally guarantees the QV sources are closed even when a read is
    # missing or a sequence/QV length mismatch aborts the conversion.
    try:
        with ContigSetReaderWrapper(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                logging.debug("Getting QVs for {name} ...".format(name=r.name))
                # The read id is the first space-delimited token of the name.
                seqid = r.name.split(' ')[0]
                parsed_read_name = _Parsed_Read_Name(seqid)
                if fmt == FILE_FORMATS.H5:
                    try:
                        bas_file = qver.bas_files[parsed_read_name.movie][seqid]
                        if bas_file not in bas_handlers:
                            bas_handlers[bas_file] = BasH5Reader(bas_file)
                    except KeyError:
                        raise IOError("Could not read {s} from {f}.".
                                      format(s=seqid, f=ccs_fofn))
                    qvs = get_qv_from_bas_handler(
                        bas_handler=bas_handlers[bas_file],
                        parsed_read_name=parsed_read_name,
                        qv_name="QualityValue")
                else:
                    # Only BAM can reach here; other formats were rejected
                    # before the reader was opened.
                    qvs = get_qvs_from_bam(reader=qver,
                                           parsed_read_name=parsed_read_name,
                                           qv_name="QualityValue")

                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".
                        format(r=r.name))
                writer.writeRecord(r.name, r.sequence[:], qvs)
    finally:
        if fmt == FILE_FORMATS.H5:
            # .items() (not .iteritems()) works on both Python 2 and 3.
            for bas_file, bas_handler in bas_handlers.items():
                logging.debug(
                    "Closing {bas_file} ...".format(bas_file=bas_file))
                bas_handler.close()
        elif fmt == FILE_FORMATS.BAM:
            qver.close()