def concat_bam(in_fns, out_fn):
    """Concat input bam files to an output bam file.

    Note that each input bam has ONLY one reference sequence, which is why
    the tid of every record read from the i-th input is overwritten with i.
    NOTE(review): this presumably relies on concat_bam_header listing one
    reference per input file, in the same order as in_fns -- confirm.

    in_fns --- input bam file names, concatenated in the given order.
    out_fn --- output bam file name.

    The output writer and every input file are closed even if reading or
    writing fails part way through.
    """
    # construct sam header
    h = concat_bam_header(in_fns)
    o = BamWriter(out_fn, header=h)
    try:
        for index, in_fn in enumerate(in_fns):
            s = Samfile(in_fn, 'rb')
            try:
                for r in s:
                    r.tid = index  # Overwrite tid !!!
                    o.write(r)
            finally:
                # Release the input handle even when a record fails to copy.
                s.close()
    finally:
        # Always close the output so a failure does not leak the writer.
        o.close()
def concat_bam(in_fns, out_fn):
    """Concat input bam files to an output bam file.

    Note that each input bam has ONLY one reference sequence, which is why
    the tid of every record read from the i-th input is overwritten with i.
    NOTE(review): this presumably relies on concat_bam_header listing one
    reference per input file, in the same order as in_fns -- confirm.

    in_fns --- input bam file names, concatenated in the given order.
    out_fn --- output bam file name.

    The output writer and every input file are closed even if reading or
    writing fails part way through.
    """
    # construct sam header
    h = concat_bam_header(in_fns)
    o = BamWriter(out_fn, header=h)
    try:
        for index, in_fn in enumerate(in_fns):
            s = pysam.Samfile(in_fn, 'rb')
            try:
                for r in s:
                    r.tid = index  # Overwrite tid !!!
                    o.write(r)
            finally:
                # Release the input handle even when a record fails to copy.
                s.close()
    finally:
        # Always close the output so a failure does not leak the writer.
        o.close()
def trim_subreads_and_write(reader, in_seqids, out_file, trim_len, min_len,
                            ignore_keyerror=False, bam=False):
    """
    Extract (dump) raw subreads of every zmw in in_seqids from reader to
    out_file, trimming trim_len bases from both ends of each subread.

    reader --- provides random access to raw subreads in input file.
               type = MetaSubreadFastaReader, when input files are in FASTA,
               and reads are in format <movie>/<holeNumber>/<subread or CCS>.
               type = BamCollection, when input files are in BAM.
    in_seqids --- ids of zmws to dump; only the leading
                  <movie>/<holeNumber> part of each id is used, and each
                  zmw is dumped at most once.
    out_file --- a FASTA file when input files are in FASTA;
                 a BAM file when input files are in BAM.
    trim_len --- trim the first and last n bases of every subread
                 (applied to both FASTA and BAM input).
    min_len --- minimum length left after trimming; a subread is written
                only if its length is >= 2 * trim_len + min_len.
    ignore_keyerror --- if True, log a warning and skip a zmw for which a
                        KeyError is raised while dumping it; otherwise
                        raise ValueError.
    bam --- True when reader/out_file are BAM, False when they are FASTA.

    return movies seen

    raise ValueError if a FASTA subread name is not of the form
    <movie>/<holeNumber>/<start>_<end>, or if a zmw is missing and
    ignore_keyerror is False.
    """
    movies = set()
    zmw_seen = set()
    f = None  # output open file handler
    if bam:
        assert isinstance(reader, BamCollection)
        f = BamWriter(out_file, reader.header)
    else:
        assert isinstance(reader, MetaSubreadFastaReader)
        f = FastaWriter(out_file)

    # Shortest raw subread that still has min_len bases after trimming.
    min_raw_len = 2 * trim_len + min_len

    try:
        for seqid in in_seqids:
            # split/join cannot raise ValueError, so no handler is needed;
            # an id without a '/' is simply used unchanged as the zmw key.
            zmw = '/'.join(seqid.split('/')[0:2])
            if zmw in zmw_seen:
                continue
            movies.add(zmw.split('/')[0])
            zmw_seen.add(zmw)
            try:
                if bam:
                    for rec in reader[zmw].subreads:
                        if len(rec) >= min_raw_len:
                            f.write(rec.Clip(rec.readStart + trim_len,
                                             rec.readEnd - trim_len))
                else:
                    for rec in reader[zmw]:
                        if len(rec) < min_raw_len:
                            continue
                        # Only name parsing is guarded, so that a failure
                        # while writing is not misreported as a bad name.
                        try:
                            m, hn, s_e = rec.name.split('/')
                            s, e = [int(x) for x in s_e.split('_')]
                        except ValueError:
                            raise ValueError(
                                "%s is not a valid pacbio subread." % rec.name)
                        new_id = "%s/%s/%d_%d" % (m, hn,
                                                  s + trim_len, e - trim_len)
                        seq = rec.sequence
                        # Explicit end index: seq[trim_len:-trim_len] would
                        # yield an empty string when trim_len == 0.
                        f.writeRecord(new_id,
                                      seq[trim_len:len(seq) - trim_len])
            except KeyError:
                if ignore_keyerror:
                    logging.warning(
                        "Ignoring {zmw} because the input FASTA/BAM ".format(
                            zmw=zmw) + " does not contain it.")
                else:
                    raise ValueError("{0} doesn't exist. Abort!".format(zmw))
    finally:
        # Close the output even when dumping aborts with an exception.
        f.close()
    return movies
def trim_subreads_and_write(reader, in_seqids, out_file, trim_len, min_len,
                            ignore_keyerror=False, bam=False):
    """
    Extract (dump) raw subreads of every zmw in in_seqids from reader to
    out_file, trimming trim_len bases from both ends of each subread.

    reader --- provides random access to raw subreads in input file.
               type = MetaSubreadFastaReader, when input files are in FASTA,
               and reads are in format <movie>/<holeNumber>/<subread or CCS>.
               type = BamCollection, when input files are in BAM.
    in_seqids --- ids of zmws to dump; only the leading
                  <movie>/<holeNumber> part of each id is used, and each
                  zmw is dumped at most once.
    out_file --- a FASTA file when input files are in FASTA;
                 a BAM file when input files are in BAM.
    trim_len --- trim the first and last n bases of every subread
                 (applied to both FASTA and BAM input).
    min_len --- minimum length left after trimming; a subread is written
                only if its length is >= 2 * trim_len + min_len.
    ignore_keyerror --- if True, log a warning and skip a zmw for which a
                        KeyError is raised while dumping it; otherwise
                        raise ValueError.
    bam --- True when reader/out_file are BAM, False when they are FASTA.

    return movies seen

    raise ValueError if a FASTA subread name is not of the form
    <movie>/<holeNumber>/<start>_<end>, or if a zmw is missing and
    ignore_keyerror is False.
    """
    movies = set()
    zmw_seen = set()
    f = None  # output open file handler
    if bam:
        assert isinstance(reader, BamCollection)
        f = BamWriter(out_file, reader.header)
    else:
        assert isinstance(reader, MetaSubreadFastaReader)
        f = FastaWriter(out_file)

    # Shortest raw subread that still has min_len bases after trimming.
    threshold = 2 * trim_len + min_len

    try:
        for seqid in in_seqids:
            # split/join cannot raise ValueError, so no handler is needed;
            # an id without a '/' is simply used unchanged as the zmw key.
            zmw = '/'.join(seqid.split('/')[0:2])
            if zmw in zmw_seen:
                continue
            movies.add(zmw.split('/')[0])
            zmw_seen.add(zmw)
            try:
                if bam:
                    for rec in reader[zmw].subreads:
                        if len(rec) >= threshold:
                            f.write(rec.Clip(rec.readStart + trim_len,
                                             rec.readEnd - trim_len))
                else:
                    for rec in reader[zmw]:
                        if len(rec) < threshold:
                            continue
                        # Only name parsing is guarded, so that a failure
                        # while writing is not misreported as a bad name.
                        try:
                            m, hn, s_e = rec.name.split('/')
                            s, e = [int(x) for x in s_e.split('_')]
                        except ValueError:
                            raise ValueError(
                                "%s is not a valid pacbio subread." % rec.name)
                        new_id = "%s/%s/%d_%d" % (m, hn,
                                                  s + trim_len, e - trim_len)
                        bases = rec.sequence
                        # Explicit end index: bases[trim_len:-trim_len] would
                        # yield an empty string when trim_len == 0.
                        f.writeRecord(new_id,
                                      bases[trim_len:len(bases) - trim_len])
            except KeyError:
                if ignore_keyerror:
                    logging.warning("Ignoring {zmw} because the input FASTA/BAM ".
                                    format(zmw=zmw) + " does not contain it.")
                else:
                    raise ValueError("{0} doesn't exist. Abort!".format(zmw))
    finally:
        # Close the output even when dumping aborts with an exception.
        f.close()
    return movies