def __init__(self, hdf_group, pe=False, strand=False, allele=False):
    '''set up the junction and coverage sub-tracks stored under `hdf_group`

    Each of the two child groups (JUNCTION_GROUP and COVERAGE_GROUP) is
    reused when it is already present in the group and created through
    the owning HDF file otherwise.

    hdf_group: group handed to the parent constructor; presumably what
        backs `self.hdf_group` afterwards (set outside this block)
    pe, strand, allele: forwarded unchanged to VectorTrack
    '''
    super(RnaseqTrack, self).__init__(hdf_group)
    hdf_file = self._get_hdf_file()
    # splice junctions live in an interval track
    if JUNCTION_GROUP in self.hdf_group:
        junction_node = self.hdf_group._f_getChild(JUNCTION_GROUP)
    else:
        junction_node = hdf_file.createGroup(self.hdf_group, JUNCTION_GROUP)
    self.junc_track = IntervalTrack(junction_node,
                                    junction_dtype,
                                    junction_expectedrows)
    # genomic coverage lives in a vector track
    if COVERAGE_GROUP in self.hdf_group:
        coverage_node = self.hdf_group._f_getChild(COVERAGE_GROUP)
    else:
        coverage_node = hdf_file.createGroup(self.hdf_group, COVERAGE_GROUP)
    self.cov_track = VectorTrack(coverage_node,
                                 pe=pe,
                                 strand=strand,
                                 allele=allele)
class RnaseqTrack(Track):
    '''contains both genomic coverage data (array track) and
    splice junction data (interval track)
    '''

    def __init__(self, hdf_group, pe=False, strand=False, allele=False):
        '''open or create the junction and coverage sub-tracks

        The JUNCTION_GROUP and COVERAGE_GROUP children of the group are
        reused when present and created through the owning HDF file
        otherwise.

        hdf_group: group passed to the parent constructor; presumably
            what backs `self.hdf_group` afterwards (set outside this class)
        pe, strand, allele: forwarded unchanged to VectorTrack
        '''
        super(RnaseqTrack, self).__init__(hdf_group)
        h5file = self._get_hdf_file()
        if JUNCTION_GROUP not in self.hdf_group:
            junc_group = h5file.createGroup(self.hdf_group, JUNCTION_GROUP)
        else:
            junc_group = self.hdf_group._f_getChild(JUNCTION_GROUP)
        self.junc_track = IntervalTrack(junc_group, junction_dtype,
                                        junction_expectedrows)
        if COVERAGE_GROUP not in self.hdf_group:
            cov_group = h5file.createGroup(self.hdf_group, COVERAGE_GROUP)
        else:
            cov_group = self.hdf_group._f_getChild(COVERAGE_GROUP)
        self.cov_track = VectorTrack(cov_group, pe=pe, strand=strand,
                                     allele=allele)

    def get_junction_track(self):
        '''return a new IntervalTrack wrapping the junction group'''
        junc_group = self.hdf_group._f_getChild(JUNCTION_GROUP)
        return IntervalTrack(junc_group)

    def get_coverage_track(self):
        '''return a new VectorTrack wrapping the coverage group'''
        cov_group = self.hdf_group._f_getChild(COVERAGE_GROUP)
        return VectorTrack(cov_group)

    def fromtophat(self, accepted_hits_bam, junctions_bed,
                   max_multimaps=None, flip_read2_strand=True):
        '''populate the junction and coverage tracks from TopHat output

        accepted_hits_bam: path of the BAM file, opened with pysam
        junctions_bed: path of the junctions BED file
        max_multimaps: forwarded to BamCoverageIterator
        flip_read2_strand: forwarded to BamCoverageIterator

        Junctions whose reference name is not among `self.get_rnames()`
        are skipped (logged at debug level).  Both input files are closed
        before returning, including when an error is raised.
        '''
        # insert splice junction track
        track_name = self.hdf_group._v_name
        rnames = set(self.get_rnames())
        logging.info("[RnaseqTrack] adding junctions")
        # the loop runs inside the 'with' so the BED file stays open while
        # the junction iterator is consumed and is closed afterwards
        with open(junctions_bed) as bed_fh:
            for junc in tophat_bed_to_juncs(track_name, bed_fh):
                if junc[REF_COL_NAME] not in rnames:
                    logging.debug('Skipping junc %s' % str(junc))
                    continue
                # ids are assigned sequentially from the current track size
                junc['id'] = self.junc_track.num_intervals
                self.junc_track.add(junc)
        self.junc_track.index(persist=True)
        # insert coverage track
        logging.info("creating coverage track")
        bamfh = pysam.Samfile(accepted_hits_bam, "rb")
        try:
            # NOTE: the TopHat --max-multihits setting is not read from the
            # BAM header; the caller's max_multimaps value is used as given
            intervalcoviter = BamCoverageIterator(
                bamfh,
                norm_rlen=True,
                num_hits_tag="NH",
                hit_prob_tag=None,
                max_multimaps=max_multimaps,
                keep_dup=True,
                keep_qcfail=False,
                flip_read2_strand=flip_read2_strand)
            self.cov_track.fromintervals(intervalcoviter)
            # report coverage statistics to allow normalization calculations
            stats = intervalcoviter.stats
            logging.debug("\tProcessed '%d' valid reads" % (stats.num_reads))
            logging.debug("\tTotal coverage '%f'" % (stats.total_cov))
        finally:
            # release the BAM handle even if coverage building fails
            bamfh.close()