コード例 #1
0
ファイル: stats.py プロジェクト: xuwei684/ngsutils
    def __init__(self, bed, gtf_file=None, names=False):
        """Tally summary statistics over every region yielded by ``bed``.

        bed      -- iterable of region objects; ``start``, ``end`` and
                    ``chrom`` are read from every region, ``name`` only when
                    ``names`` is set, and ``strand`` only when a tagger exists
        gtf_file -- when truthy, handed to ``RegionTagger`` and every region
                    is then registered with that tagger
        names    -- when truthy, count how often each region name occurs

        After construction the instance carries:
          total   -- number of regions seen
          size    -- sum of (end - start) over all regions
          lengths -- ``Counts`` accumulator fed each region length
          refs    -- region count per ``chrom``
          names   -- region count per ``name`` (left empty unless ``names``)
        """
        self.regiontagger = RegionTagger(gtf_file) if gtf_file else None

        self.total = 0
        self.size = 0
        self.lengths = Counts()
        self.refs = {}
        self.names = {}

        for region in bed:
            span = region.end - region.start

            self.total += 1
            self.size += span
            self.lengths.add(span)

            if names:
                self.names[region.name] = self.names.get(region.name, 0) + 1

            self.refs[region.chrom] = self.refs.get(region.chrom, 0) + 1

            if self.regiontagger:
                self.regiontagger.add_region(
                    region.chrom, region.start, region.end, region.strand)
コード例 #2
0
    def __init__(self, bamfile, gtf=None, region=None, delim=None, tags=None, show_all=False):
        """Scan ``bamfile`` (or one region of it) and collect read statistics.

        bamfile  -- alignment file object; this method uses its
                    ``references`` attribute and its ``fetch`` and
                    ``getrname`` methods (presumably a pysam-style handle --
                    confirm against the caller)
        gtf      -- when truthy, handed to ``RegionTagger`` so every counted,
                    mapped read is also registered with the tagger
        region   -- optional ``"ref:start-end"`` or ``"ref:pos"`` string; the
                    start is written 1-based and converted to 0-based before
                    being passed to ``bamfile.fetch``
        delim    -- when set, reference names are split on it and only the
                    first piece is used as the key in ``refs``
        tags     -- iterable of tag names; one ``FeatureBin`` is built per tag
                    and fed every counted, mapped read (None means no tags)
        show_all -- when false, paired reads that are not the first fragment
                    are skipped entirely

        Results are stored on the instance as ``total``, ``mapped``,
        ``unmapped``, ``flag_counts``, ``tagbins``, ``refs``,
        ``regiontagger`` and ``tlen_counts``.  A KeyboardInterrupt during the
        scan is caught so the partial counts are still stored.
        """
        regiontagger = None
        flag_counts = FlagCounts()

        ref = None
        start = None
        end = None

        if gtf:
            regiontagger = RegionTagger(gtf, bamfile.references, only_first_fragment=True)

        if region:
            # rsplit so that a reference name containing ':' stays intact
            ref, startend = region.rsplit(':', 1)
            if '-' in startend:
                start, end = [int(x) for x in startend.split('-')]
                start = start - 1
                sys.stderr.write('Region: %s:%s-%s\n' % (ref, start + 1, end))
            else:
                start = int(startend) - 1
                end = int(startend)
                sys.stderr.write('Region: %s:%s\n' % (ref, start + 1))

        total = 0
        mapped = 0
        unmapped = 0

        tlen_counts = {}

        # query names of multi-mapped reads that have already been counted
        names = set()
        refs = {}

        tagbins = {}
        # ``tags`` defaults to None rather than a shared mutable list
        for tag in (tags or ()):
            tagbins[tag] = FeatureBin(tag)

        # every known reference starts at zero so unseen ones are still listed
        for rname in bamfile.references:
            if delim:
                refs[rname.split(delim)[0]] = 0
            else:
                refs[rname] = 0

        # Each flag is switched off the first time a mapped read lacks the
        # tag, so the lookup is not retried for the rest of the scan.
        has_ih = True
        has_nh = True

        try:
            # region reader or whole-file reader
            if region:
                reads = bamfile.fetch(ref, start, end)
            else:
                reads = bam_iter(bamfile)

            for read in reads:
                if not show_all and read.is_paired and not read.is_read1:
                    # only operate on the first fragment
                    continue

                # Decide from either tag whether this read is multi-mapped,
                # then de-duplicate exactly once.  Checking IH and NH as two
                # separate "seen before?" tests made a read carrying both
                # tags register its name on the IH test and then be skipped
                # by the NH test, so it was never counted at all.
                multi_mapped = False

                if has_ih:
                    try:
                        multi_mapped = read.opt('IH') > 1
                    except KeyError:
                        # missing IH tag - ignore
                        if not read.is_unmapped:
                            has_ih = False

                if has_nh and not multi_mapped:
                    try:
                        multi_mapped = read.opt('NH') > 1
                    except KeyError:
                        # missing NH tag - ignore
                        if not read.is_unmapped:
                            has_nh = False

                if multi_mapped:
                    if read.qname in names:
                        # reads only count once for this...
                        continue
                    names.add(read.qname)

                flag_counts.add(read.flag)

                total += 1
                if read.is_unmapped:
                    unmapped += 1
                    continue

                mapped += 1

                if read.is_proper_pair and read.tid == read.mrnm:
                    # we don't care about reads that don't map to the same reference

                    # note: this doesn't work for RNA mapped to a reference genome...
                    # for RNA, you'd need to map to a transcript library (refseq) to get
                    # an accurate template length
                    #
                    # just skipping 'N' cigar values won't cut it either... since the pairs
                    # will likely silently span a gap.

                    # the sign of tlen is flipped for reverse-strand reads
                    if read.is_reverse:
                        k = -read.tlen
                    else:
                        k = read.tlen

                    tlen_counts[k] = tlen_counts.get(k, 0) + 1

                refname = bamfile.getrname(read.rname)

                if delim:
                    refs[refname.split(delim)[0]] += 1
                else:
                    refs[refname] += 1

                if regiontagger:
                    regiontagger.add_read(read, refname)

                for tag in tagbins:
                    tagbins[tag].add(read)

        except KeyboardInterrupt:
            sys.stderr.write('*** Interrupted - displaying stats up to this point! ***\n\n')

        self.total = total
        self.mapped = mapped
        self.unmapped = unmapped
        self.flag_counts = flag_counts
        self.tagbins = tagbins
        self.refs = refs
        self.regiontagger = regiontagger
        self.tlen_counts = tlen_counts