コード例 #1
0
ファイル: models.py プロジェクト: xuwei684/ngsutils
    def __init__(self, fname):
        """Build the GTF reader and probe its first gene for optional columns.

        fname is handed to GTF(). The attributes of the first gene decide
        whether 'isoform_id' and 'gene_biotype' are reported by this model.
        """
        self.fname = fname
        Model.__init__(self)

        self.gtf = GTF(self.fname)
        # The next() builtin works on Python 2.6+ and 3 (the .next() method is
        # Python 2 only). The None default keeps a GTF without any gene from
        # raising StopIteration out of the constructor.
        first_gene = next(self.gtf.genes, None)
        attributes = first_gene.attributes if first_gene is not None else {}

        self.has_isoform = 'isoform_id' in attributes
        self.has_biotype = 'gene_biotype' in attributes
コード例 #2
0
    def testGTFIso(self):
        src = StringIO.StringIO('''\
chr1|test|exon|1001|1100|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|exon|1201|1300|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|exon|1401|1500|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|exon|1001|1100|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|exon|1401|1500|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
'''.replace('|', '\t'))

        gtf = GTF(fileobj=src, quiet=True)
        genes = list(gtf.genes)
        self.assertEqual(len(genes), 1)
        self.assertEqual(['foo1'], [g.gene_id for g in genes])
        self.assertEqual(['iso1'], [
            g.gid for g in genes
        ])  # gid is the actual unique id (should be constant for isoforms)
        transcripts = list(genes[0].transcripts)
        self.assertEqual(['bar1', 'bar2'],
                         [t.transcript_id for t in transcripts])
        self.assertEqual(len(transcripts), 2)
        self.assertEqual(list(genes[0].regions),
                         [(1, 1000, 1100, True, 'bar1,bar2'),
                          (2, 1200, 1300, False, 'bar1'),
                          (3, 1400, 1500, True, 'bar1,bar2')])
コード例 #3
0
    def testFind(self):
        src = StringIO.StringIO('''\
chr1|test|gene|1001|1100|0|+|.|gene_id "foo1"; transcript_id "bar1";
chr1|test|gene|2001|2100|0|+|.|gene_id "foo2"; transcript_id "bar2";
chr1|test|gene|3001|3100|0|+|.|gene_id "foo3"; transcript_id "bar3";
'''.replace('|', '\t'))

        gtf = GTF(fileobj=src, quiet=True)
        genes = list(gtf.genes)
        self.assertEqual(['foo1', 'foo2', 'foo3'], [x.gene_id for x in genes])

        found = False
        for gene in gtf.find('chr1', 1900, 2200):
            found = True
            self.assertEqual(gene.gid, 'foo2')
        self.assertTrue(found)

        found = False
        for gene in gtf.find('chr1', 1900, 2000):
            found = True
            self.assertEqual(gene.gid, 'foo2')
        self.assertTrue(found)

        found = False
        for gene in gtf.find('chr1', 2050, 2200):
            found = True
            self.assertEqual(gene.gid, 'foo2')
        self.assertTrue(found)
コード例 #4
0
    def testJunctionsIsoforms(self):
        gtf = GTF(fileobj=StringIO.StringIO('''\
test1|test|exon|10|20|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
test1|test|exon|30|40|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
test1|test|exon|90|100|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
test1|test|exon|10|20|0|+|.|gene_id "foo1"; transcript_id "bar2"; isoform_id "iso1"
test1|test|exon|50|70|0|+|.|gene_id "foo1"; transcript_id "bar2"; isoform_id "iso1"
test1|test|exon|90|100|0|+|.|gene_id "foo1"; transcript_id "bar2"; isoform_id "iso1"
'''.replace('|', '\t')),
                  quiet=True)

        valid = '''\
>test1:16-20,29-33
ATGCGCGC
>test1:16-20,49-53
ATGCCTGA
>test1:16-20,89-93
ATGCTCGA
>test1:36-40,49-53
GATCCTGA
>test1:36-40,89-93
GATCTCGA
>test1:66-70,89-93
ATCGTCGA
'''
        out = StringIO.StringIO('')
        ngsutils.gtf.junctions.gtf_junctions(gtf,
                                             fa,
                                             fragment_size=4,
                                             min_size=8,
                                             out=out,
                                             quiet=True)
        self.assertEqual(out.getvalue(), valid)
コード例 #5
0
ファイル: models.py プロジェクト: xuwei684/ngsutils
class GTFModel(Model):
    """Model that yields one set of regions per gene of a GTF file."""

    def __init__(self, fname):
        """Build the GTF reader and probe its first gene for optional columns.

        fname is handed to GTF(). The attributes of the first gene decide
        whether 'isoform_id' and 'gene_biotype' columns are reported.
        """
        self.fname = fname
        Model.__init__(self)

        # test for what attributes we need to return
        self.gtf = GTF(self.fname)
        # The next() builtin works on Python 2.6+ and 3 (the .next() method is
        # Python 2 only). The None default keeps a GTF without any gene from
        # raising StopIteration out of the constructor.
        first_gene = next(self.gtf.genes, None)
        attributes = first_gene.attributes if first_gene is not None else {}

        self.has_isoform = 'isoform_id' in attributes
        self.has_biotype = 'gene_biotype' in attributes

    def get_source(self):
        """Return the file name this model was created with."""
        return self.fname

    def get_name(self):
        """Return the fixed model name, 'gtf'."""
        return 'gtf'

    def get_headers(self):
        """Return the column names matching the per-gene lists of get_regions."""
        headers = ['gene_id', 'gene_name']
        if self.has_isoform:
            headers.append('isoform_id')
        if self.has_biotype:
            headers.append('gene_biotype')
        headers.extend(['chrom', 'strand', 'txstart', 'txend'])
        return headers

    def get_regions(self):
        """Yield (chrom, starts, ends, strand, columns, None) for every gene.

        starts and ends hold the boundaries of every region of the gene; the
        last tuple element (the callback slot) is always None for this model.
        """
        eta = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            eta.print_status(extra=gene.gene_name)

            # just include all regions - don't worry about transcripts and
            # exons; the regions encompass all exons anyway...
            starts = []
            ends = []
            for _num, start, end, _const, _names in gene.regions:
                starts.append(start)
                ends.append(end)

            out = [gene.gene_id, gene.gene_name]
            # Genes lacking an optional attribute get an empty placeholder so
            # the columns stay aligned with get_headers().
            if self.has_isoform:
                out.append(gene.attributes['isoform_id']
                           if 'isoform_id' in gene.attributes else '')
            if self.has_biotype:
                out.append(gene.attributes['gene_biotype']
                           if 'gene_biotype' in gene.attributes else '')
            out.extend([gene.chrom, gene.strand, gene.start, gene.end])

            yield (gene.chrom, starts, ends, gene.strand, out, None)
        eta.done()
コード例 #6
0
ファイル: models.py プロジェクト: ZhangQiuxue/ngsutils
    def __init__(self, fname):
        """Create the GTF reader and detect optional attributes on its first gene.

        fname is passed to GTF(). has_isoform / has_biotype record whether the
        first gene carries 'isoform_id' / 'gene_biotype'.
        """
        self.fname = fname
        Model.__init__(self)

        self.gtf = GTF(self.fname)
        # Use the next() builtin (Python 2.6+ and 3) rather than the
        # Python-2-only .next() method; defaulting to None means a file with
        # no genes no longer raises StopIteration from the constructor.
        first_gene = next(self.gtf.genes, None)
        attributes = first_gene.attributes if first_gene is not None else {}

        self.has_isoform = 'isoform_id' in attributes
        self.has_biotype = 'gene_biotype' in attributes
コード例 #7
0
ファイル: models.py プロジェクト: ZhangQiuxue/ngsutils
class GTFModel(Model):
    """Model that yields one set of regions per gene of a GTF file."""

    def __init__(self, fname):
        """Create the GTF reader and detect optional attributes on its first gene.

        fname is passed to GTF(). has_isoform / has_biotype record whether the
        first gene carries 'isoform_id' / 'gene_biotype'.
        """
        self.fname = fname
        Model.__init__(self)

        # test for what attributes we need to return
        self.gtf = GTF(self.fname)
        # Use the next() builtin (Python 2.6+ and 3) rather than the
        # Python-2-only .next() method; defaulting to None means a file with
        # no genes no longer raises StopIteration from the constructor.
        first_gene = next(self.gtf.genes, None)
        attributes = first_gene.attributes if first_gene is not None else {}

        self.has_isoform = 'isoform_id' in attributes
        self.has_biotype = 'gene_biotype' in attributes

    def get_source(self):
        """Return the file name this model was created with."""
        return self.fname

    def get_name(self):
        """Return the fixed model name, 'gtf'."""
        return 'gtf'

    def get_headers(self):
        """Return the column names matching the per-gene lists of get_regions."""
        headers = ['gene_id', 'gene_name']
        if self.has_isoform:
            headers.append('isoform_id')
        if self.has_biotype:
            headers.append('gene_biotype')
        headers.extend(['chrom', 'strand', 'txstart', 'txend'])
        return headers

    def get_regions(self):
        """Yield (chrom, starts, ends, strand, columns, None) for every gene.

        starts and ends hold the boundaries of every region of the gene; the
        last tuple element (the callback slot) is always None for this model.
        """
        eta = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            eta.print_status(extra=gene.gene_name)

            # just include all regions - don't worry about transcripts and
            # exons; the regions encompass all exons anyway...
            starts = []
            ends = []
            for _num, start, end, _const, _names in gene.regions:
                starts.append(start)
                ends.append(end)

            out = [gene.gene_id, gene.gene_name]
            # Genes lacking an optional attribute get an empty placeholder so
            # the columns stay aligned with get_headers().
            if self.has_isoform:
                out.append(gene.attributes['isoform_id'] if 'isoform_id' in gene.attributes else '')
            if self.has_biotype:
                out.append(gene.attributes['gene_biotype'] if 'gene_biotype' in gene.attributes else '')
            out.extend([gene.chrom, gene.strand, gene.start, gene.end])

            yield (gene.chrom, starts, ends, gene.strand, out, None)
        eta.done()
コード例 #8
0
    def testGTF(self):
        src = StringIO.StringIO('''\
chr1|test|exon|1001|1100|0|+|.|gene_id "foo"; transcript_id "bar";
chr1|test|exon|1201|1300|0|+|.|gene_id "foo"; transcript_id "bar";
chr1|test|CDS|1051|1247|0|+|1|gene_id "foo"; transcript_id "bar";
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo"; transcript_id "bar";
chr1|test|stop_codon|1248|1250|0|+|.|gene_id "foo"; transcript_id "bar";
'''.replace('|', '\t'))

        gtf = GTF(fileobj=src, quiet=True)
        for gene in gtf.genes:
            self.assertEqual(gene.gene_id, 'foo')
            for transcript in gene.transcripts:
                self.assertEqual(transcript.transcript_id, 'bar')
                self.assertEqual(list(transcript.exons), [(1000, 1100),
                                                          (1200, 1300)])
コード例 #9
0
    def testGeneOnly(self):
        src = StringIO.StringIO('''\
chr1|test|gene|1001|1100|0|+|.|gene_id "foo1"; transcript_id "bar1";
chr1|test|gene|2001|2100|0|+|.|gene_id "foo2"; transcript_id "bar2";
'''.replace('|', '\t'))

        gtf = GTF(fileobj=src, quiet=True)
        genes = list(gtf.genes)
        self.assertEqual(['foo1', 'foo2'], [x.gene_id for x in genes])
        # gene start / end
        self.assertEqual((1000, 1100), (genes[0].start, genes[0].end))
        t = list(genes[0].transcripts)[0]
        # apply start / end to transcript and as an exon
        self.assertEqual((1000, 1100), (t.start, t.end))
        self.assertEqual((1000, 1100), (t.exons[0]))

        t = list(genes[1].transcripts)[0]
        # apply start / end to transcript and as an exon
        self.assertEqual((2000, 2100), (t.start, t.end))
        self.assertEqual((2000, 2100), (t.exons[0]))
コード例 #10
0
    def testGTFNoIso(self):
        src = StringIO.StringIO('''\
chr1|test|exon|1001|1100|0|+|.|gene_id "foo"; transcript_id "bar1";
chr1|test|exon|1201|1300|0|+|.|gene_id "foo"; transcript_id "bar1";
chr1|test|exon|1401|1500|0|+|.|gene_id "foo"; transcript_id "bar1";
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo"; transcript_id "bar1";
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo"; transcript_id "bar1";
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo"; transcript_id "bar1";
chr1|test|exon|1001|1100|0|+|.|gene_id "foo"; transcript_id "bar2";
chr1|test|exon|1401|1500|0|+|.|gene_id "foo"; transcript_id "bar2";
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo"; transcript_id "bar2";
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo"; transcript_id "bar2";
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo"; transcript_id "bar2";
'''.replace('|', '\t'))

        gtf = GTF(fileobj=src, quiet=True)
        genes = list(gtf.genes)
        self.assertEqual('foo', genes[0].gene_id)
        transcripts = list(genes[0].transcripts)
        self.assertEqual(len(transcripts), 2)
        self.assertEqual(list(genes[0].regions),
                         [(1, 1000, 1100, True, 'bar1,bar2'),
                          (2, 1200, 1300, False, 'bar1'),
                          (3, 1400, 1500, True, 'bar1,bar2')])
コード例 #11
0
        elif arg in ['-frag', '-min']:
            last = arg
        elif arg == '-scramble':
            scramble = True
        elif arg == '-retain-introns':
            retain_introns = True
        elif arg == '-known':
            known = True
        elif arg == '-h':
            usage()
        elif gtf is None and os.path.exists(arg):
            gtf = arg
        elif fasta is None and os.path.exists(arg):
            if not os.path.exists('%s.fai' % arg):
                usage('Missing .fai FASTA index for file: %s' % arg)
            fasta = arg

    if not gtf or not fasta:
        usage()

    if known and scramble:
        usage("You can not use both -known and -scramble at the same time!")

    gtf_junctions(GTF(gtf),
                  fasta,
                  frag_size,
                  min_size,
                  known=known,
                  scramble=scramble,
                  retain_introns=retain_introns)
コード例 #12
0
ファイル: annotate.py プロジェクト: xuwei684/ngsutils
            last = None
        elif arg == '-noheader':
            header = False
        elif arg == '-gene_id':
            gene_id = True
        elif arg == '-transcript_id':
            transcript_id = True
        elif arg == '-gene_name':
            gene_name = True
        elif arg == '-gene_location':
            gene_location = True
        elif arg in ['-ref', '-pos']:
            last = arg
        elif not gtffile and os.path.exists(arg):
            gtffile = arg
        elif not infile and os.path.exists(arg):
            infile = arg
        else:
            print 'Unknown argument: %s' % arg

    if not gtffile:
        usage('Missing GTF file')
    if not infile:
        usage('Missing input file')
    if not (gene_name or gene_location or gene_id or transcript_id):
        usage('Missing outputs - nothing to annotate')

    gtf = GTF(gtffile)
    gtf_annotate(gtf, infile, ref_col, pos_col, gene_name, gene_location,
                 gene_id, transcript_id, header)
コード例 #13
0
            maxtx = max(size, maxtx)
            maxintron = max(intron_size, maxintron)
            maxcoding = max(cds, maxcoding)
            maxutr5 = max(utr5, maxutr5)
            maxutr3 = max(utr3, maxutr3)

        cols.append(maxtx)
        cols.append(maxintron)
        cols.append(maxcoding)
        cols.append(maxutr5)
        cols.append(maxutr3)

        out.write('%s\n' % '\t'.join([str(x) for x in cols]))


if __name__ == '__main__':
    # Command-line entry point: accepts a single existing GTF file name.
    fname = None
    for arg in sys.argv[1:]:
        if arg == '-h':
            usage()
        # elif: '-h' must not fall through to the path check below, where it
        # could be taken as the input file or trigger a second usage() call.
        elif not fname and os.path.exists(arg):
            fname = arg
        else:
            # anything that is not an existing path, or a second file name
            usage()

    if not fname:
        usage()

    gtf = GTF(fname)
    gtf_genesize(gtf)
コード例 #14
0
ファイル: tobed.py プロジェクト: xuwei684/ngsutils
            genes, exons, introns, regions, tss, tlss, txs, tlxs, utr_5, utr_3,
            junc_5, junc_3, promoter
    ]:
        if arg:
            i += 1

    if i == 0:
        usage('You must select one [type] to export.')
    elif i > 1:
        usage('You must select *only one* [type] to export.')
    elif not filename:
        usage('Missing input file')
    elif promoter and not (promoter_down or promoter_up):
        usage('You must specify a valid promoter length!')

    gtf = GTF(filename)

    if genes:
        gtf_genes_tobed(gtf)
    elif exons:
        gtf_exons_tobed(gtf)
    elif introns:
        gtf_introns_tobed(gtf)
    elif regions:
        gtf_regions_tobed(gtf)
    elif tss:
        gtf_tss_tobed(gtf)
    elif tlss:
        gtf_tlss_tobed(gtf)
    elif txs:
        gtf_txs_tobed(gtf)
コード例 #15
0
 def testQuery(self):
     """gtf_query over chr1:1000-2000 must return the foo1/iso1 gene first."""
     genes = list(ngsutils.gtf.query.gtf_query(GTF(fname, cache_enabled=False), 'chr1', 1000, 2000))
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(str(genes[0]), 'foo1(iso1) chr1:1000-2500[+]')
コード例 #16
0
import unittest
import StringIO

import ngsutils.gtf.genesize
from ngsutils.gtf import GTF

# Shared in-memory GTF fixture used by the tests below.  The text is written
# with '|' as the column separator and converted to tabs before parsing.  It
# holds gene_ids foo1 and foo2 (both tagged isoform_id "iso1") and foo3
# (isoform_id "iso2").
gtf = GTF(fileobj=StringIO.StringIO('''\
chr1|test|exon|1001|1100|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|exon|1201|1300|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|exon|1401|1500|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|exon|1001|1100|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|exon|1401|1500|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|exon|2001|2100|0|+|.|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2"
chr1|test|exon|2201|2300|0|+|.|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2"
chr1|test|exon|2401|2500|0|+|.|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2"
chr1|test|CDS|2051|2447|0|+|1|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2"
chr1|test|start_codon|2051|2053|0|+|.|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2"
chr1|test|stop_codon|2448|2450|0|+|.|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2"
'''.replace('|', '\t')),
          quiet=True)


class GTFGeneSizeTest(unittest.TestCase):
    """Exercises ngsutils.gtf.genesize against the module-level `gtf` object."""
    def testGeneSize(self):
        # Run gtf_genesize with its output captured in an in-memory buffer.
        # NOTE(review): no assertion on the captured text is visible in this
        # excerpt - confirm the expected output is checked further down.
        out = StringIO.StringIO('')
        ngsutils.gtf.genesize.gtf_genesize(gtf, out=out)
コード例 #17
0
ファイル: models.py プロジェクト: erlevy/ngsutils
    def get_regions(self):
        """Yield one (chrom, starts, ends, strand, columns, callback) per gene.

        starts/ends list every region of the gene.  callback is a generator
        function taking (bam, common_count, common_reads, common_cols) that
        produces one output row per region of that gene.
        """
        gtf = GTF(self.fname)
        eta = ETA(gtf.fsize(), fileobj=gtf)

        for gene in gtf.genes:
            eta.print_status(extra=gene.gene_name)
            starts = []
            ends = []
            const_spans = []

            # Group runs of adjacent constant regions into spans, so reads
            # crossing a junction between two constant regions can be counted
            # against the span as a whole.
            prev_const = False
            for _num, reg_start, reg_end, is_const, _names in gene.regions:
                starts.append(reg_start)
                ends.append(reg_end)

                if not is_const:
                    prev_const = False
                    continue

                if not prev_const:
                    const_spans.append([])
                const_spans[-1].append((reg_start, reg_end))
                prev_const = True

            # NOTE: callback looks up `gene` and `const_spans` in this
            # generator's frame at the time it runs.
            def callback(bam, common_count, common_reads, common_cols):
                # total reads over all constant spans
                const_count = 0
                for span in const_spans:
                    span_starts = [pair[0] for pair in span]
                    span_ends = [pair[1] for pair in span]

                    span_count, _span_reads = _fetch_reads(
                        bam, gene.chrom,
                        gene.strand if self.stranded else None,
                        span_starts, span_ends, self.multiple, False,
                        self.whitelist, self.blacklist, self.uniq_only,
                        self.library_type)
                    const_count += span_count

                # per-region counts
                for num, start, end, const, names in gene.regions:
                    strand = gene.strand if self.stranded else None
                    count, reads = _fetch_reads(
                        bam, gene.chrom, strand, [start], [end],
                        self.multiple, False, self.whitelist, self.blacklist,
                        self.uniq_only, self.library_type)
                    excl_count, excl_reads = _fetch_reads_excluding(
                        bam, gene.chrom, strand, start, end, self.multiple,
                        self.whitelist, self.blacklist, self.library_type)

                    # a read that excludes the region cannot also include it
                    for read in excl_reads:
                        if read in reads:
                            reads.remove(read)
                            count -= 1

                    # gene-level reads that neither include nor exclude the
                    # region
                    other_reads = sum(
                        1 for read in common_reads
                        if read not in reads and read not in excl_reads)

                    altindex = ''
                    if other_reads > 0:
                        altindex = float(count - excl_count) / other_reads

                    incl_pct = ''
                    excl_pct = ''
                    common_total = len(common_reads)
                    if common_total > 0:
                        incl_pct = float(count) / common_total
                        excl_pct = float(excl_count) / common_total

                    cols = common_cols[:]
                    cols.extend([
                        start,
                        end,
                        const_count,
                        num,
                        'const' if const else 'alt',
                        count,
                        excl_count,
                        incl_pct,
                        excl_pct,
                        altindex,
                    ])
                    yield cols

            gene_cols = [gene.gene_name, gene.gene_id, gene.isoform_id,
                         gene.chrom, gene.strand, gene.start, gene.end]
            yield (gene.chrom, starts, ends, gene.strand, gene_cols, callback)
        eta.done()
コード例 #18
0
ファイル: models.py プロジェクト: xuwei684/ngsutils
class ExonModel(Model):
    """Model that reports one output row per region of every GTF gene."""

    def __init__(self, fname):
        """Build the GTF reader and probe its first gene for optional columns.

        fname is handed to GTF(). The attributes of the first gene decide
        whether 'isoform_id' and 'gene_biotype' columns are reported.
        """
        self.fname = fname
        Model.__init__(self)

        self.gtf = GTF(self.fname)
        # The next() builtin works on Python 2.6+ and 3 (the .next() method is
        # Python 2 only). The None default keeps a GTF without any gene from
        # raising StopIteration out of the constructor.
        first_gene = next(self.gtf.genes, None)
        attributes = first_gene.attributes if first_gene is not None else {}

        self.has_isoform = 'isoform_id' in attributes
        self.has_biotype = 'gene_biotype' in attributes

    def get_source(self):
        """Return the file name this model was created with."""
        return self.fname

    def get_name(self):
        """Return the fixed model name, 'exon'."""
        return 'exon'

    def get_headers(self):
        """Return the gene-level column names (see `geneout` in get_regions)."""
        out = [
            'gene_id',
            'gene_name',
        ]
        if self.has_isoform:
            out.append('isoform_id')
        if self.has_biotype:
            out.append('gene_biotype')
        out.extend('chrom strand txstart txend'.split())

        return out

    def get_postheaders(self):
        """Return the names of the per-region columns the callback appends."""
        return ('regionstart regionend const_count region_num const_alt '
                'count excl_count incl_pct excl_pct alt-index').split()

    def get_regions(self):
        """Yield one (chrom, starts, ends, strand, columns, callback) per gene.

        starts/ends list every region of the gene.  callback is a generator
        function taking (bam, common_count, common_reads, common_cols) that
        produces one output row per region of that gene.
        """
        eta = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            eta.print_status(extra=gene.gene_name)
            starts = []
            ends = []
            const_spans = []

            geneout = [
                gene.gene_id,
                gene.gene_name,
            ]
            # Genes lacking an optional attribute get an empty placeholder so
            # the columns stay aligned with get_headers().
            if self.has_isoform:
                geneout.append(gene.attributes['isoform_id'] if 'isoform_id' in
                               gene.attributes else '')
            if self.has_biotype:
                geneout.append(gene.attributes['gene_biotype']
                               if 'gene_biotype' in gene.attributes else '')
            geneout.extend([gene.chrom, gene.strand, gene.start, gene.end])

            was_last_const = False
            for _num, start, end, const, _names in gene.regions:
                starts.append(start)
                ends.append(end)

                # assemble a list of lists with contiguous spans of constant
                # regions; this lets us count junction-spanning reads that
                # cover two constant exons/regions
                if const:
                    if not was_last_const:
                        const_spans.append([])
                    const_spans[-1].append((start, end))
                    was_last_const = True
                else:
                    was_last_const = False

            # NOTE: callback looks up `gene` and `const_spans` in this
            # generator's frame at the time it runs.
            def callback(bam, common_count, common_reads, common_cols):
                # gather constant reads
                const_count = 0
                for span in const_spans:
                    starts = []
                    ends = []

                    for start, end in span:
                        starts.append(start)
                        ends.append(end)

                    count, reads = _fetch_reads(
                        bam, gene.chrom,
                        gene.strand if self.stranded else None, starts, ends,
                        self.multiple, False, self.whitelist, self.blacklist,
                        self.uniq_only, self.library_type)
                    const_count += count

                # find counts for each region
                for num, start, end, const, names in gene.regions:
                    count, reads = _fetch_reads(
                        bam, gene.chrom,
                        gene.strand if self.stranded else None, [start], [end],
                        self.multiple, False, self.whitelist, self.blacklist,
                        self.uniq_only, self.library_type)
                    excl_count, excl_reads = _fetch_reads_excluding(
                        bam, gene.chrom,
                        gene.strand if self.stranded else None, start, end,
                        self.multiple, self.whitelist, self.blacklist,
                        self.library_type)

                    # remove reads that exclude this region
                    for read in excl_reads:
                        if read in reads:
                            reads.remove(read)
                            count = count - 1

                    # find reads that *aren't* in this region
                    other_reads = 0
                    for read in common_reads:
                        if read not in reads and read not in excl_reads:
                            other_reads += 1

                    if other_reads > 0:
                        altindex = float(count - excl_count) / other_reads
                    else:
                        altindex = ''

                    if len(common_reads) > 0:
                        incl_pct = float(count) / len(common_reads)
                        excl_pct = float(excl_count) / len(common_reads)
                    else:
                        incl_pct = ''
                        excl_pct = ''

                    # column order must match get_postheaders()
                    cols = common_cols[:]
                    cols.append(start)
                    cols.append(end)
                    cols.append(const_count)
                    cols.append(num)
                    cols.append('const' if const else 'alt')
                    cols.append(count)
                    cols.append(excl_count)
                    cols.append(incl_pct)
                    cols.append(excl_pct)
                    cols.append(altindex)
                    yield cols

            yield (gene.chrom, starts, ends, gene.strand, geneout, callback)
        eta.done()

    def count(self,
              bam,
              library_type,
              coverage=False,
              uniq_only=False,
              fpkm=False,
              norm='',
              multiple='complete',
              whitelist=None,
              blacklist=None,
              out=sys.stdout,
              quiet=False,
              start_only=False):
        """Store the read-filter settings on self, then run Model.count.

        The stored settings are read back by the callback created in
        get_regions; all arguments are forwarded unchanged to Model.count.
        """
        self.uniq_only = uniq_only
        self.multiple = multiple
        self.whitelist = whitelist
        self.blacklist = blacklist
        self.library_type = library_type

        # strand-aware fetching only for the 'FR' / 'RF' library types
        self.stranded = library_type in ['FR', 'RF']

        Model.count(self, bam, library_type, coverage, uniq_only, fpkm, norm,
                    multiple, whitelist, blacklist, out, quiet, start_only)
コード例 #19
0
def bam_stats(infiles, gtf_file=None, region=None, delim=None, tags=None, show_all=False, fillin_stats=True):
    """Write a tab-delimited, side-by-side statistics report to stdout.

    One BamStats object is built per entry of infiles; the report then has
    two columns per file in every section (read totals, flag distribution,
    template length, per-tag distributions, reference counts and, when a GTF
    is given, mapping-region counts with CPM).

    infiles      -- file names, each opened with bam_open()
    gtf_file     -- optional GTF file name; enables the "Mapping regions"
                    section
    region, delim, tags, show_all -- forwarded to BamStats
    fillin_stats -- when true, emit zero-count rows for integer values that
                    are missing between two consecutive distribution values
    """
    # A fresh list per call: a mutable default would be shared between calls.
    if tags is None:
        tags = []

    gtf = GTF(gtf_file) if gtf_file else None

    sys.stderr.write('Calculating Read stats...\n')

    stats = [BamStats(bam_open(x), gtf, region, delim, tags, show_all=show_all) for x in infiles]

    sys.stdout.write('\t')
    for fname in infiles:
        sys.stdout.write('%s\t\t' % fname)
    sys.stdout.write('\n')

    sys.stdout.write('Reads:\t')
    for stat in stats:
        sys.stdout.write('%s\t\t' % stat.total)
    sys.stdout.write('\n')

    sys.stdout.write('Mapped:\t')
    for stat in stats:
        sys.stdout.write('%s\t\t' % stat.mapped)
    sys.stdout.write('\n')

    sys.stdout.write('Unmapped:\t')
    for stat in stats:
        sys.stdout.write('%s\t\t' % stat.unmapped)
    sys.stdout.write('\n')

    sys.stdout.write('\nFlag distribution\n')
    # Only report flags that occur in at least one file.
    validflags = set()
    maxsize = 0
    for flag in flag_descriptions:
        for stat in stats:
            if stat.flag_counts.counts[flag] > 0:
                validflags.add(flag)
                maxsize = max(maxsize, len(flag_descriptions[flag]))

    for flag in sorted(validflags):
        sys.stdout.write("[0x%03x] %-*s" % (flag, maxsize, flag_descriptions[flag]))
        for stat in stats:
            flag_count = stat.flag_counts.counts[flag]
            # A file without reads would otherwise divide by zero when another
            # file in the same report has this flag set.
            if stat.total:
                flag_pct = float(flag_count) * 100 / stat.total
            else:
                flag_pct = 0.0
            sys.stdout.write('\t%s\t%0.2f%%' % (flag_count, flag_pct))
        sys.stdout.write('\n')
    sys.stdout.write('\n')

    if stats[0].tlen_counts:
        sys.stdout.write('Template length:')
        for stat in stats:
            mean, stdev = counts_mean_stdev(stat.tlen_counts)
            sys.stdout.write('\t%0.2f\t+/- %0.2f' % (mean, stdev))
        sys.stdout.write('\n')
    sys.stdout.write('\n')

    # The set of tags is taken from the first file; every file is expected to
    # provide a bin for each of them.
    stat_tags = {}
    for tag in stats[0].tagbins:
        stat_tags[tag] = []
        for stat in stats:
            stat_tags[tag].append(stat.tagbins[tag])

    for tag in stat_tags:
        asc = stats[0].tagbins[tag].asc

        # Each file owns two columns; the second one stays empty for the
        # Ave / Max rows.
        sys.stdout.write("Ave %s:" % tag)
        for tagbin in stat_tags[tag]:
            sys.stdout.write('\t%s\t' % tagbin.mean)
        sys.stdout.write('\n')

        sys.stdout.write("Max %s:" % tag)
        for tagbin in stat_tags[tag]:
            sys.stdout.write('\t%s\t' % tagbin.max)
        sys.stdout.write('\n')

        sys.stdout.write('%s distribution:\n' % tag)

        # One distribution generator per file, merged row by row: gen_vals
        # holds the pending (value, count, pct) tuple of each file and
        # last_pcts the most recent pct printed for it.
        gens = []
        gen_vals = []
        last_pcts = []

        for stat in stats:
            gens.append(stat.distribution_gen(tag))
            gen_vals.append(None)
            last_pcts.append(0.0)

        good = True

        last = None

        while good:
            good = False
            for i, gen in enumerate(gens):
                if not gen_vals[i]:
                    try:
                        # next() builtin: works on Python 2.6+ and 3
                        gen_vals[i] = next(gen)
                    except StopIteration:
                        pass
            vals = [tup[0] for tup in gen_vals if tup]
            if not vals:
                # every generator is exhausted; good is False, so this ends
                # the loop
                continue
            # the next row is the smallest (ascending) or largest
            # (descending) pending value
            if asc:
                minval = min(vals)
            else:
                minval = max(vals)

            # isinstance() is also False for None, and - unlike a truthiness
            # test - lets gaps that follow the value 0 be filled in as well.
            if isinstance(last, int) and fillin_stats:
                if asc:
                    last += 1
                    # fill in missing values
                    while last < minval:
                        sys.stdout.write('%s' % last)
                        for i, stat in enumerate(stats):
                            sys.stdout.write('\t0\t%s' % last_pcts[i])
                        sys.stdout.write('\n')
                        last += 1
                else:
                    last -= 1
                    # fill in missing values
                    while last > minval:
                        sys.stdout.write('%s' % last)
                        for i, stat in enumerate(stats):
                            sys.stdout.write('\t0\t%s' % last_pcts[i])
                        sys.stdout.write('\n')
                        last -= 1

            last = minval
            sys.stdout.write(str(minval))

            for i, tup in enumerate(gen_vals):
                if tup and tup[0] == minval:
                    sys.stdout.write('\t%s\t%s' % (tup[1], tup[2]))
                    last_pcts[i] = tup[2]
                    # consumed: fetch this file's next tuple on the next pass
                    gen_vals[i] = None
                    good = True
                else:
                    sys.stdout.write('\t0\t%s' % (last_pcts[i]))
            sys.stdout.write('\n')
        sys.stdout.write('\n')

    sys.stdout.write('Reference counts')
    for stat in stats:
        sys.stdout.write('\tcount\t')
    sys.stdout.write('\n')
    for k in sorted(stats[0].refs):
        sys.stdout.write('%s' % k)
        for stat in stats:
            sys.stdout.write('\t%s\t' % stat.refs[k])
        sys.stdout.write('\n')

    if gtf_file:
        sys.stdout.write('Mapping regions')
        for stat in stats:
            sys.stdout.write('\tcount\tCPM')
        sys.stdout.write('\n')
        for k in sorted(stats[0].regiontagger.counts):
            sys.stdout.write('%s' % k)
            for stat in stats:
                region_count = stat.regiontagger.counts[k]
                # CPM = count per million mapped reads, i.e.
                # count / (mapped / 1e6); guard against files with no mapped
                # reads.
                if stat.mapped:
                    cpm = float(region_count) * 1000000 / stat.mapped
                else:
                    cpm = 0.0
                sys.stdout.write('\t%s\t%s' % (region_count, cpm))
            sys.stdout.write('\n')
コード例 #20
0
ファイル: models.py プロジェクト: ZhangQiuxue/ngsutils
class ExonModel(Model):
    '''
    Counting model that emits one output row per region of every GTF gene.

    For each gene, get_regions() yields the gene-level columns together with
    a callback; the callback produces one row per entry of gene.regions with
    inclusion / exclusion read counts for that region.

    The read-filtering options used by those callbacks (stranded, multiple,
    whitelist, blacklist, uniq_only, library_type) are only stored on the
    instance by count(), so callbacks are expected to run as part of a
    count() call.
    '''

    def __init__(self, fname):
        '''
        fname -- GTF file name; passed to GTF() and reported by get_source()

        The first gene in the file is inspected to decide whether the optional
        'isoform_id' and 'gene_biotype' columns are part of the output.
        '''
        self.fname = fname
        Model.__init__(self)

        self.gtf = GTF(self.fname)

        # Peek at the first gene only.  The builtin next() works on both
        # Python 2.6+ and Python 3 (the .next() method is Python 2 only), and
        # the None default keeps an empty GTF from leaking a bare
        # StopIteration out of the constructor.
        gene_gen = self.gtf.genes
        gene = next(gene_gen, None)

        self.has_isoform = gene is not None and 'isoform_id' in gene.attributes
        self.has_biotype = gene is not None and 'gene_biotype' in gene.attributes

    def get_source(self):
        '''Return the GTF file name this model was built from.'''
        return self.fname

    def get_name(self):
        '''Return the name of this model ('exon').'''
        return 'exon'

    def get_headers(self):
        '''
        Return the gene-level column names.

        The order matches the gene column list built in get_regions():
        gene id and name, the optional isoform / biotype columns, then the
        chrom, strand and transcript start / end columns.
        '''
        out = ['gene_id', 'gene_name', ]
        if self.has_isoform:
            out.append('isoform_id')
        if self.has_biotype:
            out.append('gene_biotype')
        out.extend('chrom strand txstart txend'.split())

        return out

    def get_postheaders(self):
        '''
        Return the per-region column names, in the order in which the
        get_regions() callbacks append values to each row.
        '''
        return 'regionstart regionend const_count region_num const_alt count excl_count incl_pct excl_pct alt-index'.split()

    def get_regions(self):
        '''
        Yield one tuple per gene in the GTF file:

            (chrom, starts, ends, strand, gene_columns, callback)

        starts / ends hold the boundaries of every region of the gene,
        gene_columns matches get_headers(), and callback is a generator
        function (see _make_region_callback) that yields the per-region rows.
        '''
        eta = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            eta.print_status(extra=gene.gene_name)
            starts = []
            ends = []
            const_spans = []

            geneout = [gene.gene_id, gene.gene_name, ]
            if self.has_isoform:
                geneout.append(gene.attributes['isoform_id'] if 'isoform_id' in gene.attributes else '')
            if self.has_biotype:
                geneout.append(gene.attributes['gene_biotype'] if 'gene_biotype' in gene.attributes else '')
            geneout.extend([gene.chrom, gene.strand, gene.start, gene.end])

            was_last_const = False
            for num, start, end, const, names in gene.regions:
                starts.append(start)
                ends.append(end)

                # Assemble a list of lists with contiguous spans of constant
                # regions.  This lets us count junction-spanning reads that
                # cover two constant exons/regions.
                if const:
                    if not was_last_const:
                        const_spans.append([])
                    const_spans[-1].append((start, end))
                was_last_const = bool(const)

            # The callback is built by a factory so that it is bound to *this*
            # gene and *these* spans.  A closure defined directly in the loop
            # would look the loop variables up when it finally runs and would
            # see a later gene if the consumer had already advanced.
            callback = self._make_region_callback(gene, const_spans)

            yield (gene.chrom, starts, ends, gene.strand, geneout, callback)
        eta.done()

    def _make_region_callback(self, gene, const_spans):
        '''
        Build the per-gene row generator handed out by get_regions().

        gene        -- the gene whose regions are reported
        const_spans -- list of lists of (start, end) tuples; each inner list
                       is one run of consecutive constant regions

        The returned callback(bam, common_count, common_reads, common_cols)
        yields, for every region of the gene, a copy of common_cols extended
        with the columns named by get_postheaders().
        '''
        def callback(bam, common_count, common_reads, common_cols):
            def fetch(region_starts, region_ends):
                # single place for the argument list shared by all region fetches
                return _fetch_reads(bam, gene.chrom, gene.strand if self.stranded else None, region_starts, region_ends, self.multiple, False, self.whitelist, self.blacklist, self.uniq_only, self.library_type)

            # gather constant reads: one fetch per contiguous constant span
            const_count = 0
            for span in const_spans:
                span_starts = [span_start for span_start, span_end in span]
                span_ends = [span_end for span_start, span_end in span]
                count, _reads = fetch(span_starts, span_ends)
                const_count += count

            # find counts for each region
            for num, start, end, const, names in gene.regions:
                count, reads = fetch([start], [end])
                excl_count, excl_reads = _fetch_reads_excluding(bam, gene.chrom, gene.strand if self.stranded else None, start, end, self.multiple, self.whitelist, self.blacklist, self.library_type)

                # remove reads that exclude this region
                for read in excl_reads:
                    if read in reads:
                        reads.remove(read)
                        count -= 1

                # common reads that neither include nor exclude this region
                other_reads = sum(1 for read in common_reads
                                  if read not in reads and read not in excl_reads)

                # empty strings mark values that are undefined (zero denominator)
                if other_reads > 0:
                    altindex = float(count - excl_count) / other_reads
                else:
                    altindex = ''

                if len(common_reads) > 0:
                    # fractions of the common reads (not scaled to percent)
                    incl_pct = float(count) / len(common_reads)
                    excl_pct = float(excl_count) / len(common_reads)
                else:
                    incl_pct = ''
                    excl_pct = ''

                cols = common_cols[:]
                cols.extend([
                    start,
                    end,
                    const_count,
                    num,
                    'const' if const else 'alt',
                    count,
                    excl_count,
                    incl_pct,
                    excl_pct,
                    altindex,
                ])
                yield cols

        return callback

    def count(self, bam, library_type, coverage=False, uniq_only=False, fpkm=False, norm='', multiple='complete', whitelist=None, blacklist=None, out=sys.stdout, quiet=False, start_only=False):
        '''
        Record the read-filtering options on the instance, where the
        callbacks produced by get_regions() read them, then delegate the
        actual counting to Model.count() with the same arguments.

        Reads are treated as stranded only for library types 'FR' and 'RF'.
        '''
        self.uniq_only = uniq_only
        self.multiple = multiple
        self.whitelist = whitelist
        self.blacklist = blacklist
        self.library_type = library_type

        self.stranded = library_type in ['FR', 'RF']

        Model.count(self, bam, library_type, coverage, uniq_only, fpkm, norm, multiple, whitelist, blacklist, out, quiet, start_only)