class GTFModel(Model):
    """Model that reports one row per gene found in a GTF file.

    When the model is created, the first gene produced by the GTF object is
    inspected to decide whether the optional ``isoform_id`` and
    ``gene_biotype`` columns are part of the output.
    """

    def __init__(self, fname):
        """Wrap *fname* in a ``GTF`` object and detect the optional columns.

        If the GTF yields no genes at all, neither optional column is
        reported (instead of failing with a bare ``StopIteration``).
        """
        self.fname = fname
        Model.__init__(self)

        # test for what attributes we need to return
        self.gtf = GTF(self.fname)

        # Only the first gene is examined.  The builtin next() works on both
        # Python 2.6+ and Python 3 iterators and accepts a default, which
        # covers the "no genes" case.
        first_gene = next(iter(self.gtf.genes), None)
        attributes = first_gene.attributes if first_gene is not None else ()

        self.has_isoform = 'isoform_id' in attributes
        self.has_biotype = 'gene_biotype' in attributes

    def get_source(self):
        """Return the file name this model was built from."""
        return self.fname

    def get_name(self):
        """Return the model name, ``'gtf'``."""
        return 'gtf'

    def get_headers(self):
        """Return the column names matching the lists built by get_regions()."""
        out = ['gene_id', 'gene_name']

        if self.has_isoform:
            out.append('isoform_id')

        if self.has_biotype:
            out.append('gene_biotype')

        out.extend('chrom strand txstart txend'.split())
        return out

    def get_regions(self):
        """Yield ``(chrom, starts, ends, strand, columns, None)`` for each gene.

        ``starts`` and ``ends`` are parallel lists with the start and end of
        every region of the gene; ``columns`` follows the order returned by
        get_headers().  The last element of the tuple is always ``None``.
        """
        eta = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            eta.print_status(extra=gene.gene_name)

            starts = []
            ends = []

            # just include all regions - don't worry about transcripts and
            # exons; the regions encompass all exons anyway...
            for _num, start, end, _const, _names in gene.regions:
                starts.append(start)
                ends.append(end)

            out = [gene.gene_id, gene.gene_name]

            # The optional columns were chosen from the first gene only, so
            # later genes that lack the attribute get an empty string.
            if self.has_isoform:
                out.append(gene.attributes['isoform_id']
                           if 'isoform_id' in gene.attributes else '')

            if self.has_biotype:
                out.append(gene.attributes['gene_biotype']
                           if 'gene_biotype' in gene.attributes else '')

            out.extend([gene.chrom, gene.strand, gene.start, gene.end])

            yield (gene.chrom, starts, ends, gene.strand, out, None)

        eta.done()
class GTFModel(Model):
    """Model that reports one row per gene found in a GTF file.

    When the model is created, the first gene produced by the GTF object is
    inspected to decide whether the optional ``isoform_id`` and
    ``gene_biotype`` columns are part of the output.
    """

    def __init__(self, fname):
        """Wrap *fname* in a ``GTF`` object and detect the optional columns.

        If the GTF yields no genes at all, neither optional column is
        reported (instead of failing with a bare ``StopIteration``).
        """
        self.fname = fname
        Model.__init__(self)

        # test for what attributes we need to return
        self.gtf = GTF(self.fname)

        # Only the first gene is examined.  The builtin next() works on both
        # Python 2.6+ and Python 3 iterators and accepts a default, which
        # covers the "no genes" case.
        first_gene = next(iter(self.gtf.genes), None)
        attributes = first_gene.attributes if first_gene is not None else ()

        self.has_isoform = 'isoform_id' in attributes
        self.has_biotype = 'gene_biotype' in attributes

    def get_source(self):
        """Return the file name this model was built from."""
        return self.fname

    def get_name(self):
        """Return the model name, ``'gtf'``."""
        return 'gtf'

    def get_headers(self):
        """Return the column names matching the lists built by get_regions()."""
        out = ['gene_id', 'gene_name']

        if self.has_isoform:
            out.append('isoform_id')

        if self.has_biotype:
            out.append('gene_biotype')

        out.extend('chrom strand txstart txend'.split())
        return out

    def get_regions(self):
        """Yield ``(chrom, starts, ends, strand, columns, None)`` for each gene.

        ``starts`` and ``ends`` are parallel lists with the start and end of
        every region of the gene; ``columns`` follows the order returned by
        get_headers().  The last element of the tuple is always ``None``.
        """
        eta = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            eta.print_status(extra=gene.gene_name)

            starts = []
            ends = []

            # just include all regions - don't worry about transcripts and
            # exons; the regions encompass all exons anyway...
            for _num, start, end, _const, _names in gene.regions:
                starts.append(start)
                ends.append(end)

            out = [gene.gene_id, gene.gene_name]

            # The optional columns were chosen from the first gene only, so
            # later genes that lack the attribute get an empty string.
            if self.has_isoform:
                out.append(gene.attributes['isoform_id']
                           if 'isoform_id' in gene.attributes else '')

            if self.has_biotype:
                out.append(gene.attributes['gene_biotype']
                           if 'gene_biotype' in gene.attributes else '')

            out.extend([gene.chrom, gene.strand, gene.start, gene.end])

            yield (gene.chrom, starts, ends, gene.strand, out, None)

        eta.done()
class ExonModel(Model):
    """Model that reports per-region counts for every gene in a GTF file.

    For each gene, get_regions() yields the gene-level columns together with
    a callback; the callback produces one output row per region of that gene
    (see get_postheaders() for the extra columns).
    """

    def __init__(self, fname):
        """Wrap *fname* in a ``GTF`` object and detect the optional columns.

        If the GTF yields no genes at all, neither optional column is
        reported (instead of failing with a bare ``StopIteration``).
        """
        self.fname = fname
        Model.__init__(self)

        self.gtf = GTF(self.fname)

        # Only the first gene is examined.  The builtin next() works on both
        # Python 2.6+ and Python 3 iterators and accepts a default, which
        # covers the "no genes" case.
        first_gene = next(iter(self.gtf.genes), None)
        attributes = first_gene.attributes if first_gene is not None else ()

        self.has_isoform = 'isoform_id' in attributes
        self.has_biotype = 'gene_biotype' in attributes

    def get_source(self):
        """Return the file name this model was built from."""
        return self.fname

    def get_name(self):
        """Return the model name, ``'exon'``."""
        return 'exon'

    def get_headers(self):
        """Return the gene-level column names (order used by get_regions())."""
        out = ['gene_id', 'gene_name']

        if self.has_isoform:
            out.append('isoform_id')

        if self.has_biotype:
            out.append('gene_biotype')

        out.extend('chrom strand txstart txend'.split())
        return out

    def get_postheaders(self):
        """Return the names of the per-region columns added by the callback."""
        return ('regionstart regionend const_count region_num const_alt '
                'count excl_count incl_pct excl_pct alt-index').split()

    def get_regions(self):
        """Yield ``(chrom, starts, ends, strand, columns, callback)`` per gene.

        ``starts`` and ``ends`` are parallel lists covering every region of
        the gene, ``columns`` follows get_headers(), and ``callback`` is the
        per-gene function built by _make_callback().
        """
        eta = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            eta.print_status(extra=gene.gene_name)

            starts = []
            ends = []
            const_spans = []

            geneout = [gene.gene_id, gene.gene_name]

            # The optional columns were chosen from the first gene only, so
            # later genes that lack the attribute get an empty string.
            if self.has_isoform:
                geneout.append(gene.attributes['isoform_id']
                               if 'isoform_id' in gene.attributes else '')

            if self.has_biotype:
                geneout.append(gene.attributes['gene_biotype']
                               if 'gene_biotype' in gene.attributes else '')

            geneout.extend([gene.chrom, gene.strand, gene.start, gene.end])

            was_last_const = False
            for _num, start, end, const, _names in gene.regions:
                starts.append(start)
                ends.append(end)

                # assemble a list of lists with contiguous spans of constant
                # regions; this will let us count junction-spanning reads
                # that cover two constant exons/regions
                if const:
                    if not was_last_const:
                        const_spans.append([])
                    const_spans[-1].append((start, end))
                    was_last_const = True
                else:
                    was_last_const = False

            yield (gene.chrom, starts, ends, gene.strand, geneout,
                   self._make_callback(gene, const_spans))

        eta.done()

    def _make_callback(self, gene, const_spans):
        """Return the row-producing callback bound to one specific gene.

        The callback is created here rather than as a closure inside the
        loop of get_regions() so that each callback keeps its own *gene* and
        *const_spans*.  A closure defined in the loop would look both names
        up when it finally runs, and would therefore see a later gene if it
        is consumed after get_regions() has advanced.
        """

        def callback(bam, common_count, common_reads, common_cols):
            """Yield one column list per region of the bound gene.

            Every list is a copy of *common_cols* followed by the values
            named by get_postheaders().
            """
            # The counting options are stored on self by count(); they are
            # read here, when the callback actually runs.
            strand = gene.strand if self.stranded else None

            # gather constant reads
            const_count = 0
            for span in const_spans:
                span_starts = []
                span_ends = []
                for span_start, span_end in span:
                    span_starts.append(span_start)
                    span_ends.append(span_end)

                span_count, _span_reads = _fetch_reads(
                    bam, gene.chrom, strand, span_starts, span_ends,
                    self.multiple, False, self.whitelist, self.blacklist,
                    self.uniq_only, self.library_type)
                const_count += span_count

            # find counts for each region
            for num, start, end, const, _names in gene.regions:
                count, reads = _fetch_reads(
                    bam, gene.chrom, strand, [start], [end],
                    self.multiple, False, self.whitelist, self.blacklist,
                    self.uniq_only, self.library_type)
                excl_count, excl_reads = _fetch_reads_excluding(
                    bam, gene.chrom, strand, start, end,
                    self.multiple, self.whitelist, self.blacklist,
                    self.library_type)

                # remove reads that exclude this region
                for read in excl_reads:
                    if read in reads:
                        reads.remove(read)
                        count = count - 1

                # find reads that *aren't* in this region
                other_reads = 0
                for read in common_reads:
                    if read not in reads and read not in excl_reads:
                        other_reads += 1

                # Ratios are left as empty strings when their denominator
                # would be zero.
                if other_reads > 0:
                    altindex = float(count - excl_count) / other_reads
                else:
                    altindex = ''

                if len(common_reads) > 0:
                    incl_pct = float(count) / len(common_reads)
                    excl_pct = float(excl_count) / len(common_reads)
                else:
                    incl_pct = ''
                    excl_pct = ''

                cols = common_cols[:]
                cols.extend([
                    start,
                    end,
                    const_count,
                    num,
                    'const' if const else 'alt',
                    count,
                    excl_count,
                    incl_pct,
                    excl_pct,
                    altindex,
                ])
                yield cols

        return callback

    def count(self, bam, library_type, coverage=False, uniq_only=False,
              fpkm=False, norm='', multiple='complete', whitelist=None,
              blacklist=None, out=sys.stdout, quiet=False, start_only=False):
        """Store the counting options on the instance, then run Model.count().

        The stored options (``uniq_only``, ``multiple``, ``whitelist``,
        ``blacklist``, ``library_type`` and the derived ``stranded`` flag,
        which is true for library types ``'FR'`` and ``'RF'``) are read by
        the per-gene callbacks.  All arguments are forwarded unchanged to
        ``Model.count``.
        """
        self.uniq_only = uniq_only
        self.multiple = multiple
        self.whitelist = whitelist
        self.blacklist = blacklist
        self.library_type = library_type
        self.stranded = library_type in ['FR', 'RF']

        Model.count(self, bam, library_type, coverage, uniq_only, fpkm, norm,
                    multiple, whitelist, blacklist, out, quiet, start_only)
class ExonModel(Model):
    """Model that reports per-region counts for every gene in a GTF file.

    For each gene, get_regions() yields the gene-level columns together with
    a callback; the callback produces one output row per region of that gene
    (see get_postheaders() for the extra columns).
    """

    def __init__(self, fname):
        """Wrap *fname* in a ``GTF`` object and detect the optional columns.

        If the GTF yields no genes at all, neither optional column is
        reported (instead of failing with a bare ``StopIteration``).
        """
        self.fname = fname
        Model.__init__(self)

        self.gtf = GTF(self.fname)

        # Only the first gene is examined.  The builtin next() works on both
        # Python 2.6+ and Python 3 iterators and accepts a default, which
        # covers the "no genes" case.
        first_gene = next(iter(self.gtf.genes), None)
        attributes = first_gene.attributes if first_gene is not None else ()

        self.has_isoform = 'isoform_id' in attributes
        self.has_biotype = 'gene_biotype' in attributes

    def get_source(self):
        """Return the file name this model was built from."""
        return self.fname

    def get_name(self):
        """Return the model name, ``'exon'``."""
        return 'exon'

    def get_headers(self):
        """Return the gene-level column names (order used by get_regions())."""
        out = ['gene_id', 'gene_name']

        if self.has_isoform:
            out.append('isoform_id')

        if self.has_biotype:
            out.append('gene_biotype')

        out.extend('chrom strand txstart txend'.split())
        return out

    def get_postheaders(self):
        """Return the names of the per-region columns added by the callback."""
        return ('regionstart regionend const_count region_num const_alt '
                'count excl_count incl_pct excl_pct alt-index').split()

    def get_regions(self):
        """Yield ``(chrom, starts, ends, strand, columns, callback)`` per gene.

        ``starts`` and ``ends`` are parallel lists covering every region of
        the gene, ``columns`` follows get_headers(), and ``callback`` is the
        per-gene function built by _make_callback().
        """
        eta = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            eta.print_status(extra=gene.gene_name)

            starts = []
            ends = []
            const_spans = []

            geneout = [gene.gene_id, gene.gene_name]

            # The optional columns were chosen from the first gene only, so
            # later genes that lack the attribute get an empty string.
            if self.has_isoform:
                geneout.append(gene.attributes['isoform_id']
                               if 'isoform_id' in gene.attributes else '')

            if self.has_biotype:
                geneout.append(gene.attributes['gene_biotype']
                               if 'gene_biotype' in gene.attributes else '')

            geneout.extend([gene.chrom, gene.strand, gene.start, gene.end])

            was_last_const = False
            for _num, start, end, const, _names in gene.regions:
                starts.append(start)
                ends.append(end)

                # assemble a list of lists with contiguous spans of constant
                # regions; this will let us count junction-spanning reads
                # that cover two constant exons/regions
                if const:
                    if not was_last_const:
                        const_spans.append([])
                    const_spans[-1].append((start, end))
                    was_last_const = True
                else:
                    was_last_const = False

            yield (gene.chrom, starts, ends, gene.strand, geneout,
                   self._make_callback(gene, const_spans))

        eta.done()

    def _make_callback(self, gene, const_spans):
        """Return the row-producing callback bound to one specific gene.

        The callback is created here rather than as a closure inside the
        loop of get_regions() so that each callback keeps its own *gene* and
        *const_spans*.  A closure defined in the loop would look both names
        up when it finally runs, and would therefore see a later gene if it
        is consumed after get_regions() has advanced.
        """

        def callback(bam, common_count, common_reads, common_cols):
            """Yield one column list per region of the bound gene.

            Every list is a copy of *common_cols* followed by the values
            named by get_postheaders().
            """
            # The counting options are stored on self by count(); they are
            # read here, when the callback actually runs.
            strand = gene.strand if self.stranded else None

            # gather constant reads
            const_count = 0
            for span in const_spans:
                span_starts = []
                span_ends = []
                for span_start, span_end in span:
                    span_starts.append(span_start)
                    span_ends.append(span_end)

                span_count, _span_reads = _fetch_reads(
                    bam, gene.chrom, strand, span_starts, span_ends,
                    self.multiple, False, self.whitelist, self.blacklist,
                    self.uniq_only, self.library_type)
                const_count += span_count

            # find counts for each region
            for num, start, end, const, _names in gene.regions:
                count, reads = _fetch_reads(
                    bam, gene.chrom, strand, [start], [end],
                    self.multiple, False, self.whitelist, self.blacklist,
                    self.uniq_only, self.library_type)
                excl_count, excl_reads = _fetch_reads_excluding(
                    bam, gene.chrom, strand, start, end,
                    self.multiple, self.whitelist, self.blacklist,
                    self.library_type)

                # remove reads that exclude this region
                for read in excl_reads:
                    if read in reads:
                        reads.remove(read)
                        count = count - 1

                # find reads that *aren't* in this region
                other_reads = 0
                for read in common_reads:
                    if read not in reads and read not in excl_reads:
                        other_reads += 1

                # Ratios are left as empty strings when their denominator
                # would be zero.
                if other_reads > 0:
                    altindex = float(count - excl_count) / other_reads
                else:
                    altindex = ''

                if len(common_reads) > 0:
                    incl_pct = float(count) / len(common_reads)
                    excl_pct = float(excl_count) / len(common_reads)
                else:
                    incl_pct = ''
                    excl_pct = ''

                cols = common_cols[:]
                cols.extend([
                    start,
                    end,
                    const_count,
                    num,
                    'const' if const else 'alt',
                    count,
                    excl_count,
                    incl_pct,
                    excl_pct,
                    altindex,
                ])
                yield cols

        return callback

    def count(self, bam, library_type, coverage=False, uniq_only=False,
              fpkm=False, norm='', multiple='complete', whitelist=None,
              blacklist=None, out=sys.stdout, quiet=False, start_only=False):
        """Store the counting options on the instance, then run Model.count().

        The stored options (``uniq_only``, ``multiple``, ``whitelist``,
        ``blacklist``, ``library_type`` and the derived ``stranded`` flag,
        which is true for library types ``'FR'`` and ``'RF'``) are read by
        the per-gene callbacks.  All arguments are forwarded unchanged to
        ``Model.count``.
        """
        self.uniq_only = uniq_only
        self.multiple = multiple
        self.whitelist = whitelist
        self.blacklist = blacklist
        self.library_type = library_type
        self.stranded = library_type in ['FR', 'RF']

        Model.count(self, bam, library_type, coverage, uniq_only, fpkm, norm,
                    multiple, whitelist, blacklist, out, quiet, start_only)
def get_regions(self):
    """Yield ``(chrom, starts, ends, strand, columns, callback)`` per gene.

    A fresh ``GTF`` object is created from ``self.fname`` on every call.
    ``starts`` and ``ends`` are parallel lists covering every region of the
    gene; ``columns`` is ``[gene_name, gene_id, isoform_id, chrom, strand,
    start, end]``; ``callback`` yields one column list per region when it is
    called with ``(bam, common_count, common_reads, common_cols)``.
    """
    gtf = GTF(self.fname)
    eta = ETA(gtf.fsize(), fileobj=gtf)

    for gene in gtf.genes:
        eta.print_status(extra=gene.gene_name)

        starts = []
        ends = []
        const_spans = []

        was_last_const = False
        for _num, start, end, const, _names in gene.regions:
            starts.append(start)
            ends.append(end)

            # assemble a list of lists with contiguous spans of constant
            # regions; this will let us count junction-spanning reads that
            # cover two constant exons/regions
            if const:
                if not was_last_const:
                    const_spans.append([])
                const_spans[-1].append((start, end))
                was_last_const = True
            else:
                was_last_const = False

        # ``gene`` and ``const_spans`` are bound as default arguments so each
        # callback keeps the values of *its* loop iteration.  A plain closure
        # would look them up when the (lazy, generator) callback finally
        # runs, and would see a later gene if that happens after this loop
        # has advanced.  Callers still pass only the first four arguments.
        def callback(bam, common_count, common_reads, common_cols,
                     gene=gene, const_spans=const_spans):
            """Yield a copy of *common_cols* plus per-region values."""
            # The counting options are attributes of self and are read here,
            # when the callback actually runs.
            strand = gene.strand if self.stranded else None

            # gather constant reads
            const_count = 0
            for span in const_spans:
                span_starts = []
                span_ends = []
                for span_start, span_end in span:
                    span_starts.append(span_start)
                    span_ends.append(span_end)

                span_count, _span_reads = _fetch_reads(
                    bam, gene.chrom, strand, span_starts, span_ends,
                    self.multiple, False, self.whitelist, self.blacklist,
                    self.uniq_only, self.library_type)
                const_count += span_count

            # find counts for each region
            for num, start, end, const, _names in gene.regions:
                count, reads = _fetch_reads(
                    bam, gene.chrom, strand, [start], [end],
                    self.multiple, False, self.whitelist, self.blacklist,
                    self.uniq_only, self.library_type)
                excl_count, excl_reads = _fetch_reads_excluding(
                    bam, gene.chrom, strand, start, end,
                    self.multiple, self.whitelist, self.blacklist,
                    self.library_type)

                # remove reads that exclude this region
                for read in excl_reads:
                    if read in reads:
                        reads.remove(read)
                        count = count - 1

                # find reads that *aren't* in this region
                other_reads = 0
                for read in common_reads:
                    if read not in reads and read not in excl_reads:
                        other_reads += 1

                # Ratios are left as empty strings when their denominator
                # would be zero.
                if other_reads > 0:
                    altindex = float(count - excl_count) / other_reads
                else:
                    altindex = ''

                if len(common_reads) > 0:
                    incl_pct = float(count) / len(common_reads)
                    excl_pct = float(excl_count) / len(common_reads)
                else:
                    incl_pct = ''
                    excl_pct = ''

                cols = common_cols[:]
                cols.extend([
                    start,
                    end,
                    const_count,
                    num,
                    'const' if const else 'alt',
                    count,
                    excl_count,
                    incl_pct,
                    excl_pct,
                    altindex,
                ])
                yield cols

        yield (gene.chrom, starts, ends, gene.strand,
               [gene.gene_name, gene.gene_id, gene.isoform_id, gene.chrom,
                gene.strand, gene.start, gene.end],
               callback)

    eta.done()