def testFind(self):
    # Three single-record genes on chr1; every window queried below is
    # expected to return the middle gene (foo2) and nothing else.
    gtf_text = '''\
chr1|test|gene|1001|1100|0|+|.|gene_id "foo1"; transcript_id "bar1";
chr1|test|gene|2001|2100|0|+|.|gene_id "foo2"; transcript_id "bar2";
chr1|test|gene|3001|3100|0|+|.|gene_id "foo3"; transcript_id "bar3";
'''.replace('|', '\t')
    gtf = GTF(fileobj=StringIO.StringIO(gtf_text), quiet=True)

    loaded = list(gtf.genes)
    self.assertEqual(['foo1', 'foo2', 'foo3'], [g.gene_id for g in loaded])

    # Windows: one spanning foo2, one ending at position 2000, and one
    # starting inside foo2.
    for win_start, win_end in ((1900, 2200), (1900, 2000), (2050, 2200)):
        matched = False
        for hit in gtf.find('chr1', win_start, win_end):
            matched = True
            self.assertEqual(hit.gid, 'foo2')
        # The window must have produced at least one hit.
        self.assertTrue(matched)
def testGTFIso(self):
    # Two gene_ids (foo1, foo2) share isoform_id "iso1"; the test expects a
    # single gene reporting gene_id foo1, gid iso1, and both transcripts.
    gtf_text = '''\
chr1|test|exon|1001|1100|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|exon|1201|1300|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|exon|1401|1500|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
chr1|test|exon|1001|1100|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|exon|1401|1500|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1"
'''.replace('|', '\t')
    gtf = GTF(fileobj=StringIO.StringIO(gtf_text), quiet=True)

    genes = list(gtf.genes)
    self.assertEqual(len(genes), 1)
    self.assertEqual(['foo1'], [g.gene_id for g in genes])
    # gid is the actual unique id (should be constant for isoforms)
    self.assertEqual(['iso1'], [g.gid for g in genes])

    merged_gene = genes[0]
    transcripts = list(merged_gene.transcripts)
    self.assertEqual(['bar1', 'bar2'], [t.transcript_id for t in transcripts])
    self.assertEqual(len(transcripts), 2)

    # The middle exon exists only in bar1, so its region is flagged False
    # and lists a single transcript.
    expected_regions = [
        (1, 1000, 1100, True, 'bar1,bar2'),
        (2, 1200, 1300, False, 'bar1'),
        (3, 1400, 1500, True, 'bar1,bar2'),
    ]
    self.assertEqual(list(merged_gene.regions), expected_regions)
def testJunctionsIsoforms(self):
    # Two transcripts of one isoform group share the first and last exon but
    # differ in the middle one; the expected output lists six junctions.
    gtf_text = '''\
test1|test|exon|10|20|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
test1|test|exon|30|40|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
test1|test|exon|90|100|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1"
test1|test|exon|10|20|0|+|.|gene_id "foo1"; transcript_id "bar2"; isoform_id "iso1"
test1|test|exon|50|70|0|+|.|gene_id "foo1"; transcript_id "bar2"; isoform_id "iso1"
test1|test|exon|90|100|0|+|.|gene_id "foo1"; transcript_id "bar2"; isoform_id "iso1"
'''.replace('|', '\t')
    gtf = GTF(fileobj=StringIO.StringIO(gtf_text), quiet=True)

    # Expected FASTA: one record per junction, named by the joined ranges.
    expected_fasta = '''\
>test1:16-20,29-33
ATGCGCGC
>test1:16-20,49-53
ATGCCTGA
>test1:16-20,89-93
ATGCTCGA
>test1:36-40,49-53
GATCCTGA
>test1:36-40,89-93
GATCTCGA
>test1:66-70,89-93
ATCGTCGA
'''

    # `fa` is the reference FASTA fixture defined outside this method.
    captured = StringIO.StringIO('')
    ngsutils.gtf.junctions.gtf_junctions(
        gtf, fa, fragment_size=4, min_size=8, out=captured, quiet=True)

    self.assertEqual(captured.getvalue(), expected_fasta)
def __init__(self, fname):
    # Load the GTF at `fname` and record which optional attributes its
    # genes carry, judged from the first gene only.
    self.fname = fname
    Model.__init__(self)

    self.gtf = GTF(self.fname)

    # Use the next() builtin (works on Python 2.6+ and 3) with a default so
    # that an empty GTF does not leak a bare StopIteration from here.
    first_gene = next(iter(self.gtf.genes), None)

    if first_gene is None:
        # No genes at all: neither optional attribute can be present.
        self.has_isoform = False
        self.has_biotype = False
    else:
        self.has_isoform = 'isoform_id' in first_gene.attributes
        self.has_biotype = 'gene_biotype' in first_gene.attributes
def testGTF(self):
    # One gene with one transcript: every gene/transcript yielded must carry
    # the expected ids, and the exon starts are expected one lower than the
    # values written in the fixture (1001 -> 1000, 1201 -> 1200).
    gtf_text = '''\
chr1|test|exon|1001|1100|0|+|.|gene_id "foo"; transcript_id "bar";
chr1|test|exon|1201|1300|0|+|.|gene_id "foo"; transcript_id "bar";
chr1|test|CDS|1051|1247|0|+|1|gene_id "foo"; transcript_id "bar";
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo"; transcript_id "bar";
chr1|test|stop_codon|1248|1250|0|+|.|gene_id "foo"; transcript_id "bar";
'''.replace('|', '\t')
    gtf = GTF(fileobj=StringIO.StringIO(gtf_text), quiet=True)

    expected_exons = [(1000, 1100), (1200, 1300)]

    for parsed_gene in gtf.genes:
        self.assertEqual(parsed_gene.gene_id, 'foo')
        for txpt in parsed_gene.transcripts:
            self.assertEqual(txpt.transcript_id, 'bar')
            self.assertEqual(list(txpt.exons), expected_exons)
def testGeneOnly(self):
    # A GTF containing only "gene" records: each gene is expected to expose
    # one transcript whose span, and single exon, equal the gene's span.
    gtf_text = '''\
chr1|test|gene|1001|1100|0|+|.|gene_id "foo1"; transcript_id "bar1";
chr1|test|gene|2001|2100|0|+|.|gene_id "foo2"; transcript_id "bar2";
'''.replace('|', '\t')
    gtf = GTF(fileobj=StringIO.StringIO(gtf_text), quiet=True)

    genes = list(gtf.genes)
    self.assertEqual(['foo1', 'foo2'], [g.gene_id for g in genes])

    # gene start / end
    self.assertEqual((1000, 1100), (genes[0].start, genes[0].end))

    # apply start / end to transcript and as an exon
    expected_spans = [(1000, 1100), (2000, 2100)]
    for gene, span in zip(genes, expected_spans):
        first_transcript = list(gene.transcripts)[0]
        self.assertEqual(span, (first_transcript.start, first_transcript.end))
        self.assertEqual(span, first_transcript.exons[0])
def testGTFNoIso(self):
    # No isoform_id attributes here: both transcripts carry gene_id "foo"
    # and are expected on the first gene, with regions merged across them.
    gtf_text = '''\
chr1|test|exon|1001|1100|0|+|.|gene_id "foo"; transcript_id "bar1";
chr1|test|exon|1201|1300|0|+|.|gene_id "foo"; transcript_id "bar1";
chr1|test|exon|1401|1500|0|+|.|gene_id "foo"; transcript_id "bar1";
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo"; transcript_id "bar1";
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo"; transcript_id "bar1";
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo"; transcript_id "bar1";
chr1|test|exon|1001|1100|0|+|.|gene_id "foo"; transcript_id "bar2";
chr1|test|exon|1401|1500|0|+|.|gene_id "foo"; transcript_id "bar2";
chr1|test|CDS|1051|1447|0|+|1|gene_id "foo"; transcript_id "bar2";
chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo"; transcript_id "bar2";
chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo"; transcript_id "bar2";
'''.replace('|', '\t')
    gtf = GTF(fileobj=StringIO.StringIO(gtf_text), quiet=True)

    genes = list(gtf.genes)
    head_gene = genes[0]
    self.assertEqual('foo', head_gene.gene_id)

    transcripts = list(head_gene.transcripts)
    self.assertEqual(len(transcripts), 2)

    # The middle exon exists only in bar1, so its region is flagged False
    # and lists a single transcript.
    expected_regions = [
        (1, 1000, 1100, True, 'bar1,bar2'),
        (2, 1200, 1300, False, 'bar1'),
        (3, 1400, 1500, True, 'bar1,bar2'),
    ]
    self.assertEqual(list(head_gene.regions), expected_regions)
elif arg in ['-frag', '-min']: last = arg elif arg == '-scramble': scramble = True elif arg == '-retain-introns': retain_introns = True elif arg == '-known': known = True elif arg == '-h': usage() elif gtf is None and os.path.exists(arg): gtf = arg elif fasta is None and os.path.exists(arg): if not os.path.exists('%s.fai' % arg): usage('Missing .fai FASTA index for file: %s' % arg) fasta = arg if not gtf or not fasta: usage() if known and scramble: usage("You can not use both -known and -scramble at the same time!") gtf_junctions(GTF(gtf), fasta, frag_size, min_size, known=known, scramble=scramble, retain_introns=retain_introns)
last = None elif arg == '-noheader': header = False elif arg == '-gene_id': gene_id = True elif arg == '-transcript_id': transcript_id = True elif arg == '-gene_name': gene_name = True elif arg == '-gene_location': gene_location = True elif arg in ['-ref', '-pos']: last = arg elif not gtffile and os.path.exists(arg): gtffile = arg elif not infile and os.path.exists(arg): infile = arg else: print 'Unknown argument: %s' % arg if not gtffile: usage('Missing GTF file') if not infile: usage('Missing input file') if not (gene_name or gene_location or gene_id or transcript_id): usage('Missing outputs - nothing to annotate') gtf = GTF(gtffile) gtf_annotate(gtf, infile, ref_col, pos_col, gene_name, gene_location, gene_id, transcript_id, header)
maxtx = max(size, maxtx) maxintron = max(intron_size, maxintron) maxcoding = max(cds, maxcoding) maxutr5 = max(utr5, maxutr5) maxutr3 = max(utr3, maxutr3) cols.append(maxtx) cols.append(maxintron) cols.append(maxcoding) cols.append(maxutr5) cols.append(maxutr3) out.write('%s\n' % '\t'.join([str(x) for x in cols])) if __name__ == '__main__': fname = None for arg in sys.argv[1:]: if arg == '-h': usage() if not fname and os.path.exists(arg): fname = arg else: usage() if not fname: usage() gtf = GTF(fname) gtf_genesize(gtf)
genes, exons, introns, regions, tss, tlss, txs, tlxs, utr_5, utr_3, junc_5, junc_3, promoter ]: if arg: i += 1 if i == 0: usage('You must select one [type] to export.') elif i > 1: usage('You must select *only one* [type] to export.') elif not filename: usage('Missing input file') elif promoter and not (promoter_down or promoter_up): usage('You must specify a valid promoter length!') gtf = GTF(filename) if genes: gtf_genes_tobed(gtf) elif exons: gtf_exons_tobed(gtf) elif introns: gtf_introns_tobed(gtf) elif regions: gtf_regions_tobed(gtf) elif tss: gtf_tss_tobed(gtf) elif tlss: gtf_tlss_tobed(gtf) elif txs: gtf_txs_tobed(gtf)
def testQuery(self):
    # Query chr1:1000-2000 against the GTF fixture at `fname` (defined
    # outside this method) with caching disabled, and check the string form
    # of the first gene returned.
    gtf = GTF(fname, cache_enabled=False)
    genes = list(ngsutils.gtf.query.gtf_query(gtf, 'chr1', 1000, 2000))

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(str(genes[0]), 'foo1(iso1) chr1:1000-2500[+]')
import unittest import StringIO import ngsutils.gtf.genesize from ngsutils.gtf import GTF gtf = GTF(fileobj=StringIO.StringIO('''\ chr1|test|exon|1001|1100|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1" chr1|test|exon|1201|1300|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1" chr1|test|exon|1401|1500|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1" chr1|test|CDS|1051|1447|0|+|1|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1" chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1" chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo1"; transcript_id "bar1"; isoform_id "iso1" chr1|test|exon|1001|1100|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1" chr1|test|exon|1401|1500|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1" chr1|test|CDS|1051|1447|0|+|1|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1" chr1|test|start_codon|1051|1053|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1" chr1|test|stop_codon|1448|1450|0|+|.|gene_id "foo2"; transcript_id "bar2"; isoform_id "iso1" chr1|test|exon|2001|2100|0|+|.|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2" chr1|test|exon|2201|2300|0|+|.|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2" chr1|test|exon|2401|2500|0|+|.|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2" chr1|test|CDS|2051|2447|0|+|1|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2" chr1|test|start_codon|2051|2053|0|+|.|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2" chr1|test|stop_codon|2448|2450|0|+|.|gene_id "foo3"; transcript_id "bar1"; isoform_id "iso2" '''.replace('|', '\t')), quiet=True) class GTFGeneSizeTest(unittest.TestCase): def testGeneSize(self): out = StringIO.StringIO('') ngsutils.gtf.genesize.gtf_genesize(gtf, out=out)
def _write_tag_distribution(stats, tag, asc, fillin_stats):
    # Merge the per-file distribution generators for `tag` into one table on
    # stdout: one row per value, with a (count, pct) column pair per file.
    write = sys.stdout.write

    gens = [stat.distribution_gen(tag) for stat in stats]
    gen_vals = [None] * len(stats)   # pending (value, count, pct) per file
    last_pcts = [0.0] * len(stats)   # last pct written per file, reused on gaps

    step = 1 if asc else -1
    last = None
    good = True

    while good:
        good = False

        # Refill every empty slot from its generator; exhausted generators
        # simply leave their slot empty.
        for i, gen in enumerate(gens):
            if not gen_vals[i]:
                try:
                    gen_vals[i] = next(gen)
                except StopIteration:
                    pass

        vals = [tup[0] for tup in gen_vals if tup]
        if not vals:
            break

        # Next value to emit: smallest when ascending, largest otherwise.
        if asc:
            minval = min(vals)
        else:
            minval = max(vals)

        # Fill in integer values skipped between the previous row and this
        # one. `last is not None` (rather than truthiness) so that a
        # previous value of 0 still triggers the fill-in.
        if last is not None and isinstance(last, int) and fillin_stats:
            filler = last + step
            while (asc and filler < minval) or (not asc and filler > minval):
                write('%s' % filler)
                for pct in last_pcts:
                    write('\t0\t%s' % pct)
                write('\n')
                filler += step

        last = minval
        write(str(minval))
        for i, tup in enumerate(gen_vals):
            if tup and tup[0] == minval:
                write('\t%s\t%s' % (tup[1], tup[2]))
                last_pcts[i] = tup[2]
                gen_vals[i] = None   # consumed; refill on the next pass
                good = True
            else:
                write('\t0\t%s' % (last_pcts[i]))
        write('\n')


def bam_stats(infiles, gtf_file=None, region=None, delim=None, tags=None, show_all=False, fillin_stats=True):
    """Write a tab-delimited statistics report for one or more BAM files.

    Each file in `infiles` gets its own column group. The report is written
    to stdout; a progress message goes to stderr.

    infiles      -- BAM file names; each is opened with bam_open().
    gtf_file     -- optional GTF file name. When given, it is loaded with
                    GTF() and a "Mapping regions" section is added.
    region, delim, tags, show_all
                 -- passed through to BamStats unchanged.
    tags         -- defaults to a fresh empty list per call.
    fillin_stats -- when true, integer values missing from a tag
                    distribution are written as zero-count rows.
    """
    # Avoid sharing one mutable default list between calls.
    if tags is None:
        tags = []

    if gtf_file:
        gtf = GTF(gtf_file)
    else:
        gtf = None

    write = sys.stdout.write

    sys.stderr.write('Calculating Read stats...\n')

    stats = [BamStats(bam_open(x), gtf, region, delim, tags, show_all=show_all) for x in infiles]

    # --- header and read totals ---
    write('\t')
    for fname in infiles:
        write('%s\t\t' % fname)
    write('\n')

    write('Reads:\t')
    for stat in stats:
        write('%s\t\t' % stat.total)
    write('\n')

    write('Mapped:\t')
    for stat in stats:
        write('%s\t\t' % stat.mapped)
    write('\n')

    write('Unmapped:\t')
    for stat in stats:
        write('%s\t\t' % stat.unmapped)
    write('\n')

    # --- flag distribution: only flags seen in at least one file ---
    write('\nFlag distribution\n')

    validflags = set()
    maxsize = 0
    for flag in flag_descriptions:
        for stat in stats:
            if stat.flag_counts.counts[flag] > 0:
                validflags.add(flag)
                maxsize = max(maxsize, len(flag_descriptions[flag]))

    for flag in sorted(validflags):
        write("[0x%03x] %-*s" % (flag, maxsize, flag_descriptions[flag]))
        for stat in stats:
            count = stat.flag_counts.counts[flag]
            # A file with no reads has nothing to take a percentage of.
            if stat.total:
                pct = float(count) * 100 / stat.total
            else:
                pct = 0.0
            write('\t%s\t%0.2f%%' % (count, pct))
        write('\n')
    write('\n')

    # --- template length (only when the first file collected any) ---
    if stats[0].tlen_counts:
        write('Template length:')
        for stat in stats:
            mean, stdev = counts_mean_stdev(stat.tlen_counts)
            write('\t%0.2f\t+/- %0.2f' % (mean, stdev))
        write('\n')
    write('\n')

    # --- per-tag summaries; the tag set is taken from the first file ---
    stat_tags = {}
    for tag in stats[0].tagbins:
        stat_tags[tag] = [stat.tagbins[tag] for stat in stats]

    for tag in stat_tags:
        asc = stats[0].tagbins[tag].asc

        # Each value is followed by a tab so it fills a two-column group.
        write("Ave %s:" % tag)
        for tagbin in stat_tags[tag]:
            write('\t%s\t' % tagbin.mean)
        write('\n')

        write("Max %s:" % tag)
        for tagbin in stat_tags[tag]:
            write('\t%s\t' % tagbin.max)
        write('\n')

        write('%s distribution:\n' % tag)
        _write_tag_distribution(stats, tag, asc, fillin_stats)
        write('\n')

    # --- per-reference counts; reference names come from the first file ---
    write('Reference counts')
    for stat in stats:
        write('\tcount\t')
    write('\n')
    for k in sorted(stats[0].refs):
        write('%s' % k)
        for stat in stats:
            write('\t%s\t' % stat.refs[k])
        write('\n')

    # --- GTF region counts with counts-per-million ---
    if gtf_file:
        write('Mapping regions')
        for stat in stats:
            write('\tcount\tCPM')
        write('\n')

        for k in sorted(stats[0].regiontagger.counts):
            write('%s' % k)
            for stat in stats:
                count = stat.regiontagger.counts[k]
                # CPM = count per million mapped reads, i.e.
                # count / (mapped / 1e6); guard against no mapped reads.
                if stat.mapped:
                    cpm = float(count) * 1000000 / stat.mapped
                else:
                    cpm = 0.0
                write('\t%s\t%s' % (count, cpm))
            write('\n')