def read_transcripts(genes_fn, seqnames):
    '''
    Collect the transcripts described by a GTF file.

    Rows are read with gtf.gtf_file(genes_fn). A row is used only when its
    seqname is contained in `seqnames`; every such row is handed to
    add_exon() on the transcript object stored under the row's
    'transcript_id' attribute (objects are created on first use).

    Progress messages are written to stderr.

    Returns the values of the transcript_id -> transcript mapping.
    '''
    stderr.write('parsing gtf ... ')

    by_transcript_id = defaultdict(transcript)

    # Lazily drop rows on sequences the caller did not ask for.
    selected = (rec for rec in gtf.gtf_file(genes_fn)
                if rec.seqname in seqnames)
    for rec in selected:
        transcript_id = rec.attributes['transcript_id']
        by_transcript_id[transcript_id].add_exon(rec)

    summary = 'done. ({0} transcripts)\n'.format(len(by_transcript_id))
    stderr.write(summary)

    return by_transcript_id.values()
def read_genes(genes_fn):
    '''
    For each gene in the GTF file, compute the union of all exons, in
    sorted order.

    Rows are read with gtf.gtf_file(genes_fn). Only rows whose feature is
    'exon' and that carry a 'gene_id' attribute are used; each is passed to
    add_row() on the gene object stored under that gene_id (objects are
    created on first use). Once the whole file has been read, flatten() is
    called on every gene -- presumably the step that merges and sorts the
    exons; see the gene class to confirm.

    Progress messages are written to stderr.

    Returns the gene_id -> gene mapping (a defaultdict, so looking up an
    unknown gene_id on the result creates a new empty gene).
    '''
    stderr.write('parsing GTF file ... ')

    genes = defaultdict(gene)
    for row in gtf.gtf_file(genes_fn):
        if row.feature != 'exon':
            continue
        if 'gene_id' not in row.attributes:
            continue
        genes[row.attributes['gene_id']].add_row(row)

    stderr.write('done. ({0} genes)\n'.format(len(genes)))

    # values() instead of the Python-2-only itervalues(): it behaves the same
    # here and also works on Python 3.
    for g in genes.values():
        g.flatten()

    return genes
#!/usr/bin/env python
# Write one tab-separated line per exon row of a GTF file to stdout with the
# columns: seqname, start - 1, end, gene_id, score (always 0), strand.
#
# usage: gtf_exons.py genes.gtf

from gtf import gtf_file
from sys import argv, stdout, stderr, stdin

if len(argv) < 2:
    stderr.write('usage: gtf_exons.py genes.gtf\n')
    # Stop here: without a file argument the loop below would fail on
    # argv[1] with an IndexError.
    raise SystemExit(1)

for row in gtf_file(argv[1]):
    if row.feature != 'exon':
        continue

    stdout.write('{seqname}\t{start}\t{end}\t{name}\t{score}\t{strand}\n'.format(
        seqname=row.seqname,
        # The start coordinate is shifted down by one on output; the end is
        # written unchanged.
        start=int(row.start) - 1,
        end=row.end,
        name=row.attributes['gene_id'],
        score=0,
        strand=row.strand))