#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
'''
Author: Mei Hou

Get the transcript annotation from TangX's V4expCaculate.combined.gtf

Include: 'transcript_id', 'gene_id', 'gene_biotype', 'transcript_type',
         'source', 'gene_name', 'transcript_name'

Note: if an annotation type does not exist for a transcript, it is '-'

Input is read line by line from stdin; a header row followed by one
tab-separated row per run of consecutive exon lines sharing the same
transcript_id is written to stdout.
'''
import sys

import modify_gtf as gtf

curTransId = ''
tAnnoType = ('transcript_id', 'gene_id', 'gene_biotype', 'transcript_type',
             'source', 'gene_name', 'transcript_name')

# Header row.
sys.stdout.write('\t'.join(tAnnoType) + '\n')

# Iterate over stdin lazily instead of materialising the whole input with
# readlines(); annotation files can be large.
for line in sys.stdin:
    # Blank lines and '#' comment/header lines carry no tab-separated
    # record; indexing into them would raise IndexError.
    if not line.strip() or line.startswith('#'):
        continue

    li = line.rstrip('\n').split('\t')

    # Only look at exon records, in case other features (e.g. UTRs) are not
    # fully annotated.
    if li[2] != 'exon':
        continue

    # NOTE(review): processAnnotation is assumed to return a dict mapping
    # attribute names to string values -- confirm against modify_gtf.
    dAnnos = gtf.processAnnotation(li[8])
    dAnnos['source'] = li[1]

    # Emit a row only when the transcript changes, so consecutive exons of
    # the same transcript produce a single output line.
    if curTransId == dAnnos['transcript_id']:
        continue
    curTransId = dAnnos['transcript_id']

    # Missing annotation types are reported as '-'.
    lOut = [dAnnos.get(at, '-') for at in tAnnoType]
    sys.stdout.write('\t'.join(lOut) + '\n')
# Index the cluster annotation rows by the value in field index 4; each entry
# keeps field index 6 followed by field indices 9, 10 and 11.
# NOTE(review): clusterAnnoFile is defined outside this fragment; presumably
# an already-open, tab-separated file handle -- confirm.
dClusterAnno = {}
for ca in clusterAnnoFile.readlines():
    ca = ca.rstrip('\n').split('\t')
    dClusterAnno[ca[4]] = [ca[6]] + ca[9:12]

# tidy the intersect file
# Header row of the tidied output.
# NOTE(review): 'ReadCout' is probably meant to be 'ReadCount'; it is emitted
# output, so confirm downstream consumers before changing it.
print '\t'.join(['ClusterID', 'Chromosome', 'Strand', 'ClusterLength',
                 'OverlapLength', 'TranscriptID', 'BioType', 'Feature',
                 'GeneName', 'GeneID', 'ReadCout', 'ConversionLocationCount',
                 'ConversionEventCount', 'NonConversionEventCount'])
# NOTE(review): the path variable is rebound to the file object and the
# handle is never closed within the visible code.
intersectFile = open(intersectFile)
for li in intersectFile.readlines():
    li = li.rstrip('\n').split('\t')
    # Field index 12 holds the cluster ID, field index 2 the feature.
    clusterId = li[12]
    feature = li[2]
    # need to count by biotype
    # Field index 8 is parsed by gtf.processAnnotation to obtain the
    # transcript ID.
    dTransBasicAnno = gtf.processAnnotation(li[8])
    transId = dTransBasicAnno['transcript_id']
    # check: abort with exit status 1 if the transcript is not a key of
    # dTransAnno (defined outside this fragment).
    if not transId in dTransAnno:
        sys.stderr.write(transId + ' is not in the annotation file!\n')
        sys.exit(1)
    # check: abort with exit status 1 if the cluster is not a key of
    # dClusterAnno.
    if not clusterId in dClusterAnno:
        sys.stderr.write(clusterId + ' is not in the annotation file!\n')
        sys.exit(1)
    transAnno = dTransAnno[transId]
    # The second element of the transcript annotation is used as the biotype.
    biotype = transAnno[1]
    clusterAnno = dClusterAnno[clusterId]
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
'''
Check that the annotation lines read from stdin are grouped by transcript.

Every time the transcript_id differs from the one on the previous record, a
new run is counted for that transcript.  A transcript with more than one run
appears in non-adjacent places, so it is reported on stderr as repeated.
When no transcript is repeated, a confirmation message is written to stdout.
'''
import sys
from collections import defaultdict

import modify_gtf as gtf

# transcript_id -> number of separate runs of consecutive lines it starts.
ddTrans = defaultdict(int)
curTransId = ''
ok = 1

# Iterate over stdin lazily instead of materialising the whole input with
# readlines(); annotation files can be large.
for line in sys.stdin:
    # Blank lines and '#' comment/header lines carry no tab-separated
    # record; indexing into them would raise IndexError.
    if not line.strip() or line.startswith('#'):
        continue

    li = line.rstrip('\n').split('\t')
    # NOTE(review): processAnnotation is assumed to return a mapping that
    # contains 'transcript_id' -- confirm against modify_gtf.
    dAnnos = gtf.processAnnotation(li[8])
    lineTransId = dAnnos['transcript_id']

    # A change of transcript_id marks the start of a new run.
    if lineTransId != curTransId:
        curTransId = lineTransId
        ddTrans[curTransId] += 1

# Report every transcript that started more than one run.
for k, v in ddTrans.items():
    if v > 1:
        sys.stderr.write(k + ' is repeated!\n')
        ok = 0

if ok == 1:
    sys.stdout.write('The file is well sorted by transcripts!\n')
# tidy the intersect file print '\t'.join([ 'ClusterID', 'Chromosome', 'Strand', 'ClusterLength', 'OverlapLength', 'TranscriptID', 'BioType', 'Feature', 'GeneName', 'GeneID', 'ReadCout', 'ConversionLocationCount', 'ConversionEventCount', 'NonConversionEventCount' ]) intersectFile = open(intersectFile) for li in intersectFile.readlines(): li = li.rstrip('\n').split('\t') clusterId = li[12] feature = li[2] # need to count by biotype dTransBasicAnno = gtf.processAnnotation(li[8]) transId = dTransBasicAnno['transcript_id'] # check if not transId in dTransAnno: sys.stderr.write(transId + ' is not in the annotation file!\n') sys.exit(1) # check if not clusterId in dClusterAnno: sys.stderr.write(clusterId + ' is not in the annotation file!\n') sys.exit(1) transAnno = dTransAnno[transId] biotype = transAnno[1] clusterAnno = dClusterAnno[clusterId]