Esempio n. 1
0
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*
'''
Author: Mei Hou
Get the transcript annotation from TangX's V4expCaculate.combined.gtf
Include: 'transcript_id', 'gene_id', 'gene_biotype', 'transcript_type', 'source', 'gene_name', 'transcript_name'
Note: if an annotation type does not exist for a transcript, it is reported as '-'
'''

import sys
import modify_gtf as gtf

# Transcript id of the most recently reported transcript. Consecutive exon
# records that share a transcript id produce a single output row.
curTransId = ''
# Output columns, in order. 'source' is taken from the second GTF column;
# every other value is looked up in the result of gtf.processAnnotation().
tAnnoType = ('transcript_id', 'gene_id', 'gene_biotype', 'transcript_type',
             'source', 'gene_name', 'transcript_name')
print('\t'.join(tAnnoType))
# Stream stdin line by line instead of loading the whole GTF via readlines().
for line in sys.stdin:
    # Blank lines and '#' header/comment lines have no feature column and
    # would raise IndexError below, so skip them.
    if not line.strip() or line.startswith('#'):
        continue
    li = line.rstrip('\n').split('\t')
    # Only look at exon records, in case other features (e.g. UTRs) are not
    # completely annotated.
    if li[2] != 'exon':
        continue
    dAnnos = gtf.processAnnotation(li[8])
    dAnnos['source'] = li[1]
    if curTransId != dAnnos['transcript_id']:
        curTransId = dAnnos['transcript_id']
        # Annotation types missing for this transcript are written as '-'.
        print('\t'.join([dAnnos.get(at, '-') for at in tAnnoType]))
Esempio n. 2
0
# Map cluster ID (field index 4) to [field 6] followed by fields 9-11 of each
# row of the cluster annotation file.
dClusterAnno = {}
# Iterate the file object directly instead of materializing every line with
# readlines().
for ca in clusterAnnoFile:
	ca = ca.rstrip('\n').split('\t')
	dClusterAnno[ca[4]] = [ca[6]] + ca[9:12]


# tidy the intersect file
# NOTE(review): 'ReadCout' looks like a typo for 'ReadCount'; it is left as-is
# because it is part of the emitted header line.
print('\t'.join([
	'ClusterID', 'Chromosome', 'Strand', 'ClusterLength', 'OverlapLength',
	'TranscriptID', 'BioType', 'Feature', 'GeneName', 'GeneID', 'ReadCout',
	'ConversionLocationCount', 'ConversionEventCount',
	'NonConversionEventCount'
]))
# The path name is rebound to the opened file object from here on.
intersectFile = open(intersectFile)
# Stream the intersect file line by line rather than via readlines().
for li in intersectFile:
	li = li.rstrip('\n').split('\t')
	clusterId = li[12]
	feature = li[2]

	# need to count by biotype
	dTransBasicAnno = gtf.processAnnotation(li[8])
	transId = dTransBasicAnno['transcript_id']

	# Abort if the transcript has no entry in dTransAnno.
	if transId not in dTransAnno:
		sys.stderr.write(transId + ' is not in the annotation file!\n')
		sys.exit(1)
	# Abort if the cluster has no entry in dClusterAnno.
	if clusterId not in dClusterAnno:
		sys.stderr.write(clusterId + ' is not in the annotation file!\n')
		sys.exit(1)

	transAnno = dTransAnno[transId]
	biotype = transAnno[1]

	clusterAnno = dClusterAnno[clusterId]
Esempio n. 3
0
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*
import sys
import modify_gtf as gtf
from collections import defaultdict

# Check that all records of a transcript are contiguous in the input read
# from stdin. ddTrans counts, per transcript_id, the number of separate
# consecutive runs in which that id appears; a count above 1 means the
# transcript's records are split across the file.
ddTrans = defaultdict(int)
curTransId = ''
ok = True
# Stream stdin line by line instead of loading it all via readlines().
for line in sys.stdin:
    # Blank lines and '#' header/comment lines have no attribute column and
    # would raise IndexError on li[8], so skip them.
    if not line.strip() or line.startswith('#'):
        continue
    li = line.rstrip('\n').split('\t')
    dAnnos = gtf.processAnnotation(li[8])
    lineTransId = dAnnos['transcript_id']
    if lineTransId != curTransId:
        # A new run starts whenever the transcript id changes.
        curTransId = lineTransId
        ddTrans[curTransId] += 1

# Report every transcript that was seen in more than one run.
# NOTE(review): no exit status is set here, so a failed check is only
# signalled through stderr -- confirm callers do not rely on the return code.
for k, v in ddTrans.items():
    if v > 1:
        sys.stderr.write(k + ' is repeated!\n')
        ok = False

if ok:
    print('The file is well sorted by transcripts!')
Esempio n. 4
0
# tidy the intersect file
# NOTE(review): 'ReadCout' looks like a typo for 'ReadCount'; it is left as-is
# because it is part of the emitted header line.
print('\t'.join([
    'ClusterID', 'Chromosome', 'Strand', 'ClusterLength', 'OverlapLength',
    'TranscriptID', 'BioType', 'Feature', 'GeneName', 'GeneID', 'ReadCout',
    'ConversionLocationCount', 'ConversionEventCount',
    'NonConversionEventCount'
]))
# The path name is rebound to the opened file object from here on.
intersectFile = open(intersectFile)
# Stream the intersect file line by line rather than via readlines().
for li in intersectFile:
    li = li.rstrip('\n').split('\t')
    clusterId = li[12]
    feature = li[2]

    # need to count by biotype
    dTransBasicAnno = gtf.processAnnotation(li[8])
    transId = dTransBasicAnno['transcript_id']

    # Abort if the transcript has no entry in dTransAnno.
    if transId not in dTransAnno:
        sys.stderr.write(transId + ' is not in the annotation file!\n')
        sys.exit(1)
    # Abort if the cluster has no entry in dClusterAnno.
    if clusterId not in dClusterAnno:
        sys.stderr.write(clusterId + ' is not in the annotation file!\n')
        sys.exit(1)

    transAnno = dTransAnno[transId]
    biotype = transAnno[1]

    clusterAnno = dClusterAnno[clusterId]