/
getallcDNAseqs.py
143 lines (111 loc) · 4.43 KB
/
getallcDNAseqs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#Given a gff annotation, make a fasta for all transcript sequences.
import gffutils
import os
from Bio import SeqIO
import argparse
import gzip
#tags we dont want
#cds_end_NF, cds_start_NF, mRNA_end_NF, mRNA_start_NF, TSL = 4, TSL = 5
def filtertranscripts(transcript, db):
attributepass = False
tslpass = False
atts = transcript.attributes
if 'tag' not in atts or ('mRNA_end_NF' not in atts['tag'] and 'mRNA_start_NF' not in atts['tag'] and 'cds_start_NF' not in atts['tag'] and 'cds_start_NF' not in atts['tag']):
attributepass = True
try:
tsl = transcript.attributes['transcript_support_level'][0]
if tsl == 'NA':
pass
elif int(tsl) <= 3:
tslpass = True
except KeyError:
pass
if attributepass and tslpass:
return True
else:
return False
def getcDNAcoords(gff, usefilters):
cDNAcoords = {} #{ENSTRANS_chrm_strand: [[exon1start, exon1stop], [exon2start, exon2stop]]}
genecount = 0
#Make gff database
print 'Indexing gff...'
gff_fn = gff
db_fn = os.path.basename(gff_fn) + '.db'
if os.path.isfile(db_fn) == False:
gffutils.create_db(gff_fn, db_fn, merge_strategy = 'merge', verbose = True)
db = gffutils.FeatureDB(db_fn)
print 'Done indexing!'
genes = db.features_of_type('gene')
for gene in genes:
genecount +=1
if genecount % 10000 == 0:
print 'Gene {0}...'.format(genecount)
for transcript in db.children(gene, featuretype = 'transcript', order_by = 'start', level = 1):
transcriptpassesfilters = filtertranscripts(transcript, db)
if usefilters and not transcriptpassesfilters:
continue
transcriptname = str(transcript.id) + '_' + str(transcript.chrom) + '_' + str(transcript.strand)
exoncoords = [] #[[exon1start, exon1stop], [exon2start, exon2stop]]
for exon in db.children(transcript, featuretype = 'exon', order_by = 'start', level = 1):
exoncoords.append([exon.start, exon.end])
if exoncoords:
cDNAcoords[transcriptname] = exoncoords
return cDNAcoords
def cDNAcoordstoseq(cDNAcoords, genomefasta, outputfasta):
cDNAseqs = {} #{ENSTRANS : seq}
print 'Indexing genome sequence...'
seq_dict = SeqIO.to_dict(SeqIO.parse(gzip.open(genomefasta), 'fasta'))
print 'Done indexing!'
chrmswithoutseq = [] #Chromsome names that are in allCDScoords but that don't have a fasta entry in genomefasta
for transcript in cDNAcoords:
cDNAseq = ''
#Ensembl vs UCSC transcript names
#Ensembl
if 'NM_' not in transcript and 'NR_' not in transcript:
tname = transcript.split('_')[0].replace('transcript:', '') #in cDNAcoords txnames are like 'transcript:ENSMUST00000038375_chr2_+'
chrm = transcript.split('_')[1]
strand = transcript.split('_')[2]
#UCSC
elif 'NM_' in transcript or 'NR_' in transcript:
if ':' in transcript:
tname = transcript.split('_chr')[0].split(':')[1] #transcript:NM_175684_chr18_-
else:
tname = transcript.split('_chr')[0]
chrm = transcript.split('_')[-2]
strand = transcript.split('_')[-1]
#Is this chromosome in genomefasta?
if chrm not in seq_dict:
if chrm not in chrmswithoutseq:
print 'WARNING: No entry for chromosome {0} in genomefasta.'.format(chrm)
print transcript
chrmswithoutseq.append(chrm)
continue
for exon in cDNAcoords[transcript]:
start = exon[0]
end = exon[1]
if strand == '+':
exonseq = seq_dict[chrm].seq[start-1:end].upper()
cDNAseq += exonseq
elif strand == '-':
exonseq = seq_dict[chrm].seq[start-1:end].reverse_complement().upper()
newseq = exonseq + cDNAseq
cDNAseq = newseq
cDNAseqs[tname] = cDNAseq
print 'Found sequence data for {0} of {1} transcripts.'.format(len(cDNAseqs), len(cDNAcoords))
with open(outputfasta, 'w') as f:
for tname in cDNAseqs:
seq = str(cDNAseqs[tname])
#If transcript IDs have a dot (like ENSMUST0000046543.2)
if '.' in tname:
tname = tname.split('.')[0]
f.write('>' + tname + '\n' + seq + '\n')
return cDNAseqs
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--gff', type = str, help = 'GFF annotation of transcripts.')
parser.add_argument('--genomefasta', type = str, help = 'Genome sequence in fasta format.')
parser.add_argument('--outputfasta', type = str, help = 'Output file for cDNA sequences.')
parser.add_argument('--usefilters', action = 'store_true', help = 'Do you want all transcripts or only those that have reasonable evidence?')
args = parser.parse_args()
cDNAcoords = getcDNAcoords(args.gff, args.usefilters)
cDNAcoordstoseq(cDNAcoords, args.genomefasta, args.outputfasta)