-
Notifications
You must be signed in to change notification settings - Fork 0
/
transdecoder_to_genome_test.py
executable file
·300 lines (266 loc) · 14.4 KB
/
transdecoder_to_genome_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
#!/usr/bin/env python
import unittest
import string
from transdecoder_to_genome import Exon
from transdecoder_to_genome import Transcript
from transdecoder_to_genome import build_coordinates
class GFFLineTest(unittest.TestCase):
    """Tests for building ``Exon`` and ``Transcript`` objects from GFF lines.

    The fixtures are created in :meth:`setUp`; note that ``self.exon1``,
    ``self.exon2`` and ``self.exon3`` are ``Transcript`` instances built from
    transdecoder "gene" lines, while ``self.actual_exon*`` are ``Exon``
    instances built from Cufflinks "exon" lines.
    """

    def setUp(self):
        """Create the Exon/Transcript fixtures shared by every test."""
        ret = unittest.TestCase.setUp(self)
        exon1 = "TCONS_00001425 . gene 1439 1564 . - . ID=g.11422;Name=ORF%20g.11422%20m.11422%20type%3A5prime_partial%20len%3A42%20%28-%29"
        exon2 = "TCONS_00001425 . gene 1043 1231 . + . ID=g.11419;Name=ORF%20g.11419%20m.11419%20type%3Acomplete%20len%3A63%20%28%2B%29"
        exon3 = "TCONS_00001425 . gene 1 1062 . + . ID=g.11414;Name=ORF%20g.11414%20m.11414%20type%3A5prime_partial%20len%3A354%20%28%2B%29"
        self.actual_exon = Exon("Spolyrrhiza9509S001 Cufflinks exon 26722 26764 . - . Parent=TCONS_00001425")
        self.actual_trans = Transcript(exon1)
        actual_exon1 = "Spolyrrhiza9509S001 Cufflinks exon 24497 24656 . - . Parent=TCONS_00001425"
        actual_exon2 = "Spolyrrhiza9509S001 Cufflinks exon 24809 24938 . - . Parent=TCONS_00001425"
        actual_exon3 = "Spolyrrhiza9509S001 Cufflinks exon 26237 26599 . - . Parent=TCONS_00001425"
        self.actual_exon1 = Exon(actual_exon1)
        self.actual_exon2 = Exon(actual_exon2)
        self.actual_exon3 = Exon(actual_exon3)
        self.exon1 = Transcript(exon1)
        self.exon2 = Transcript(exon2)
        self.exon3 = Transcript(exon3)
        return ret

    # Tests comparison on RNAseq format
    def testTranscriptComparison(self):
        """Check the ordering and equality operators between transcripts."""
        self.assertGreater(self.exon2, self.exon1)
        self.assertGreater(self.exon3, self.exon1)
        self.assertGreater(self.exon2, self.exon3)
        self.assertLess(self.exon1, self.exon2)
        self.assertLess(self.exon1, self.exon3)
        self.assertLess(self.exon3, self.exon2)
        self.assertEqual(self.exon1, self.exon1)

    def testBaseGetDicFormat(self):
        """``getDicFormat`` returns the parent name plus integer coordinates."""
        actual = self.actual_exon.getDicFormat()
        expected = {"name": "TCONS_00001425", "start": 26722, "stop": 26764}
        self.assertEqual(actual, expected)

    def testBaseConstructor(self):
        """Every field parsed from the GFF line is exposed on the object."""
        self.assertEqual(self.exon1.start, 1439)
        self.assertEqual(self.exon1.stop, 1564)
        self.assertEqual(self.exon1.getLabel(), "gene")
        self.assertEqual(self.exon1.parent, "TCONS_00001425")
        self.assertEqual(len(self.exon1), 125)
        self.assertEqual(str(self.exon1), "TCONS_00001425 . gene 1439 1564 . - . ID=g.11422;Name=ORF%20g.11422%20m.11422%20type%3A5prime_partial%20len%3A42%20%28-%29")
        self.assertEqual(self.exon1.partial, "partial")
        self.assertEqual(self.exon1.strand, "-")

    def testBadPositions(self):
        """``Exon`` must raise when start > stop and when start == stop.

        ``assertRaises`` is used instead of ``try``/``self.fail``/bare
        ``except`` because the bare ``except`` also swallowed the
        ``AssertionError`` raised by ``self.fail``, so the test could never
        fail.
        """
        # Start position lies after the stop position.
        tester = "TCONS_00001425 . gene 2000 1564 . - . ID=g.11422;Name=ORF%20g.11422%20m.11422%20type%3A5prime_partial%20len%3A42%20%28-%29"
        with self.assertRaises(Exception):
            Exon(tester)

        # Start position equals the stop position.
        tester = "TCONS_00001425 . gene 1564 1564 . - . ID=g.11422;Name=ORF%20g.11422%20m.11422%20type%3A5prime_partial%20len%3A42%20%28-%29"
        with self.assertRaises(Exception):
            Exon(tester)

    def testSetLenth(self):
        """``setLength`` overrides the value reported by ``len()``."""
        self.exon1.setLength(2222)
        self.assertEqual(len(self.exon1), 2222)

    def testSwitchStrand(self):
        """``switchStrand`` flips a "-" strand to "+"."""
        self.exon1.switchStrand()
        self.assertEqual(self.exon1.strand, "+")

    def testGetExonDicFormat(self):
        """``getExonDicList`` returns one dict per exon added to the transcript."""
        temp = self.actual_trans
        temp.addExon(self.actual_exon1)
        temp.addExon(self.actual_exon2)
        temp.addExon(self.actual_exon3)
        actual = temp.getExonDicList()
        expected = [
            {"start": 24497, "stop": 24656, "name": "TCONS_00001425"},
            {"start": 24809, "stop": 24938, "name": "TCONS_00001425"},
            {"start": 26237, "stop": 26599, "name": "TCONS_00001425"},
        ]
        self.assertEqual(actual, expected)
"""
The following tests show that the strand sign shown in the gff file is irrelevant: the coordinates
remain the same regardless.
That is, if a negative sign is shown in the transdecoder.gff3 file, the output coordinates still work; to get the data you must
slice the subsequence from exons.fasta, then reverse-complement ("reverse transcribe") it.
This will yield the sequence found in the transdecoder.cds file.
"""
class TransDecoderStrandednessTest(unittest.TestCase):
    """Check transdecoder ORF coordinates against sequences stored on disk.

    Each test slices a sequence loaded from ``test/<case>.gm`` with the
    coordinates reported by transdecoder and compares it with the matching
    ``test/<case>.orf`` sequence. When transdecoder reports "-", the slice is
    reversed and complemented first.
    """

    @staticmethod
    def _read_sequence(path):
        """Return the content of ``path`` with each line stripped and joined."""
        with open(path, "r") as handle:
            return "".join(line.strip() for line in handle)

    def setUp(self):
        """Load the eight fixture sequences and build the complement table.

        The fixture paths are relative to the current working directory.
        """
        ret = unittest.TestCase.setUp(self)
        # ``string.maketrans`` only exists on Python 2 and ``str.maketrans``
        # only on Python 3; pick whichever is available so setUp runs on both.
        maketrans = getattr(str, "maketrans", None) or string.maketrans
        self.trans = maketrans("ATCGR", "TAGCY")
        self.plusplusgen = self._read_sequence("test/plusplus.gm")
        self.plusplusorf = self._read_sequence("test/plusplus.orf")
        self.plusminusgen = self._read_sequence("test/plusminus.gm")
        self.plusminusorf = self._read_sequence("test/plusminus.orf")
        self.minusplusgen = self._read_sequence("test/minusplus.gm")
        self.minusplusorf = self._read_sequence("test/minusplus.orf")
        self.minusminusgen = self._read_sequence("test/minusminus.gm")
        self.minusminusorf = self._read_sequence("test/minusminus.orf")
        return ret

    def testCorrectPlusPlus(self):
        """
        Plus Plus works exactly as expected. No reverse transcription required. ATG appears immediately
        at final[522:3012], and the sequence in the transdecoder.cds matches the subsequence.
        """
        gene = {'start': 74641, 'stop': 78761, 'name': 'plusplus',
                'exons': ({'start': 74641, 'stop': 76014},
                          {'start': 76111, 'stop': 76277},
                          {'start': 76368, 'stop': 76465},
                          {'start': 76558, 'stop': 76914},
                          {'start': 77029, 'stop': 77445},
                          {'start': 77628, 'stop': 78761},
                          )}
        transcript = {'name': 'plusplus', 'start': 523, 'stop': 3012}
        # The result is not asserted on yet (the original author left this as
        # an open question); the call is kept so that it at least must not raise.
        build_coordinates(gene, transcript)
        # should equal start-1:stop
        self.assertEqual(self.plusplusgen[522:3012], self.plusplusorf)

    def testPlusMinus(self):
        # + - CASE FOR PARTIAL
        """
        final[878:1190][::-1].translate(transtab) where final is the exons.fasta for something that was originally +.
        and the gff3 file said it is negative works correctly. That is, our coordinates provided by the gff3 file are
        correct for + in the original and - in the new. This means to get the sequence it needs to be reverse
        transcribed, but the coordinates themselves are correct.
        TCONS_00000005
        """
        # ``gene`` and ``transcript`` record where the slice below comes from;
        # they are not passed to build_coordinates in this test.
        gene = {'start': 67878, 'stop': 74308, 'name': 'plusmin',
                'exons': ({'start': 67878, 'stop': 68085},
                          {'start': 68882, 'stop': 68943},
                          {'start': 69034, 'stop': 69141},
                          {'start': 69683, 'stop': 69762},
                          {'start': 71555, 'stop': 71717},
                          {'start': 72187, 'stop': 72261},
                          {'start': 72370, 'stop': 72458},
                          {'start': 73151, 'stop': 73270},
                          {'start': 73918, 'stop': 73962},
                          {'start': 74066, 'stop': 74308},
                          )}
        transcript = {'name': 'plusmin', 'start': 879, 'stop': 1190}
        # reverse
        self.assertEqual(self.plusminusgen[878:1190][::-1].translate(self.trans), self.plusminusorf)

    def testMinusPlus(self):
        # - + case
        """
        Minus plus immediately finds the ATG at the start of the genome sequence, this is as expected,
        as the gff3 provided by transdecoder said +, and the genome sequence in exons.fasta and the
        transdecoder coding sequence agree.
        found at final[0:957]
        TCONS_00001426
        """
        # ``gene`` and ``transcript`` record where the slice below comes from;
        # they are not passed to build_coordinates in this test.
        gene = {'start': 26167, 'stop': 27852, 'name': 'minplus',
                'exons': ({'start': 26167, 'stop': 26599},
                          {'start': 26722, 'stop': 26764},
                          {'start': 27215, 'stop': 27494},
                          {'start': 27650, 'stop': 27852},
                          )}
        transcript = {'name': 'minplus', 'start': 1, 'stop': 957}
        # forward
        self.assertEqual(self.minusplusgen[0:957], self.minusplusorf)

    def testMinusMinus(self):
        # - - case
        """
        final[1003:1327][::-1].translate(trans) where final is the exons.fasta for something that was originally -.
        The gff3 file from transdecoder also said -, which means to reverse transcribe the fasta file to get what was
        in the coding sequence. These are the coordinates found in the gff3, so we know they are correct.
        TCONS_00015061
        """
        # ``gene`` and ``transcript`` record where the slice below comes from;
        # they are not passed to build_coordinates in this test.
        gene = {'start': 5731958, 'stop': 5744344, 'name': 'minmin',
                'exons': ({'start': 5731958, 'stop': 5732148},
                          {'start': 5732445, 'stop': 5733573},
                          {'start': 5733673, 'stop': 5733929},
                          {'start': 5735453, 'stop': 5736066},
                          {'start': 5736111, 'stop': 5736553},
                          {'start': 5736642, 'stop': 5736953},
                          {'start': 5737039, 'stop': 5738345},
                          {'start': 5738598, 'stop': 5738961},
                          {'start': 5739248, 'stop': 5739641},
                          {'start': 5739724, 'stop': 5740829},
                          {'start': 5741250, 'stop': 5743397},
                          {'start': 5743651, 'stop': 5744344},
                          )}
        transcript = {'name': 'minmin', 'start': 1004, 'stop': 1327}
        # reverse
        self.assertEqual(self.minusminusgen[1003:1327][::-1].translate(self.trans), self.minusminusorf)
class BuildCoordinateTest(unittest.TestCase):
    """
    Tests mapping of transcript-relative coordinates onto a gene's exons
    via ``build_coordinates``.

    previous bugs found:
    If the transcript coordinate was longer than the physical sequence it would break,
    off by 2 in 3prime partial and on the last exon.
    CDS would skip the 2nd to last place and put it at the end
    Transcript names were incorrect (off by one or so)
    """

    def setUp(self):
        """Create the six-exon gene shared by every test."""
        ret = unittest.TestCase.setUp(self)
        #101 - 1000 length of 900
        # NOTE(review): the trailing comments give the transcript-relative
        # positions each exon is expected to cover; the last exon spans two
        # genomic bases (999-1000) but is annotated with 647 only - confirm
        # against build_coordinates.
        self.gene = {"start": 101, "stop": 1000, "name": "off1end", "exons":
                     [{"start": 101, "stop": 105},    #1-5
                      {"start": 111, "stop": 200},    #6-95
                      {"start": 225, "stop": 275},    #96-146
                      {"start": 301, "stop": 700},    #147-546
                      {"start": 801, "stop": 900},    #547-646
                      {"start": 999, "stop": 1000}]}  #647
        return ret

    def testPerfectMatch(self):
        """A transcript spanning 1-647 maps back onto every exon unchanged."""
        trans = {"start": 1, "stop": 647, "name": "off1end"}
        cds = build_coordinates(self.gene, trans)
        self.assertEqual(self.gene['exons'], cds, "Full gene does not match.") #should map perfectly

    def testTooLong(self):
        """A stop one past the end (648) still yields the full exon list."""
        trans = {"start": 1, "stop": 648, "name": "off1end"}
        cds = build_coordinates(self.gene, trans)
        self.assertEqual(self.gene['exons'], cds, "Did not correct for an input that goes past maximum length")

    def testTooShort(self):
        """A stop of 646 must leave out the final exon."""
        trans = {"start": 1, "stop": 646, "name": "off1end"}
        cds = build_coordinates(self.gene, trans)
        expect = [{"start": 101, "stop": 105},
                  {"start": 111, "stop": 200},
                  {"start": 225, "stop": 275},
                  {"start": 301, "stop": 700},
                  {"start": 801, "stop": 900},
                  ]
        self.assertEqual(expect, cds, "included last region") #should map off by one

    def testMiddle(self):
        """A transcript starting on an exon boundary and ending mid-exon."""
        trans = {"start": 147, "stop": 550, "name": "off1end"}
        cds = build_coordinates(self.gene, trans)
        expect = [{"start": 301, "stop": 700},
                  {"start": 801, "stop": 804},
                  ]
        self.assertEqual(expect, cds, "included last region")

    def testSingleExon(self):
        """A transcript contained entirely within the second exon."""
        trans = {"start": 6, "stop": 48, "name": "off1end"}
        cds = build_coordinates(self.gene, trans)
        expect = [{"start": 111, "stop": 153}]
        self.assertEqual(expect, cds)

    def testMismatchNames(self):
        """``build_coordinates`` must raise when gene and transcript names differ.

        ``assertRaises`` replaces the former ``try``/``self.fail``/bare
        ``except`` construct, whose bare ``except`` also swallowed the
        ``AssertionError`` from ``self.fail`` and therefore always passed.
        """
        trans = {"start": 1, "stop": 648, "name": "not a name"}
        with self.assertRaises(Exception):
            build_coordinates(self.gene, trans)

    def testNegativeNumbers(self):
        """``build_coordinates`` must raise for a negative start and/or stop.

        ``assertRaises`` is used so that a missing exception really fails the
        test instead of being swallowed by a bare ``except``.
        """
        # Negative start only.
        trans = {"start": -200, "stop": 400, "name": "off1end"}
        with self.assertRaises(Exception):
            build_coordinates(self.gene, trans)

        # Negative stop only.
        trans = {"start": 200, "stop": -400, "name": "off1end"}
        with self.assertRaises(Exception):
            build_coordinates(self.gene, trans)

        # Both coordinates negative.
        trans = {"start": -200, "stop": -100, "name": "off1end"}
        with self.assertRaises(Exception):
            build_coordinates(self.gene, trans)
# Run every TestCase defined in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()