forked from humberto-ortiz/dmel-ercc-diff
/
Tophat_Birnavirus_Confirmer_2.0.py
216 lines (165 loc) · 7.82 KB
/
Tophat_Birnavirus_Confirmer_2.0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/home/ivan/.conda/envs/my_root/bin/python
"""
#### Programmer: Ivan Jimenez
#### Universidad de Puerto Rico, Recinto de Rio Piedras
#### BIOL-4990 (Investigacion)
This program verifies that the total count of alignments obtained by running the TopHat program is accurate.
A single FASTA file containing 19 genes was compared to the Drosophila Birnavirus and Xvirus genomes. The Screed
module for Python was used to simplify the comparison (using separate databases for the genome and scytrim'ed
FASTA file). All viral sequence records that were found are displayed on screen after running this script.
This is a work in progress... Please be patient. The documentation for this script is still sporadic.
BEFORE RE-RUNNING THIS SCRIPT, MAKE SURE YOU DELETE ALL FILES THAT MEET THE FOLLOWING CRITERIA:
- files ENDING IN _screed ......(i.e. <filename>_screed)
- viral_sequences.csv ..........it will not be overwritten or deleted if NO viral sequences are found, delete it to avoid confusion :)
"""
import time as t
import os, os.path
import glob
import platform
global array
import screed
import sys
from screed import ScreedDB
import string
#corriendo = "N"
# Directory that holds the input FASTA files used by this script.
bioinfo_path = "/Users/ivanjimenez/Desktop/CLASES/INTERNSHIPS/BIOINFO INTERNSHIP FILES/RESULTS/newresults/"
viralgenome_path = bioinfo_path + "copy_birna_x_virus.fa"
# Index the viral genome FASTA with screed, then open the resulting
# "<fasta>_screed" database.  The path is derived from viralgenome_path once
# instead of repeating the full literal, so the two can never drift apart.
screed.read_fasta_sequences(viralgenome_path)
birna_x_virusdb = ScreedDB(viralgenome_path + "_screed")
# Setting the number of mismatches that are allowed...
k = 6
def getpath():
    """Return this script's directory with its separators doubled.

    On Windows every backslash in the directory path is replaced by two
    backslashes and two more are appended.  On any other platform the path
    is split on '//' and re-joined with '////', and '////' is appended.

    NOTE(review): a normalized absolute POSIX path rarely contains '//', so
    the non-Windows branch usually just appends '////' -- confirm whether
    splitting on a single '/' was intended.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    if platform.system() == 'Windows':
        separator, doubled = '\\', '\\\\'
    else:
        separator, doubled = '//', '////'
    return doubled.join(script_dir.split(separator)) + doubled
####
# NOT USING THE GETPATH() FUNCTION YET:
####
#Function to find the complement of a strand of DNA
def complementary_strand(strand):
    """Return the base-by-base complement of a DNA strand.

    T<->A and G<->C are swapped; lowercase input bases are mapped to the
    UPPERCASE complement.  Any other character (e.g. 'N') is left unchanged.
    The strand is NOT reversed here; callers reverse it themselves.

    Args:
        strand: DNA sequence as a str.

    Returns:
        A str of the same length holding the complemented bases.
    """
    # string.maketrans only exists on Python 2; Python 3 moved it to
    # str.maketrans.  Pick whichever is available so the function runs on both.
    make_table = getattr(string, 'maketrans', None) or str.maketrans
    return strand.translate(make_table('TAGCtagc', 'ATCGATCG'))
#Function to count the number of mismatches between two sequences, IF accepted MISMATCH_COUNT is exceeded, stops counting mismatches and returns current value
def mismatch(read, virus_seq, max_mismatches=None):
    """Count the positions at which two equal-length sequences differ.

    Counting stops as soon as the count exceeds the allowed number of
    mismatches, so the result is the exact count when it is within the limit
    and ``limit + 1`` otherwise.

    Args:
        read: sequence being tested; converted with str() before comparing.
        virus_seq: reference window of the same length; converted with str().
        max_mismatches: number of mismatches tolerated before giving up.
            Defaults to the module-level ``k``.

    Returns:
        int: the number of mismatching positions counted before stopping.

    Raises:
        ValueError: if the two sequences do not have the same length.
    """
    # Convert once up front rather than once per compared character.
    read = str(read)
    virus_seq = str(virus_seq)
    if len(read) != len(virus_seq):
        # Previously this branch crashed with a TypeError (str + int) and
        # would otherwise have returned None; fail loudly and clearly.
        raise ValueError(
            "Tried to compare two sequences that were not the same length: "
            "length of read = " + str(len(read)) +
            ", length of virus = " + str(len(virus_seq)))
    if max_mismatches is None:
        max_mismatches = k
    mismatch_count = 0
    for read_base, virus_base in zip(read, virus_seq):
        if read_base != virus_base:
            mismatch_count += 1
            # Check right after incrementing so the early exit also fires
            # when every remaining position is a mismatch.
            if mismatch_count > max_mismatches:
                break
    return mismatch_count
# EXAMPLE:
# CGTAGCGATAGAGAGAGAAGGGACT
# CTGAG
# Collect every record of the viral database into a list, replacing each
# record's lazily-loaded 'sequence' attribute with a plain str.
birnaxrecords = []
for viral_record in birna_x_virusdb.itervalues():
    viral_record['sequence'] = str(viral_record['sequence'])
    birnaxrecords.append(viral_record)
# Show everything that was loaded (printed once, after the loop).
print(birnaxrecords)
#typing: $ records
# PRODUCES: [{'description': 'Drosophila melanogaster birnavirus SW-2009a strain DBV segment A, complete sequence', 'id': 0, 'name': 'gi|262225302|gb|GQ342962.1|', 'sequence': <_screed_attr 'sequence'>}, {'description': 'Drosophila melanogaster birnavirus SW-2009a strain DBV segment B, complete sequence', 'id': 1, 'name': 'gi|262225305|gb|GQ342963.1|', 'sequence': <_screed_attr 'sequence'>}]
#len(record['sequence'])
# 3014
#record['sequence']
# <_screed_attr 'sequence'>
print("\n\n\n")
print("\n\n\n")
# Look up the first four records of the viral database by key position.
# Only the announcement lines are printed; the records themselves are kept
# in segmentA / segmentB / segmentXA / segmentXB.
print("\nPrinting first record\n\t")
viral_keys = birna_x_virusdb.keys()
segmentA = birna_x_virusdb[viral_keys[0]]
print("\nPrinting second record...\n\t")
segmentB = birna_x_virusdb[viral_keys[1]]
print("\nPrinting third record\n\t")
segmentXA = birna_x_virusdb[viral_keys[2]]
print("\nPrinting fourth record...\n\t")
segmentXB = birna_x_virusdb[viral_keys[3]]
# Index the Trinity gene FASTA with screed, then open the resulting
# "<fasta>_screed" database.  The path is built once from bioinfo_path
# instead of repeating the full directory literal.
trinitygenes_path = bioinfo_path + "Trinity_de_genes.fa"
screed.read_fasta_sequences(trinitygenes_path)
alltrinitygenesdb = ScreedDB(trinitygenes_path + "_screed")
# Initializing the array to keep all records of viral sequences found
viral_sequences = []
# Keeping count of how many records have been checked...
iterator = 0
# Forward strands: the sequences of the first four loaded viral records.
# NOTE(review): the names assume records 0/1 are birnavirus segments A/B and
# records 2/3 are X virus segments A/B -- confirm against the FASTA order.
birnasegmentA, birnasegmentB, xvirussegmentA, xvirussegmentB = (
    birnaxrecords[position]['sequence'] for position in range(4)
)

# Each forward strand read backwards.
birnarevsegA, birnarevsegB, xvirusrevsegA, xvirusrevsegB = (
    forward[::-1]
    for forward in (birnasegmentA, birnasegmentB, xvirussegmentA, xvirussegmentB)
)

# Complement of each reversed strand, i.e. the reverse complement of the
# corresponding forward strand.
birnacomplementA, birnacomplementB, xviruscomplementA, xviruscomplementB = (
    complementary_strand(backward)
    for backward in (birnarevsegA, birnarevsegB, xvirusrevsegA, xvirusrevsegB)
)
#BEGINNING 4 nested FOR LOOPS
#grouping all known virus_segments (including complements) into an array...
virus_segments = (
    birnasegmentA, birnasegmentB, birnacomplementA, birnacomplementB,
    xvirussegmentA, xvirussegmentB, xviruscomplementA, xviruscomplementB,
)
# Loop through every record of the Trinity gene database
for trinitygene in alltrinitygenesdb.itervalues():
    # ISOLATE THE SEQUENCE IN THE RECORD
    trinitygeneseq = str(trinitygene['sequence'])
    gene_length = len(trinitygeneseq)
    # Explicit per-gene flag.  The previous code re-tested `mismatches` after
    # the inner loop, which was stale (left over from an earlier gene) or
    # undefined whenever the gene was longer than the segment and the inner
    # loop never ran.
    found = False
    # Loop through ALL 8 viral strands (4 segments plus their reverse complements)
    for birnax_segment in virus_segments:
        # Slide a gene-sized window over every start position in the segment;
        # the range is empty when the gene is longer than the segment.
        for i in range(len(birnax_segment) - gene_length + 1):
            mismatches = mismatch(trinitygeneseq, birnax_segment[i:i + gene_length])
            if mismatches <= k:
                print("FOUND A VIRAL SEQUENCE WHERE NONE SHOULD BE! Printing it now: ")
                print(trinitygeneseq)
                viral_sequences.append(trinitygene)
                found = True
                break
        if found:
            # One hit is enough; do not record the same gene twice.
            break
    # Progress line, printed once per record checked
    print("Checked record #" + str(int(iterator) + 1) + " from array.")
    iterator += 1
# print "\n\n\n"
# print "Printing the total (count) of all ScyTrim records\n"
# print str(iterator)
print("\n\n\n")
# Number of records that were flagged as viral.
print(len(viral_sequences))
# The report is only written when at least one viral sequence was found.
# Mode "w+" creates the file if it is missing and truncates it otherwise, so
# no empty viral_sequences.csv has to exist beforehand.  A stale file from an
# earlier run is left untouched when nothing is found.
if viral_sequences:
    # `with` guarantees the handle is flushed and closed (it was never closed before).
    with open(bioinfo_path + "viral_sequences.csv", "w+") as editingfile:
        editingfile.write(str(viral_sequences))