-
Notifications
You must be signed in to change notification settings - Fork 0
/
Varying_guide_lengths_alignments.py
266 lines (204 loc) · 7.77 KB
/
Varying_guide_lengths_alignments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
#Oct 31 2016
#this program just creates alignment files
import os
import sys
import argparse
import tables
import string
import numpy as np
import cProfile
dna_comp = None
import fileinput
def comp(seq_str):
"""complements the provided DNA sequence and returns it"""
global dna_comp
if dna_comp is None:
dna_comp = string.maketrans("ATCGMRWSYKNatcgmrwsykn",
"TAGCKYWSRMNtagckywsrmn")
return seq_str.translate(dna_comp)
def revcomp(seq_str):
"""returns reverse complement of provided DNA sequence"""
return comp(seq_str)[::-1]
def str_from_nparray(vals):
"""converts a numpy array into a sequence string"""
return "".join(chr(x) for x in vals)
def get_seq(chrom,start,end,strand,genome_file):
if chrom not in genome_file.root:
known_chrom = [node.name for node in genome_file.root]
raise ValueError("unknown chromosome %s, possible chromosomes are: "
+ ",".join(known_chrom))
# numpy array representation of sequence and convert to string
chrom_node = genome_file.getNode("/%s" %chrom)
np_seq = chrom_node[start-1:end]
seq_str = str_from_nparray(np_seq)
if strand == "-":
# reverse-complement sequence
seq_str = revcomp(seq_str)
return seq_str
def finding_guides(seq, inStart, inEnd,gRNA_length):
guideRna = []
#has list with RNAs with proper direction
guideRnaOutput = []
location = []
direction = []
i = 0
while i < (len(seq)-1):
if seq[i] == "G":
#print "HI"
if seq[i+1] == "G":
#print i
#only if we are at least 23 bases in can we store a guide RNA, so check and store in list
if i >= gRNA_length+1:
if seq[i-(gRNA_length+1)] == "G":
guideRna.append(seq[i-(gRNA_length+1):i-1])
guideRnaOutput.append(seq[i-(gRNA_length+1):i-1])
#guideRnaPam.append(seq[i-21:i-1] + " " + seq[i-1:i+2])
start = (i - (gRNA_length+1)) + inStart
end = (i - 1) + inStart
location.append(str(start) + '-' + str(end))
direction.append('fwd')
#increment counter
i = i + 1
## add loop to look for CCN start once found take reverse complement
w = 0
forward = ""
while w < (len(seq)-1):
if seq[w] == "C":
if seq[w+1] == "C":
#print w
#print len(seq)
#only if we are at least 23 bases in can we store a guide RNA, so check and store in list
if (w+(gRNA_length+3)) <= (len(seq)):
#add check here for N
forward = seq[w+3: w+(gRNA_length+3)]
#print forward
CODE={'A':'T','T':'A','C':'G','G':'C','N':'N'}
minus_seq=''
for c in forward:
minus_seq=minus_seq+CODE[c]
reverse_seq=minus_seq[::-1]
#print reverse_seq
if reverse_seq[:1] == "G":
guideRnaOutput.append(reverse_seq)
guideRna.append(forward)
#guideRnaPam.append(seq[w:w+3] + " " + seq[w+3:w+23])
start = w+3+inStart
end = w+(gRNA_length+3)+inStart
location.append(str(start) + '-' + str(end))
direction.append('rev')
#increment counter
w = w + 1
#print guideRna
#print guideRnaPam
#print guideRnaOutput
#print location
#print direction
b = 0
while b < len(guideRna):
if "N" in guideRna[b]:
del guideRna[b]
del guideRnaOutput[b]
del location[b]
del direction[b]
b = b + 1
else:
b = b +1
#write to file
print len(guideRna)
return guideRna, guideRnaOutput, location, direction
def main():
if len(sys.argv) < 2:
sys.stderr.write("usage: %s <chrom> [<start> <end>]\n" % sys.argv[0])
exit(2)
inChrom = sys.argv[1]
if len(sys.argv) > 2:
inStart = int(sys.argv[2])
inEnd = int(sys.argv[3])
else:
start = ""
end = ""
sys.stderr.write("%s %d %d\n" % (inChrom, inStart, inEnd))
inStrand = "+"
#rename depending on input gene
fastainput = open("Input_GATA3.fasta", 'w')
#change path to hdf5 file you want to use
seq_h5 = tables.openFile("/iblm/netapp/data1/external/GRC37/GRC37.h5", "r")
inputRegion = get_seq(inChrom, inStart, inEnd, inStrand, seq_h5)
#print inputRegion
fastainput.write(">" + inChrom + '\n')
fastainput.write(inputRegion)
fastainput.close()
#reopen the fasta input file from above
f = open("Input_GATA3.fasta", "r")
#declare chr start and end for region that you want to extract guides from
file = f.readlines()
#Sguide = []
#guideList = []
#create output file
#rename depending on gene of interest
outfile = open('GuideRNAs_GATA3','w')
outfile2 = open('FQGuides_GATA3.fq' , 'w')
#can change seq and header to lists if there are more than one header in fasta file with several sequences
#declare empty lists
sequence = []
header = []
seq = ""
header = ""
GC = []
#store all the headers in a list
for f in file:
if f.startswith('>'):
header = header + f
header = header.splitlines()[0]
#get ride of new line charaters and spaces
else:
f = f.replace(" ", "")
f = f.replace("\n", "")
seq = seq + f
#i = 0
#make it all upper case, easier to parse
seq = seq.upper()
#print guideRna
#call function to find all guide RNAs
gRNA_length = 20
guideRna1, guideRnaOutput1, location1, direction1 = finding_guides(seq,inStart, inEnd,gRNA_length)
gRNA_length = 19
guideRna2,guideRnaOutput2, location2, direction2 = finding_guides(seq,inStart, inEnd,gRNA_length)
gRNA_length = 21
guideRna3, guideRnaOutput3, location3, direction3 = finding_guides(seq,inStart, inEnd,gRNA_length)
guideRna = guideRna1 + guideRna2 + guideRna3
guideRnaOutput = guideRnaOutput1 + guideRnaOutput2 + guideRnaOutput3
location = location1 + location2 + location3
direction = direction1 + direction2 + direction3
print guideRna
print guideRnaOutput
print len(guideRna)
#create .fq file used for alignment
j = 0
outfile.write(header)
while j<len(guideRna):
outfile.write(guideRna[j] + '\n')
outfile2.write('@' + str(j) + '\n')
outfile2.write(guideRna[j] + '\n')
outfile2.write('+' + '\n')
score = "I"*len(guideRna[j])
outfile2.write(score + '\n')
j = j+1
#if we don't close and reopen FQGuides.fq we cannot excute the commands since FQGuides.fq was only a write file
outfile2.close()
#reopen the .fq file created above
open("FQGuides_GATA3.fq", "r")
print "We created a .fq file with all the guides"
#call bwa aligner from this code and get information from the sam file
#bottleneck here
#change input reference genome and output file name here
cmd = 'bwa aln -o 0 -n 3 -N -t 5 hg37.fa FQGuides_GATA3.fq > aln_GATA3.sai'
os.system(cmd)
print "Alignment Step has finished"
#change input reference genome and output file name here
cmd2 = 'bwa samse -n 1000 hg37.fa aln_GATA3.sai FQGuides_GATA3.fq > aln_GATA3.sam'
os.system(cmd2)
print "Sam file has been created"
import re
seq_h5.close()
main()