/
orf_finder.py
executable file
·173 lines (157 loc) · 6.12 KB
/
orf_finder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/python
''' This script helps to fix errors in protein annotation.
The hypothesis behind this script is that orthologous multidomain proteins already existed a long time ago.
If a protein lacks several domains that are present in its vertebrate ortholog,
the script searches for these protein domains in the adjacent genome region.
A fasta file with the genome region sequence, an optional coordinate range,
a minimal ORF length and a SMART domain name are used as input.
Currently the translated sequence is sent to SMART for domain prediction.
'''
# 1. find orfs in sequence
# 2. prepare big fasta file with proteins
# 3. run SMART prediction
# 4. parse results
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import os
import sys
from optparse import OptionParser
from StringIO import StringIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Blast.Applications import NcbitblastnCommandline
from Bio.Blast import NCBIXML
# Command-line interface: which SMART domain to look for, which fasta file
# holds the genomic sequence, which part of it to scan and how short a
# translated ORF may be.
parser = OptionParser()
parser.add_option("-d", "--domain", dest="domain", default="PH",
                  help="SMART domain name to search for")
parser.add_option("-f", "--genomic-file", dest="filename",
                  help="fasta file with genomic sequence")
parser.add_option("-r", "--range", dest="range", default="1...",
                  help="range in format 1..10000, full by default")
# type="int" makes optparse reject a non-numeric length with a usage message
# instead of letting a later int() call die with a traceback.
parser.add_option("-l", "--min-length", dest="min_pro_len", type="int",
                  default=50, help="minimal protein length in ORF")
(opts, args) = parser.parse_args()
# The genomic file has no default value; stop early with a usage error
# rather than failing later when the script tries to open None.
if not opts.filename:
    parser.error("a fasta file with the genomic sequence is required "
                 "(-f/--genomic-file)")
##########################################
# initial parameters
##########################################
# domain name we are searching for
domain_name = opts.domain
# file with genomic sequence in fasta format
genomic_filename = opts.filename
# range as a [start, end] pair of strings; the default "1..." splits into
# ["1", "."], and a "." end marker means "use the whole sequence"
seq_range = opts.range.split("..")
if len(seq_range) != 2:
    parser.error("range must look like START..END, e.g. 1..10000")
if seq_range[1] != ".":
    # Both bounds are converted with int() when the region is cut out, so
    # reject non-numeric bounds here with a usage message.
    try:
        int(seq_range[0])
        int(seq_range[1])
    except ValueError:
        parser.error("range bounds must be integers, e.g. 1..10000")
# minimal protein length
min_pro_len = int(opts.min_pro_len)
# filename to store translated proteins combined in one
protein_filename = "protein.fasta"
# directory to store SMART results
smart_results_dir = "../Results/SMART_results/"
# maximal length of the concatenated translation that is still submitted to
# SMART; longer translations make the script stop and ask for a narrower range
n = 30000
# E-value cut-off applied to the BLAST HSPs of a found domain
E_VALUE_THRESH = 10
def _new_smart_record():
    """Return an empty accumulator for one blank-line separated record."""
    return {"name": None, "start": None, "end": None,
            "is_domain": False, "type_ok": False, "status_ok": False}


def _smart_record_to_feature(record):
    """Turn a finished record into a SeqFeature, or None if it is rejected."""
    if not (record["is_domain"] and record["type_ok"]
            and record["status_ok"]):
        return None
    # An incomplete record must not borrow coordinates or the name from the
    # record before it, so missing fields reject the record.
    if (record["name"] is None or record["start"] is None
            or record["end"] is None):
        return None
    if record["name"] == 'low_complexity_region':
        return None
    # NOTE(review): START is used as-is for the feature start; if SMART
    # reports 1-based positions the extracted region lacks its first
    # residue -- confirm against the SMART output format.
    feature = SeqFeature(FeatureLocation(record["start"], record["end"]),
                         type="Region")
    feature.qualifiers = {'region_name': [record["name"]]}
    return feature


def parse_smart_domains(file_name):
    """Parse a SMART result text file into a list of domain features.

    The file is read as records of ``KEY=VALUE`` lines separated by lines
    of at most one character. Only the keys ``DOMAIN``, ``START``, ``END``,
    ``TYPE`` and ``STATUS`` are interpreted. A record becomes a feature when:

    * its last non-empty line has exactly one ``=``,
    * it has a ``TYPE`` other than ``PFAM``,
    * it has a ``STATUS`` line,
    * it provides ``DOMAIN``, ``START`` and ``END``, and
    * its name is not ``low_complexity_region``.

    Unlike the previous implementation, the last record is also returned
    when the file does not end with a separator line, and a record with
    missing fields is skipped instead of reusing the previous record's
    values.

    Args:
        file_name: path of the SMART result file.

    Returns:
        A list of ``SeqFeature`` objects of type ``"Region"`` located at
        ``FeatureLocation(START, END)``, with the domain name stored in
        ``qualifiers['region_name']``.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if a START or END value is not an integer.
    """
    try:
        # Universal-newline mode, as the script always used on Python 2.
        in_handle = open(file_name, "rU")
    except ValueError:
        # Interpreters that no longer know the "U" flag already translate
        # newlines in plain text mode.
        in_handle = open(file_name, "r")

    domains = []
    record = _new_smart_record()
    with in_handle:
        for line in in_handle:
            line = line.rstrip()
            if len(line) > 1:
                pairs = line.split("=")
                # Only a line with exactly one "=" counts as a field; any
                # other non-empty line marks the record as "not a domain"
                # until the next field line.
                record["is_domain"] = len(pairs) == 2
                if not record["is_domain"]:
                    continue
                key, value = pairs
                if key == "DOMAIN":
                    record["name"] = value
                elif key == "START":
                    record["start"] = int(value)
                elif key == "END":
                    record["end"] = int(value)
                elif key == "TYPE":
                    record["type_ok"] = value != "PFAM"
                elif key == "STATUS":
                    # NOTE(review): every STATUS value is accepted, exactly
                    # as before ("visible|OK" and anything else both set
                    # the flag); the status filter looks deliberately
                    # disabled.
                    record["status_ok"] = True
            else:
                # Separator line: close the current record, start a new one.
                feature = _smart_record_to_feature(record)
                if feature is not None:
                    domains.append(feature)
                record = _new_smart_record()

    # A file without a trailing separator line still holds a pending record.
    feature = _smart_record_to_feature(record)
    if feature is not None:
        domains.append(feature)
    return domains
# ------------------------------------------------------------------
# 1. read the genomic record and cut out the requested region
# ------------------------------------------------------------------
with open(genomic_filename, "rU") as in_handle:
    record = SeqIO.read(in_handle, "fasta")

# A "." end marker is what the default range "1..." splits into and selects
# the whole sequence.
# NOTE(review): for a 1-based inclusive range such as "1..10000" this slice
# starts one base late and ends one base late; confirm the intended
# coordinate convention before changing it.
if seq_range[1] != ".":
    seq = record.seq[int(seq_range[0]):int(seq_range[1])+1]
else:
    seq = record.seq

# ------------------------------------------------------------------
# 2. translate all six reading frames and join every sufficiently long
#    stop-to-stop stretch into one artificial protein
# ------------------------------------------------------------------
# genetic code table id handed to translate()
table = 1
protein_obj = SeqRecord("")
protein_obj.id = "proteinfull"
protein_obj.description = "translated_from_" + record.id
print("finding domains of minimal length "+str(min_pro_len)+"...")
print("==============================")
for nuc in (seq, seq.reverse_complement()):
    for frame in range(3):
        for pro in nuc[frame:].translate(table).split("*"):
            if len(pro) >= min_pro_len:
                protein_obj.seq += pro

# Both failure cases leave with a non-zero status so that a calling shell
# or pipeline can tell them apart from a successful run.
if len(protein_obj.seq) == 0:
    print("No ORFs found in sequence or you have problems with it")
    sys.exit(1)
if len(protein_obj.seq) > n:
    print("Translation of found ORFs in selected genomic region is larger than\t%s" % n)
    print("Please, narrow it")
    sys.exit(1)

with open(protein_filename, "w") as out_handle:
    SeqIO.write(protein_obj, out_handle, "fasta")

# ------------------------------------------------------------------
# 3. run the SMART prediction on the artificial protein
# ------------------------------------------------------------------
smart_results_file = smart_results_dir + protein_obj.id + "_SMART_results.txt"
# Remove a result left over from an earlier run so that it cannot be
# mistaken for the outcome of this one; a missing file is fine.
try:
    os.remove(smart_results_file)
except OSError:
    pass
print("submitting to SMART...")
print("==============================")
os.system("perl SMART_batch.pl --outputDirectory " + smart_results_dir + " --inputFile " + protein_filename)
# The old result was deleted above, so a missing file means the SMART run
# produced nothing; say so instead of failing while opening the file.
if not os.path.exists(smart_results_file):
    print("SMART did not produce " + smart_results_file)
    sys.exit(1)

# ------------------------------------------------------------------
# 4. parse the SMART results and align every requested domain back to the
#    genomic sequence
# ------------------------------------------------------------------
print("Aligning SMART domains to initial genomic sequence...")
print("==============================")
domains = parse_smart_domains(smart_results_file)
found = False
for d in domains:
    if d.qualifiers['region_name'][0] != domain_name:
        continue
    found = True
    domain_seq = d.extract(protein_obj.seq)
    # same text as the former `print domain_name, "\t", ...` statement
    print("%s \t%s" % (domain_name, domain_seq))
    with open("domain.fasta", "w") as out_handle:
        SeqIO.write(SeqRecord(domain_seq), out_handle, "fasta")
    # blast the domain against the genomic fasta file (XML output, outfmt=5)
    output = NcbitblastnCommandline(query="domain.fasta", subject=genomic_filename, outfmt=5)()[0]
    # parse output
    blast_result_record = NCBIXML.read(StringIO(output))
    if blast_result_record.alignments:
        # only the first alignment is reported
        alignment = blast_result_record.alignments[0]
        for hsp in alignment.hsps:
            # filter garbage
            if hsp.expect < E_VALUE_THRESH:
                print(hsp)
if not found:
    print("No domains found. Try again:)")