-
Notifications
You must be signed in to change notification settings - Fork 0
/
Blast2GTF.py
executable file
·192 lines (168 loc) · 7.62 KB
/
Blast2GTF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#!/usr/local/bin/python
"""This is being rewritten to make a single cds feature the length of the top blast hit.
Chimeric transcripts are going to be a big problem for this approach, but I can check for
them independently (by comparing cds and exon length) and break them up if necessary before
doing the expression analysis
I am going to modify this to parse start, end and strand info from the fasta header of aa
sequences from OrfPredictor (http://proteomics.ysu.edu/tools/OrfPredictor.html)
The new plan (10 Oct 2013) is to record top non-overlapping blast hits as "blast_hit" features
and the surrounding ORFs as CDS features. Exon info can be merged from cufflinks later.
If there are multiple hits to the same target sequence, a warning will be issued.
If there are multiple hits to different targets and they don't overlap, all will be written
Now (15 Oct) I'm going to refactor this to work with a SQLite database of blast results.
Now (22 Oct) I'm going to refactor it to work with MySQL (all of the data is now in a MySQL db
and it's actually indexed so the lookups are fast).
This is also going to involve a bit more of a rewrite because the sequences are in the db
"""
import sys, warnings , logging
import getopt
import csv
import MySQLdb as mdb
from os import path
import cPickle as pickle
from Heathpy import flatten_GTF, get_orf_coords, warning_on_one_line, make_db
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
def _parse_args(argv):
    """Parse the command line and return (species, gtf_filename, seqfilename).

    Prints the usage string and exits with status 2 when no options are given
    or an option is not recognised; prints it and exits normally on -h.
    Options that are not supplied come back as empty strings.
    """
    usage = 'Blast2GTF.py -n <species_name> -g <gtf_outfile> -s <seq_file>'
    try:
        opts, _args = getopt.getopt(argv, "hn:g:s:", ["name=", "gtf=", "seqfile="])
        if not opts:
            raise getopt.GetoptError('no opts')
    except getopt.GetoptError:
        # print(x) with a single argument behaves the same on Python 2 and 3
        print(usage)
        sys.exit(2)
    species = ''
    gtf_filename = ''
    seqfilename = ''
    for opt, arg in opts:
        if opt == "-h":
            print(usage)
            sys.exit()
        elif opt in ("-n", "--name"):
            species = arg
        elif opt in ("-s", "--seqfile"):
            seqfilename = arg
        elif opt in ("-g", "--gtf"):
            gtf_filename = arg
    return species, gtf_filename, seqfilename


def _collect_hits(blast_rows):
    """Merge BLAST rows into one 'blast_hit' feature per subject accession.

    Returns (hit_list, hit_order): hit_list maps subject accession to its
    feature dict, hit_order lists the accessions in first-seen order. Later
    rows for an accession already seen only widen that feature's start/end;
    its strand and score stay those of the first row. Exits the program if a
    row's strand is neither '1' nor '0'.
    """
    hit_list = {}
    hit_order = []
    # The unpacking spells out the column order the BLAST table is expected
    # to return for SELECT *; most columns are not used here.
    for (_row_id, qseqid, qlen, sacc, slen, pident, length, mismatch, gapopen,
         qstart, qend, qframe, sstart, send, sframe, evalue, bitscore, strand,
         hitnum) in blast_rows:
        feature = {
            'source': '1kp',
            'feature': 'blast_hit',
            'frame': '.',
            'seqname': qseqid,
            'score': float(bitscore),
            'start': int(qstart),
            'end': int(qend),
        }
        # Translate the stored strand flag into the GTF strand symbol
        if strand == '1':
            feature['strand'] = '+'
        elif strand == '0':
            feature['strand'] = '-'
        else:
            sys.exit("Strand %s not recognized" % strand)
        # In cases where multiple hits to the same subject, combine by maximizing coordinates
        if sacc in hit_list:
            merged = hit_list[sacc]
            if merged['start'] > feature['start']:
                merged['start'] = feature['start']
            if merged['end'] < feature['end']:
                merged['end'] = feature['end']
        else:
            hit_order.append(sacc)
            hit_list[sacc] = feature
    return hit_list, hit_order


def _unique_name(name, used_names):
    """Return *name*, renumbering its last '_' field until it is unused.

    The returned name is added to *used_names* so later calls avoid it.
    """
    name_num = 1
    while name in used_names:
        name = "_".join(name.split("_")[0:-1] + [str(name_num)])
        name_num += 1
    used_names.add(name)
    return name


def _hit_to_cds(feature, seq, seq_record):
    """Convert a merged blast-hit feature, in place, into a CDS feature.

    The start/end are replaced by the coordinates get_orf_coords reports for
    the hit range (presumably the enclosing ORF, 1-based inclusive -- confirm
    in Heathpy), and a 'note' is added when orf_integrity flags the ORF.
    """
    feature['feature'] = 'CDS'
    feature['score'] = '.'
    seq_len = len(seq_record)
    if feature['strand'] == '+':
        feature['start'], feature['end'] = get_orf_coords(
            seq_record, feature['start'], feature['end'])
        # NOTE(review): this writes 1-3; the GTF frame column is usually 0-2 -- confirm
        feature['frame'] = feature['start'] % 3 + 1
        orf = Seq(seq[feature['start'] - 1:feature['end']])
    else:
        # Work on the reverse complement, mirroring this hit's own coordinates.
        # (The merged hit's start/end are used here rather than the values left
        # over from the last BLAST row read, which belong to a different hit
        # whenever a query has more than one.)
        orf_start, orf_end = get_orf_coords(
            seq_record.reverse_complement(),
            seq_len - feature['end'] + 1,
            seq_len - feature['start'] + 1)
        # Mirror the ORF back onto forward-strand coordinates
        feature['start'], feature['end'] = (seq_len - orf_end + 1,
                                            seq_len - orf_start + 1)
        feature['frame'] = (seq_len - feature['end'] + 1) % 3 + 1
        orf = Seq(seq[feature['start'] - 1:feature['end']]).reverse_complement()
    note = orf_integrity(orf)
    if note:
        feature['note'] = note


def main(argv):
    """Write CDS features (GTF) and/or sequences (FASTA) for one species.

    argv is the command line without the program name:
        -n/--name     species name used to select sequences from the database
        -g/--gtf      GTF output file; when given, BLAST hits are written as CDS rows
        -s/--seqfile  FASTA output file; when given, every sequence is written

    For each sequence, hits are merged per subject accession, and a hit is
    written only if it does not overlap any hit that came before it.
    Sequences that cannot be wrapped in a Seq object are skipped with a warning.
    """
    species, gtf_filename, seqfilename = _parse_args(argv)
    # NOTE(review): connection details are hard-coded (local root, empty password)
    con = mdb.connect('localhost', 'root', '', 'Selaginella')
    outfile = None
    seqfile = None
    try:
        with con:
            # names already handed out, to ensure that there are no duplicates
            used_names = set()
            # read dictionary of cluster membership (it would be far better to
            # put this info into the db, but this is working)
            cluster_info = path.join(path.expanduser("~"), "Bioinformatics",
                                     "Selaginella", "RefSeq", "SeqClusters.p")
            # NOTE(review): unpickling executes arbitrary code; only load a trusted file
            with open(cluster_info, "rb") as cluster_file:
                seq_groups = pickle.load(cluster_file)
            if gtf_filename:
                outfile = open(gtf_filename, 'wb')
                gtf_writer = csv.writer(outfile, delimiter='\t', quotechar='',
                                        quoting=csv.QUOTE_NONE)
            if seqfilename:
                seqfile = open(seqfilename, 'wb')
            cur = con.cursor()
            # Parameters must be a sequence: (species,) is a 1-tuple, (species) is just a string
            cur.execute("SELECT a.seqid, b.sequence FROM Species a, Ortholog_groups b WHERE a.seqid = b.seqid AND a.species= %s", (species,))
            for (seqid, seq) in cur.fetchall():
                try:
                    seq_record = SeqRecord(Seq(seq))
                except TypeError:
                    warnings.warn("Can't create seq object from %s for %s" % (seq, seqid))
                    continue
                if gtf_filename:  # Write blast info to GTF file
                    cur.execute("SELECT * FROM BLAST WHERE qseqid=%s", (seqid,))
                    hit_list, hit_order = _collect_hits(cur.fetchall())
                    # Scan through hits in order and write CDS features for non-overlapping ones
                    for position, sacc in enumerate(hit_order):
                        feature = hit_list[sacc]
                        # Earlier hits that were written already carry their
                        # widened ORF coordinates, so the comparison is against those.
                        if any(hit_overlap(feature, hit_list[earlier])
                               for earlier in hit_order[:position]):
                            continue
                        if sacc in seq_groups:
                            query_name = feature['seqname']
                            name = "%s_c%s_0" % (query_name[0:query_name.find('comp')],
                                                 seq_groups[sacc].split("_")[1])
                        else:
                            name = sacc + "_0"
                        feature['gene_id'] = _unique_name(name, used_names)
                        feature['transcript_id'] = feature['gene_id'] + '.1'
                        _hit_to_cds(feature, seq, seq_record)
                        gtf_writer.writerow(flatten_GTF(feature))
                if seqfilename:
                    seqfile.write(">%s\n%s\n" % (seqid, seq))
    finally:
        # Close the output files even when the run is aborted part-way
        for handle in (outfile, seqfile):
            if handle is not None:
                handle.close()
def hit_overlap(hit1, hit2):
    """Return 1 if the ranges of two hits intersect, otherwise 0.

    Each hit is a mapping with 'start' and 'end' entries. The comparisons are
    inclusive, so ranges that merely share an end point count as overlapping.
    """
    first_start, first_end = hit1['start'], hit1['end']
    second_start, second_end = hit2['start'], hit2['end']
    # hit1 starts within hit2
    starts_inside = second_start <= first_start <= second_end
    # hit1 ends within hit2
    ends_inside = second_start <= first_end <= second_end
    # hit1 starts before hit2 and ends after it
    spans_other = first_start <= second_start and first_end >= second_end
    return 1 if (starts_inside or ends_inside or spans_other) else 0
def orf_integrity(seq):
    """Describe how complete the open reading frame in *seq* is.

    *seq* is a nucleotide sequence object providing translate() (a Biopython
    Seq in this script). Returns one of:
        ''                     translation starts with M and ends with a stop
        "Frameshift"           length is not a multiple of three
        "Pseudogene"           a stop codon occurs before the final codon
        "N-terminal fragment"  starts with M but has no final stop
        "C-terminal fragment"  has a final stop but does not start with M
        "Internal fragment"    has neither (also the result for an empty sequence)
    """
    if len(seq) % 3 != 0:
        return "Frameshift"
    protein = str(seq.translate())
    # Look for premature stops in the translation: '*' never occurs in the
    # nucleotide sequence itself, so searching that could not find one.
    if "*" in protein[:-1]:
        return "Pseudogene"
    # startswith/endswith are safe on an empty translation, unlike indexing
    has_start = protein.startswith('M')
    has_stop = protein.endswith('*')
    if has_start and has_stop:
        return ''
    if has_start:
        return "N-terminal fragment"
    if has_stop:
        return "C-terminal fragment"
    return "Internal fragment"
if __name__ == "__main__":
    # Run as a script: hand main() the arguments that follow the program name
    cli_args = sys.argv[1:]
    main(cli_args)