/
fasta_parser.py
executable file
·208 lines (182 loc) · 5.91 KB
/
fasta_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#!/usr/bin/python
#filename: pdbminer.py
#####################################################################
# Search the protein sequence fasta file
# This code requires working biopython installed
# written by Bilal Nizami
#
# UKZN, Durban
# 2016
#
#
#####################################################################
#
# import functions
from __future__ import print_function
from optparse import OptionParser
from Bio import SeqIO
import sys
import re
from Bio.PDB import *
## Welcome message
print ('\n\tPDBMINER\n', 'written by Bilal Nizami\n', 'UKZN, Durban 2016\n\n\n', end="")
#==============================================================================
# Setting the options
#==============================================================================
parser = OptionParser("Usage: seqsearch.py -i --fasta fasta file name ")
parser.add_option("-i", "--fasta", type='string', dest="fasta",
help="A fasta file having amino acid sequence")
parser.add_option("-o", "--out", type='string', dest="output",
help="output text file")
parser.add_option("-p", "--path", type='string', dest="path",
help="string for path to the pdb files, it will be used for accesing the pdb file of parsed \
pdbid")
(options, args) = parser.parse_args()
#====================================================================
# if no arguments are passed
if options.fasta is None: # where fasta is obviously your required option
parser.print_help()
sys.exit(1)
#===========================================================================
#
# function to get the pdbid from fasta file
#===========================================================================
def fasta_get_pdbid(filename):
"fasta_get_pdbid() parse the pdbid from single fasta file having multiple sequences"
fastaf = []
pdbid = []
fp = open(filename, 'r')
fastaf = fp.read()
fastaf = fastaf.split('>')
fastaf = fastaf[1:len(fastaf)] ## knock off 0th index from list, this is done becuase preveious line of split insert a null at index 0
for line in fastaf:
line = line[0:4]
pdbid.append(line)
fp.close()
#print (pdbid)
return pdbid;
#===========================================================================
# read the single fasta file containing the multiple sequences. #
pdb = []
fastafile = ''
seq = ''
aacid = []
A = R = N = D = B = C = E = Q = Z = G = H = I = L = K = M = F = P = S = T = W = Y = V = 0
#aminoacid = ['A', 'R', 'N', 'D', 'B', 'C', 'E', 'Q', 'Z', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
pdb_id = []
pdb_id_chain = []
pdbid = []
pdbchain = []
count = []
counter=0
sf = open(options.fasta, "rU")
pdbid = fasta_get_pdbid(options.fasta)
#print (pdbid)
#for pdb in pdbid:
for p in SeqIO.parse(sf, "fasta"): ## takes a file handle and format name, and returns a SeqRecord iterator using biopython SeqIO
#print (p.id)
#pdb_id_chain = re.findall("\d.{3}:[A-Z]", p.id)
#print (str(pdb_id_chain))
#pdb_id = re.findall("\d.{3}", str(pdb_id_chain))
#print (re.findall("\d.{3}", pdb_id[0]), end='')
#print ('PDBID"\t', pdb_id[0], end='')
#print ('PDBID and chain:\t', pdb_id_chain)
#pdbchain.append(pdb_id_chain)
#pdbid.append(pdb_id[0])
aacid = list(p.seq)
for aa in aacid:
if aa == 'A':
A=A+1
elif aa == 'R':
R=R+1
elif aa == 'N':
N=N+1
elif aa == 'D':
D=D+1
elif aa == 'B':
B=B+1
elif aa == 'C':
C=C+1
elif aa == 'E':
E=E+1
elif aa == 'Q':
Q=Q+1
elif aa == 'Z':
Z=Z+1
elif aa == 'G':
G=G+1
elif aa == 'H':
H=H+1
elif aa == 'I':
I=I+1
elif aa == 'L':
L=L+1
elif aa == 'K':
K=K+1
elif aa == 'M':
M=M+1
elif aa == 'F':
F=F+1
elif aa == 'P':
P=P+1
elif aa == 'S':
S=S+1
elif aa == 'T':
T=T+1
elif aa == 'W':
W=W+1
elif aa == 'V':
V=V+1
elif aa == 'Y':
Y=Y+1
counter=counter+1
#print ('. ', end="")
print ('total', counter, 'processed')
#print ('PDBID "\t', pdbid, pdbchain, end='')
sf.close()
#=============================================================================
# open Summary file and print into it
sf = open(options.output, 'w')
print('Amino Acid,', 'Count,', file=sf)
print('A\t', A, '\nR\t', R, '\nN\t', N, '\nD\t', D, '\nB\t', B, '\nC\t', C, '\nE\t', E, '\nQ\t', Q, '\nZ\t', Z, '\nG\t', G, '\nH\t', H, '\nI\t', I, '\nL\t', L, '\nK\t', K, '\nM\t', M, '\nF\t', F, '\nP\t', P, '\nS\t', S, '\nT\t', T, '\nW\t', W, '\nY\t', Y, '\nV\t', V, file=sf)
sf.close()
#===========================================================================================================
#
# sort and store only unique pdb ids, there might be redundent parsed pdbids due to seperate entry in input
# fasta file for multiple chains of a pdbid. it uses built in set() method
#
pdbid = list(set(pdbid)) # remove duplicates pdbids
#print (pdbid)
## opening file for dumping resolutions
sf = open('res.txt', 'w')
if options.path != None:
print ('Resolution', file=sf)
resolution = ''
pdb_path = []
parser = PDBParser()
for pdb in pdbid:
try:
pdb_dir = pdb[1:3]
pdb_dir = pdb_dir.lower()
pdb = pdb.lower() ## lower case, becuase last character fo pdbid from fasta file is in uppercase
#print (pdb)
pdb_path1 = str(options.path) + pdb_dir + '/pdb' + str(pdb) + '.ent' ## full path to the pdb.ent file
pdb_path.append(pdb_path1)
structure = parser.get_structure('PHA-L', pdb_path1)
resolution = str(structure.header['resolution'])
print ('for ', pdb)
print (resolution, file=sf)
#print (pdb_path)
except IOError:
print ('Can not find', pdb_path1, ' file')
#for pdb in pdb_path:
# try:
# #print (pdb)
# structure = parser.get_structure('PHA-L', pdb)
# resolution = str(structure.header['resolution'])
# print ('for ', pdb)
# print (resolution, file=sf)
# #print (resolution)
# except IOError:
# print ('Can not find', pdb ' file')
sf.close()