-
Notifications
You must be signed in to change notification settings - Fork 0
/
iden_div.py
185 lines (159 loc) · 6.18 KB
/
iden_div.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/python
# filename: iden_div.py
###########################################################################
#
# Copyright (c) 2013 Bryan Briney. All rights reserved.
#
# @version: 1.0.0
# @author: Bryan Briney
# @license: MIT (http://opensource.org/licenses/MIT)
#
###########################################################################
import os
import argparse
from pymongo import MongoClient
from Bio import pairwise2, SeqIO
from multiprocessing import Pool, cpu_count
import matplotlib as mpl
mpl.use('pdf')
import matplotlib.pyplot as plt
# Command-line interface. Parsing happens at import time; the resulting
# ``args`` namespace is read as a module-level global by the functions below.
# The text is passed as ``description=`` because the first positional
# parameter of ArgumentParser is ``prog`` (the program name shown in usage).
parser = argparse.ArgumentParser(description="For a MongoDB collection, plots the germline divergence against the sequence identity to a given 'subject' sequence.")
parser.add_argument('-d', '--database', dest='db', required=True, help="Name of the MongoDB database to query. Required")
parser.add_argument('-c', '--collection', dest='collection', default=None, help="Name of the MongoDB collection to query. If not provided, all collections in the given database will be processed iteratively.")
parser.add_argument('-o', '--output', dest='output', required=True, help="Output directory figure files. The figure file(s) will be 'output/<db>_<collection>_<standard>.pdf'. Required")
parser.add_argument('-i', '--ip', dest='ip', default='localhost', help="The IP address for the MongoDB server. Defaults to 'localhost'.")
# type=int: without it a port given on the command line arrives as a string,
# while the default is an integer; MongoClient is handed this value as-is.
parser.add_argument('-p', '--port', dest='port', default=27017, type=int, help="The port for the MongoDB server. Defaults to '27017'.")
parser.add_argument('-s', '--standard', dest='standard', required=True, help='Path to a file containing the standard sequence(s) for which identity/divergence will be calculated, in FASTA format. All sequences in the standard file will iteratively processed. Required')
parser.add_argument('-x', '--chain', dest='chain', default='heavy', choices=['heavy', 'kappa', 'lambda', 'light'], help="The chain type of the subject sequence. Options are 'heavy', 'kappa', 'lambda' and 'light'. Default is 'heavy'.")
parser.add_argument('-n', '--no_update', dest='no_update', action='store_true', default=False, help="Does not update the MongoDB with iden_div info. Can save some time if the idenentity calculations aren't needed again.")
args = parser.parse_args()
def get_standards():
    """Read the standard sequences from the FASTA file given by --standard.

    Returns:
        A list of ``[sequence id, sequence string]`` pairs, in file order.
    """
    # Context manager so the file handle is closed once parsing is finished;
    # the list is fully built before the ``with`` block exits.
    with open(args.standard, 'r') as handle:
        return [[record.id, str(record.seq)] for record in SeqIO.parse(handle, 'fasta')]
def get_collections():
    """Return the names of the MongoDB collections to process.

    Returns:
        A one-element list holding the --collection value when it was given;
        otherwise the sorted names of every collection in the database,
        with 'system.indexes' left out.
    """
    # An explicit collection wins; the database is only listed when none
    # was supplied (the original test was inverted).
    if args.collection:
        return [args.collection]
    conn = MongoClient(args.ip, args.port)
    db = conn[args.db]
    # Filter instead of list.remove(): remove() raises ValueError when the
    # database has no 'system.indexes' entry.
    return sorted(name for name in db.collection_names() if name != 'system.indexes')
def get_chain():
    """Translate the --chain option into the list of chain names to query.

    'light' expands to both light-chain types; any other value is returned
    as a single-element list.
    """
    return ['kappa', 'lambda'] if args.chain == 'light' else [args.chain]
def query(collection):
    """Fetch the comparison sequences of the selected chain type(s).

    Args:
        collection: name of the MongoDB collection to read.

    Returns:
        A list of ``[nt_identity.v, vdj_aa, seq_id]`` triples, one per
        matching document.
    """
    client = MongoClient(args.ip, args.port)
    sequences = client[args.db][collection]
    wanted_chains = get_chain()
    print_query_info()
    criteria = {'chain': {'$in': wanted_chains}}
    # Projection: only the three fields used below, without the Mongo _id.
    fields = {'_id': 0, 'seq_id': 1, 'nt_identity.v': 1, 'vdj_aa': 1}
    return [
        [doc['nt_identity']['v'], doc['vdj_aa'], doc['seq_id']]
        for doc in sequences.find(criteria, fields)
    ]
def update_db(standard, scores, collection):
    """Store each sequence's identity to ``standard`` in its MongoDB document.

    Args:
        standard: id of the standard sequence; lower-cased to form the key
            under the document's ``iden_div`` field.
        scores: iterable of ``[germline identity, identity score, seq_id]``
            entries; only the score and the seq_id are used here.
        collection: name of the MongoDB collection to update.
    """
    conn = MongoClient(args.ip, args.port, max_pool_size=1000)
    db = conn[args.db]
    coll = db[collection]
    print_update_info()
    # Dotted path so only this standard's entry is written. Setting the whole
    # 'iden_div' sub-document would discard the scores stored for any
    # previously processed standard.
    # NOTE(review): a standard id that itself contains '.' would be read as
    # a deeper path by MongoDB - confirm ids are dot-free.
    field = 'iden_div.{0}'.format(standard.lower())
    for score in scores:
        coll.find_and_modify(query={'seq_id': score[2]}, update={'$set': {field: float(score[1])}})
    print_done()
def identity(standard, seqs):
    """Score every sequence in ``seqs`` against one standard, in parallel.

    Results are not returned directly: each finished alignment is handed to
    log_result(), which appends it to the module-level ``scores`` list that
    this function resets on entry.

    Args:
        standard: ``[id, sequence]`` pair for the standard.
        seqs: entries as produced by query().

    Returns:
        The id of the standard (``standard[0]``).
    """
    global scores
    scores = []
    print_single_standard(standard)
    reference = standard[1]
    workers = Pool(processes=cpu_count())
    for record in seqs:
        workers.apply_async(do_alignment, args=(record, reference), callback=log_result)
    # No more tasks will be submitted; wait for all alignments to finish.
    workers.close()
    workers.join()
    print_done()
    return standard[0]
def do_alignment(seq, standard):
    """Align one sequence to the standard and normalise the score.

    Args:
        seq: ``[germline identity, sequence, seq_id]`` entry.
        standard: the standard sequence string.

    Returns:
        ``[germline identity, normalised score, seq_id]`` where the score is
        the pairwise2 ``globalxx`` score as a percentage of the longer of
        the two sequence lengths.
    """
    germline_identity, query_seq, seq_id = seq[0], seq[1], seq[2]
    raw_score = pairwise2.align.globalxx(query_seq, standard, one_alignment_only=1, score_only=1)
    percent = 100 * float(raw_score) / max(len(query_seq), len(standard))
    return [germline_identity, percent, seq_id]
def log_result(result):
    """Pool callback: append one do_alignment() result to the module-level ``scores`` list."""
    scores.append(result)
def make_figure(standard_id, scores, collection):
    """Draw the divergence-vs-identity hexbin plot and save it as a PDF.

    The file is written to ``<output>/<db>_<collection>_<standard_id>.pdf``.

    Args:
        standard_id: id of the standard; used in the title, the y-axis label
            and the file name.
        scores: ``[germline identity, identity score, seq_id]`` entries.
        collection: collection name, used in the file name.

    Returns:
        None. When ``scores`` is empty no figure is produced.
    """
    if not scores:
        # min()/max() below raise ValueError on empty input, which would
        # abort the whole run; skip this figure instead.
        print('')
        print('No sequences to plot for {0}; skipping figure.'.format(standard_id))
        return
    print_fig_info()
    fig_file = os.path.join(args.output, '{0}_{1}_{2}.pdf'.format(args.db, collection, standard_id))
    # x: germline divergence (100 - identity); y: identity to the standard.
    x = [100.0 - s[0] for s in scores]
    y = [s[1] for s in scores]
    xmin = min(x)
    xmax = max(x)
    ymin = min(y)
    # plot params
    plt.subplots_adjust(hspace=0.95)
    plt.subplot(111)
    plt.hexbin(x, y, bins='log', cmap=mpl.cm.jet, mincnt=2, gridsize=100)
    plt.title(standard_id, fontsize=18)
    # set and label axes; the y-axis upper limit is fixed at 102
    plt.axis([xmin - 2, xmax + 2, ymin - 2, 102])
    plt.xlabel('Germline divergence')
    plt.ylabel('{0} identity'.format(standard_id))
    # make and label the colorbar
    cb = plt.colorbar()
    cb.set_label('Sequence count (log10)', labelpad=10)
    # save figure and close
    plt.savefig(fig_file)
    plt.close()
    print_done()
def print_standards_info(standards):
    """Print how many standard sequences were found, then their ids.

    Args:
        standards: list of ``[id, sequence]`` pairs.
    """
    ids = [entry[0] for entry in standards]
    # print(x) with one argument behaves the same on Python 2 and 3,
    # unlike the Python 2-only print statement.
    for line in ('', '', 'Found {} standard sequence(s):'.format(len(standards)), ', '.join(ids)):
        print(line)
def print_collections_info(collections):
    """Print how many collections will be processed, then their names.

    Args:
        collections: list of collection names.
    """
    # Single-argument print(x) is valid on both Python 2 and 3.
    for line in ('', 'Found {} collection(s):'.format(len(collections)), ', '.join(collections)):
        print(line)
def print_single_standard(standard):
    """Announce the standard whose identities are about to be calculated.

    Args:
        standard: ``[id, sequence]`` pair; only the id is printed.
    """
    # Single-argument print(x) is valid on both Python 2 and 3.
    for line in ('', 'Standard ID: {}'.format(standard[0]), 'Calculating pairwise identities...'):
        print(line)
def print_single_collection(collection):
    """Print a banner naming the collection about to be processed.

    Args:
        collection: the collection name.
    """
    rule = '----------------------------------------'
    # Single-argument print(x) is valid on both Python 2 and 3.
    for line in ('', '', rule, 'Collection: {}'.format(collection), rule, ''):
        print(line)
def print_query_info():
    """Print the progress message shown before the MongoDB query."""
    # Single-argument print(x) is valid on both Python 2 and 3.
    print('')
    print('Querying for comparison sequences...')
def print_fig_info():
    """Print the progress message shown before the figure is drawn."""
    # Single-argument print(x) is valid on both Python 2 and 3.
    print('')
    print('Making the identity/divergence figure...')
def print_update_info():
    """Print the progress message shown before the database update."""
    # Single-argument print(x) is valid on both Python 2 and 3.
    print('')
    print('Updating the MongoDB database with identity scores...')
def print_done():
    """Print the completion marker that closes a progress message."""
    # Single-argument print(x) is valid on both Python 2 and 3.
    print('Done.')
def main():
    """Run the full pipeline for every collection/standard combination.

    For each collection the comparison sequences are queried once; then, for
    each standard, identities are computed, a figure is written and, unless
    --no_update was given, the scores are stored back in MongoDB.
    """
    reference_seqs = get_standards()
    print_standards_info(reference_seqs)
    coll_names = get_collections()
    print_collections_info(coll_names)
    for coll_name in coll_names:
        print_single_collection(coll_name)
        records = query(coll_name)
        for reference in reference_seqs:
            ref_id = identity(reference, records)
            # identity() leaves its results in the module-level ``scores``.
            make_figure(ref_id, scores, coll_name)
            if args.no_update:
                continue
            update_db(ref_id, scores, coll_name)
if __name__ == '__main__':
    # Module-level accumulator: identity() rebinds it for each standard and
    # log_result() appends alignment results to it.
    scores = []
    main()