forked from BD2KGenomics/hgvm
/
clusterGraphs.py
executable file
·373 lines (308 loc) · 12.6 KB
/
clusterGraphs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
#!/usr/bin/env python2.7
"""
Cluster some graphs (using vg compare for pairwise distances) based on their similarity.
Current implementation : upgma tree using Jaccard distance matrix
"""
import argparse, sys, os, os.path, random, subprocess, shutil, itertools, glob
import doctest, re, json, collections, time, timeit, string, math
from Bio.Phylo.TreeConstruction import _DistanceMatrix, DistanceTreeConstructor
from Bio import Phylo
import matplotlib
matplotlib.use('Agg')
import pylab
import networkx as nx
from collections import defaultdict
from toil.job import Job
from toillib import RealTimeLogger, robust_makedirs
from heatmap import plotHeatMap
from callVariants import alignment_sample_tag
def parse_args(args):
    """ build the command line parser (Toil options first, then the
    options of this script) and parse args with its first element,
    the program name, dropped
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Add the Toil options so the job store is the first argument
    Job.Runner.addToilOptions(parser)

    # General options, registered in the order they show up in the help
    argument_specs = [
        ("graphs", dict(nargs="+",
                        help="other graph(s) to compare to baseline")),
        ("out_dir", dict(type=str,
                         help="directory to which results will be written.")),
        ("--kmer", dict(type=int, default=27,
                        help="kmer size for comparison")),
        ("--edge_max", dict(type=int, default=7,
                            help="edge-max parameter for vg kmer index")),
        ("--overwrite", dict(action="store_true", default=False,
                             help="overwrite existing files")),
        ("--vg_cores", dict(type=int, default=1,
                            help="number of cores to give to vg commands")),
        ("--avg_samples", dict(action="store_true", default=False,
                               help="Average samples into mean value")),
        ("--dir_tag", dict(action="store_true", default=False,
                           help="Use directory of graph as name tag")),
        ("--orig_tag", dict(type=str, default="graphs",
                            help="When dir_tag used, change this tag to original")),
        ("--only_summary", dict(action="store_true", default=False,
                                help="Only generate summary output.  Do not do any"
                                " compute")),
    ]
    for arg_name, arg_kwargs in argument_specs:
        parser.add_argument(arg_name, **arg_kwargs)

    # skip the program name
    return parser.parse_args(args[1:])
def index_path(graph, options):
    """ path of the index that goes with a graph: the graph path with
    ".index" appended (options is not used)
    """
    index_suffix = ".index"
    return graph + index_suffix
def compute_kmer_index(job, graph, options):
    """ run vg index on the input graph (if necessary)
    vg indexes are just created in place, ie same dir as graph,
    so need to have write permission there

    The index is (re)built when options.overwrite is set or when the
    index file does not exist yet.  vg is called with an argument list
    (no shell), so graph paths containing spaces or shell characters are
    passed through unchanged.

    Raises subprocess.CalledProcessError if vg index exits with a non-zero
    status (OSError if vg cannot be launched), so that a failed index
    fails the job instead of being silently ignored.
    """
    out_index_path = index_path(graph, options)
    if not options.overwrite and os.path.exists(out_index_path):
        # index already there and we were not asked to redo it
        return

    cmd = ["vg", "index", "-s",
           "-k", str(options.kmer),
           "-t", str(options.vg_cores)]
    # edge-max is only passed on when it is positive
    if options.edge_max > 0:
        cmd += ["-e", str(options.edge_max)]
    cmd.append(graph)

    subprocess.check_call(cmd)
def dir_tag(graph, options):
    """ optional unique prefix made from the directory holding the graph

    Returns "" unless options.dir_tag is set.  Otherwise returns the
    second-to-last "/"-separated component of graph followed by "_",
    except that a component equal to options.orig_tag gives "original_".
    """
    if options.dir_tag:
        prefix = graph.split("/")[-2] + "_"
        return "original_" if prefix == options.orig_tag + "_" else prefix
    return ""
def comp_path(graph1, graph2, options):
    """ path of the json file holding the vg compare output for a pair
    of graphs: <out_dir>/compare/<name1>_vs_<name2>.json, where each name
    is the graph's dir_tag prefix plus its file name without extension
    """
    def tagged_name(graph):
        return dir_tag(graph, options) + os.path.splitext(os.path.basename(graph))[0]

    json_name = "_vs_".join([tagged_name(graph1), tagged_name(graph2)]) + ".json"
    return os.path.join(options.out_dir, "compare", json_name)
def mat_path(options):
    """ location of the distance matrix file (distmat.tsv) inside
    the output directory
    """
    matrix_file = "distmat.tsv"
    return os.path.join(options.out_dir, matrix_file)
def tree_path(options):
    """ location of the newick tree file (tree.newick) inside
    the output directory
    """
    tree_file = "tree.newick"
    return os.path.join(options.out_dir, tree_file)
def heatmap_path(options, tag=""):
    """ location of a heatmap pdf inside the output directory; tag is
    inserted between "heatmap" and ".pdf" to tell variants apart
    """
    pdf_name = "heatmap{}.pdf".format(tag)
    return os.path.join(options.out_dir, pdf_name)
def draw_len(weight):
    """ actual weights are between 0 and 1 but vary by many orders of
    magnitude.  map them onto a handful of fixed values usable as a
    graphviz edge length hint; weights of 0.2 and up give 1.6 + weight
    """
    # (exclusive upper bound on the weight, length hint), smallest first
    length_steps = (
        (0.0001, .5),
        (0.001, .75),
        (0.01, 1.),
        (0.1, 1.25),
        (0.2, 1.6),
    )
    for upper_bound, length in length_steps:
        if weight < upper_bound:
            return length
    return 1.6 + weight
def compute_matrix(options):
    """ make a distance matrix (dictionary of dictionaries) out of the
    vg compare json files of all graph pairs

    The distance of a pair is 1 - intersection / union as read from the
    json file at comp_path().  A pair whose union is 0 is recorded with a
    jaccard of 2, ie a distance of -1.  When several graphs map to the
    same label (options.avg_samples), their distances are averaged.

    Returns (mat, names): mat[label1][label2] is the mean distance and
    names is the list of unique labels.  Nothing is written to disk here.
    """
    def label_fn(graph):
        base = os.path.splitext(os.path.basename(graph))[0]
        if not options.avg_samples:
            return dir_tag(graph, options) + base

        # ex: NA3453456_agumented.vg -> augmented
        parts = base.split("_")
        label = "".join(parts[1:])
        if label == "":
            # no underscore in the name, keep it whole
            label = parts[0]
        toks = label.split("-")
        label = toks[0]
        if label == "debruijn":
            label += "-{}".format(toks[-1])
        label = dir_tag(graph, options) + label
        # hack (original_cactus -> cactus_original)
        pieces = label.split("_")
        if pieces[0] == "original" and len(pieces) > 1:
            label = "_".join(pieces[1:]) + "_" + pieces[0]
        assert len(label) > 0
        return label

    # label of every graph, computed once, in the order of options.graphs
    labels = [label_fn(graph) for graph in options.graphs]
    names = list(set(labels))
    labelled_graphs = list(zip(options.graphs, labels))

    # make empty distance matrix and counts table (for mean)
    mat = {name: defaultdict(float) for name in names}
    counts = {name: defaultdict(float) for name in names}

    # fill the matrix, summing if two graphs map to same label
    for graph1, label1 in labelled_graphs:
        for graph2, label2 in labelled_graphs:
            if graph1 <= graph2:
                jpath = comp_path(graph1, graph2, options)
                with open(jpath) as f:
                    j = json.loads(f.read())
                union = float(j["union"])
                if union == 0:
                    jaccard = 2.
                else:
                    jaccard = float(j["intersection"]) / union
                distance = 1. - jaccard
                mat[label1][label2] += distance
                mat[label2][label1] += distance
                counts[label1][label2] += 1.
                counts[label2][label1] += 1.

    # divide by counts to get mean.  this has to visit every pair of
    # *unique* labels exactly once: going over the per-graph labels would
    # divide a cell again for each extra graph sharing the label
    for name1 in names:
        for name2 in names:
            mat[name1][name2] /= counts[name1][name2]

    return mat, names
def compute_tree(options, mat, names):
    """ make upgma hierarchical clustering out of the distance matrix and
    write it as newick (tree_path), as png and as graphviz dot (the latter
    two at tree_path with "newick" replaced by "png" / "dot")
    """
    def hacked_distance(name_a, name_b):
        # tree constructor writes 0-distances as 1s for some reason
        # so we hack around here
        dist = float(mat[name_a][name_b])
        if dist == 0.:
            return 1e-10
        if dist == 1.:
            return 1.1
        return dist

    # oops, convert to biopython matrix (lower triangle, diagonal included)
    matrix = [[hacked_distance(names[i], names[j]) for j in xrange(i + 1)]
              for i in xrange(len(names))]
    dm = _DistanceMatrix(names, matrix)

    # upgma tree
    tree = DistanceTreeConstructor().upgma(dm)
    newick_path = tree_path(options)
    robust_makedirs(os.path.dirname(newick_path))
    Phylo.write(tree, newick_path, "newick")

    # png tree -- note : doesn't work in toil
    def leaf_label(clade):
        # inner nodes are drawn without a label
        return "" if "Inner" in str(clade) else clade

    Phylo.draw_graphviz(tree, label_func=leaf_label, node_size=1000,
                        node_shape="s", font_size=10)
    pylab.savefig(newick_path.replace("newick", "png"))

    # graphviz
    # get networkx graph, make it undirected and push names to name labels
    nxgraph = nx.Graph(Phylo.to_networkx(tree))
    nxgraph = nx.convert_node_labels_to_integers(nxgraph, label_attribute="label")

    for node_id in nxgraph.nodes():
        node_attrs = nxgraph.node[node_id]
        if "Inner" not in str(node_attrs["label"]):
            node_attrs["fontsize"] = 18
        else:
            # inner node: empty label and next to no size
            node_attrs["label"] = "\"\""
            node_attrs["width"] = 0.001
            node_attrs["height"] = 0.001

    for src, dst in nxgraph.edges():
        edge_attrs = nxgraph.edge[src][dst]
        # in graphviz, weight means something else, so make it a label
        weight = float(edge_attrs["weight"])
        # undo hack from above: tiny weights and weights of 1 or more
        # are all shown as 0
        # NOTE(review): this also zeroes the 1.1 stand-in for a real
        # distance of 1 -- confirm that this is intended
        if weight <= 1e-10 or weight >= 1:
            weight = 0.
        edge_attrs["weight"] = None
        edge_attrs["label"] = "{0:.3g}".format(float(weight) * 100.)
        edge_attrs["fontsize"] = 14
        edge_attrs["len"] = draw_len(weight)

    nx.write_dot(nxgraph, newick_path.replace("newick", "dot"))
def compute_heatmap(options, mat, names):
    """ make pdf heatmaps out of the matrix: plain, log-normalised, and
    both again with vmax fixed to 1.0
    """
    # square list-of-lists version of the matrix, rows and columns
    # in the order of names
    array_mat = [[mat[row_name][col_name] for col_name in names]
                 for row_name in names]

    # (file name tag, logNorm, additional keyword arguments)
    variants = (
        ("", False, {}),
        ("_log", True, {}),
        ("_vm1", False, {"vmax": 1.0}),
        ("_log_vm1", True, {"vmax": 1.0}),
    )
    for tag, log_norm, extra_kwargs in variants:
        plotHeatMap(array_mat, names, names,
                    heatmap_path(options, tag),
                    leftTree=True,
                    topTree=True,
                    logNorm=log_norm,
                    **extra_kwargs)
def cluster_comparisons(options):
    """ summary step: compute the distance matrix, then hand it to
    compute_tree (upgma tree) and compute_heatmap (pdf heatmaps)
    all based on distance = 1 - jaccard index
    """
    dist_mat, labels = compute_matrix(options)
    for make_output in (compute_tree, compute_heatmap):
        make_output(options, dist_mat, labels)
def compute_kmer_comparison(job, graph1, graph2, options):
    """ run vg compare between two graphs, writing its output to the
    json file given by comp_path()

    Both graphs must already have their index file.  The comparison is
    run when options.overwrite is set or the output file does not exist.
    vg is called with an argument list (no shell) and at most 2 threads.

    Raises subprocess.CalledProcessError if vg compare exits with a
    non-zero status (OSError if it cannot be launched).  In that case the
    output file is removed, so that an empty or partial json is not taken
    for a finished comparison by a later run.
    """
    out_path = comp_path(graph1, graph2, options)
    graph1_index_path = index_path(graph1, options)
    assert os.path.exists(graph1_index_path)
    graph2_index_path = index_path(graph2, options)
    assert os.path.exists(graph2_index_path)

    if not options.overwrite and os.path.exists(out_path):
        # comparison already there and we were not asked to redo it
        return

    robust_makedirs(os.path.dirname(out_path))
    cmd = ["vg", "compare", graph1, graph2,
           "-t", str(min(options.vg_cores, 2))]
    try:
        with open(out_path, "w") as out_file:
            subprocess.check_call(cmd, stdout=out_file)
    except (OSError, subprocess.CalledProcessError):
        # don't leave a truncated result behind
        if os.path.exists(out_path):
            os.remove(out_path)
        raise
def compute_comparisons(job, options):
    """ add one child job running vg compare for every pair of graphs
    (graph1 <= graph2, so each pair once and each graph with itself),
    each outputting a json file
    """
    compare_cores = min(options.vg_cores, 2)
    graph_pairs = [(first, second)
                   for first in options.graphs
                   for second in options.graphs
                   if first <= second]
    for first, second in graph_pairs:
        job.addChildJobFn(compute_kmer_comparison, first, second, options,
                          cores=compare_cores)
def compute_kmer_indexes(job, options):
    """ root toil job
    one child job per graph computes its index,
    then compute_comparisons is added as follow on
    """
    index_cores = options.vg_cores

    # do all the indexes
    for graph in options.graphs:
        job.addChildJobFn(compute_kmer_index, graph, options, cores=index_cores)

    # do the comparisons
    job.addFollowOnJobFn(compute_comparisons, options, cores=1)
def main(args):
    """ parse the command line, run the toil workflow (skipped with
    --only_summary), then make the summary output
    """
    options = parse_args(args)

    RealTimeLogger.start_master()

    if any(os.path.splitext(graph)[1] != ".vg" for graph in options.graphs):
        raise RuntimeError("Input graphs expected to have .vg extension")

    # Make a root job
    root_job = Job.wrapJobFn(compute_kmer_indexes, options,
                             cores=1, memory="2G", disk=0)

    # Run it and see how many jobs fail
    failed_jobs = 0
    if not options.only_summary:
        failed_jobs = Job.Runner.startToil(root_job, options)

    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))

    RealTimeLogger.stop_master()

    # Do the drawing outside toil to get around weird import problems
    cluster_comparisons(options)


if __name__ == "__main__":
    sys.exit(main(sys.argv))