BLAST2Network.py (forked from bolduc/blast2network)
#!/usr/bin/env python
__author__ = 'Benjamin Bolduc'
import sys
import os
import optparse
import math
import csv
import datetime
from collections import OrderedDict
from pprint import pprint
p = optparse.OptionParser(description="""A simple script that creates a graph (*.graphml format) from a BLAST file and
applies a community detection algorithm to identify groups of highly related sequences. Annotation information can also
be added, allowing higher-order organization and graphics in graphml-compatible graph visualization software. Options
exist to filter the BLAST HSPs going into the network, to set thresholds for the minimum number of members a group must
have to be retained in the graph, and to enable read-based analytics (using *.ace formatted files). Limited
visualization of the data is also possible.""")
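# Example invocation (hypothetical filenames, for illustration only):
#   python BLAST2Network.py --input_fn all-vs-all.blast --annotation_fn annotations.tab \
#       --output_fn MyNetwork --evalue 1e-30 --hsp_identity 0.75 --hsp_length 200 \
#       --contig_community_cutoff 50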
g = optparse.OptionGroup(p, "General Options")
g.add_option("--input_fn", dest="input_fn", metavar="FILENAME",
help="""BLAST file in pairwise format (out_fmt 0 option).""")
g.add_option("--annotation_fn", dest="annotation_fn", metavar="FILENAME",
help="""Tab-delimited file with header columns as arguments for annotation and the first row as the
sequence/contig/node's name. This name MUST MATCH the sequence name in whatever fasta file was used to
create the network from. EX:
Name Length Group Time Spacer Virus Type Fraction
Contig5 2554 8 Feb Yes Archaeal 17
Contig2 3221 6 Dec No Bacterial 19""")
g.add_option("--output_fn", dest="output_fn", metavar="FILENAME", default="Output", help="""Output prefix.""")
p.add_option_group(g)
g = optparse.OptionGroup(p, "BLAST Options")
g.add_option("--evalue", dest="evalue", type="float", metavar="FLOAT", default=1E-30,
help="""E-value cutoff for sequence selection (optional; default: %default).""")
g.add_option("--hsp_to_query_ratio", dest="hsp_to_query_ratio", type="float", metavar="FLOAT", default=0.0,
help="""Percent of query that belongs to the HSP. If alignment/HSP is 200 bp and query length is 400,
the ratio is 0.5 (optional; default: %default).""")
g.add_option("--hsp_identity", dest="hsp_identity", type="float", metavar="FLOAT", default=0.75,
help="""Percent identity cutoff. This allows a minimum % identity that an alignment must be to be included.
(optional; default: %default).""")
g.add_option("--hsp_length", dest="hsp_length", type="int", metavar="INTEGER", default=200,
help="""HSP length cutoff. This allows a minimum length that an alignment must be to be included.
(optional; default: %default).""")
p.add_option_group(g)
g = optparse.OptionGroup(p, "Network Options")
g.add_option("--num_inputs", dest="num_inputs", type="int", metavar="INTEGER",
help="""The number of sequences in the original BLAST. This is useful for when the BLAST file is too large
to be easily parsed.""")
g.add_option("--partitioner", dest = "partitioner", metavar = "PYTHON CODE", default = "%",
help = """Python code to further define a separating characteristic (i.e. site, date, etc) that can be used
during graph visualization to color-code nodes. For example, "%.split('|')[1].split('_')[1]" will identify
0808 for contig01129|NL10_0808_vDNA_MSU_WGA; '%' will be replaced by the sequence name. An alternative is to
use the annotation file, whose attributes will be passed to the created nodes (optional; Default: %default")""")
g.add_option("--use_reads", dest="use_reads", action="store_true", default=False,
help="""Enables the use of read counting using 454 ACE files.""")
g.add_option("--read_fn", dest="read_fn", metavar="FILENAME",
help="""Ace file with read-to-contig mapping information.""")
g.add_option("--read_community_cutoff", dest="read_co_cutoff", type="int", metavar="INTEGER", default=1000,
help="""Sets a cutoff for the minimum number of reads comprising the contigs of a community must have to be retained.""")
g.add_option("--contig_community_cutoff", dest="contig_co_cutoff", type="int", metavar="INTEGER", default=50,
help="""Sets a cutoff for the minimum number of members a community must have to be retained.""")
g.add_option("--remove_tmp", dest="tmp_remove", action="store_true", default=False,
help="""Removes initial network created prior to any manipulation. This is useful for when memory is
constrained and analysis crashes. This REMOVES that file at the end of a successful run.""")
p.add_option_group(g)
g = optparse.OptionGroup(p, "Post-Visualization Options")
p.add_option("--membership_per_community", dest="mpc_fig", action="store_true", default=False,
help="Will count the members in each community and generate a figure.")
p.add_option("--data_represented", dest="dr_fig", action="store_true", default=False,
help="Will calculate the data represented and generate a figure.")
(p, a) = p.parse_args()
def error(msg):
print >> sys.stderr, "ERROR: {}".format(msg)
sys.exit(1)
if p.input_fn is None:
error("An input sequence filename is required.")
try:
with open(p.input_fn): pass
except IOError:
error("Input file {} not found.".format(p.input_fn))
try:
from blastorage import Storage
except ImportError:
error("The BlaSTorage library is required and was not found. Available @ http://biowiki.crs4.it/biowiki/MassimilianoOrsini")
if p.mpc_fig or p.dr_fig:
try:
import matplotlib
except ImportError:
error("The matplotlib library is required and was not found. Available @ http://matplotlib.org/")
try:
import numpy as np
except ImportError:
error("The numpy library is required and was not found. Available @ ")
try:
import networkx as nx
except ImportError:
error("The NetworkX library is required and was not found. Available @ https://networkx.github.io/")
try:
import community
except ImportError:
error("The NetworkX-compatible Louvain community detection algorithm is required and was not found."
"It is available at https://bitbucket.org/taynaud/python-louvain")
if p.use_reads:
    if p.read_fn is None:
        error("A read filename is required.")
try:
with open(p.read_fn):
pass
except IOError:
error("Read file {} not found.".format(p.read_fn))
def logger(logfile_handle, text):
logfile_handle.write("{} {}\n".format(datetime.datetime.now(), text))
logfile_handle.flush()
print text
logfile_fn = "{}.log".format(p.output_fn)
logfile_fh = open(logfile_fn, 'a', 0) # Set unbuffered, so info written to disk immediately
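# Build a directed graph first: nodes are sequences, edges are BLAST HSPs that pass the cutoffs below.
# Edge weights are -ln(E-value), capped at 500.0 when the E-value is exactly 0.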
diG = nx.DiGraph()
node_annotation_dict = {}
if p.annotation_fn:
with open(p.annotation_fn, 'rU') as annotation_fh:
logger(logfile_fh, "Reading Annotation File.")
line_count = 0
header = []
for lines_of_data in annotation_fh:
if line_count == 0: # Hack to separate 1st line of file (annotation names) from rest of file
header = lines_of_data.strip().split('\t')
line_count += 1
else:
data = lines_of_data.strip().split('\t')
node_annotation_dict[data[0]] = dict(zip(header[1:], data[1:])) # With header columns as keys, rows as values
# Header should stay the same, with lengths, groups, etc always being keys and the data (changing) as values
db = p.input_fn.rsplit('.', 1)[0]+'.db'
infile = p.input_fn
graph_tmp_out = "{0}.tmp.graphml".format(p.output_fn)
uGraph = None  # Populated either from the saved tmp graphml or from the BLAST parse below
input_queries = 0
non_singleton_graph_node_count = 0
NoMatches = 0
NoMatches_fh = open(p.output_fn + '-NoMatches.list', 'a', 0)
if os.path.isfile("{}.graphml".format(p.output_fn)):
print "End file {}.graphml already exists. It is safe to delete this file.".format(p.output_fn)
else:
if os.path.isfile(graph_tmp_out):
logger(logfile_fh, "Graph file found! Jumpstarting Analysis.")
uGraph = nx.read_graphml(graph_tmp_out)
logger(logfile_fh, "Graph file loaded.")
non_singleton_graph_node_count = uGraph.number_of_nodes() # Update nodes if reloading graph file
if not os.path.isfile(graph_tmp_out):
logger(logfile_fh, "Processing main BLAST file.")
st = Storage(db, infile)
st.store()
api = st()
logger(logfile_fh, "BLAST file conversion finished. Parsing for network.")
blast_queries = api.getQueriesNumber()
        input_queries = blast_queries  # Number of reference sequences should/could be subtracted
logger(logfile_fh, "There are {} blast queries to parse.".format(blast_queries))
n = 0
for blast in api: # One Blast result for each query
query = str(blast.getQueryTitle()).strip()
query_len = int(blast.getQueryLength())
query_dict = {
"id": query,
"length": query_len,
"size": 1,
}
if query in node_annotation_dict:
query_dict.update(node_annotation_dict[query])
            # One round/iteration object per BLAST result; PSI-BLAST can have several, but apparently only 1 here (it still needs to be iterated through)
for iteration in blast:
if not iteration.found: # Another way of saying "if cond is False"
NoMatches += 1
NoMatches_fh.write(query + '\n')
NoMatches_fh.flush()
else:
# All sequences will get 'a chance.' If good match, then edge will connect.
# If no good match (i.e. fails >= 1 conditions), will be removed during singleton analysis
diG.add_node(query, query_dict) # Pass dict to networkx
for alignment in iteration: # Alignment, one for each subject
target = str(alignment.getSubjectTitle()).strip()
target_len = int(alignment.getSubjectLength()) # True target length, i.e. the contig size
                        target_dict = {
                            "id": target,
                            "length": target_len,
                            "size": 1,
                        }
if target in node_annotation_dict:
target_dict.update(node_annotation_dict[target])
for hsp in alignment:
hsp.setGradesDict()
evalue = hsp.getEvalueAsFloat()
identity = hsp.getIdentitiesAsPercentage()
identities = hsp.getIdentitiesAsAbsoluteValue() # Int
hsp_length = int(hsp.getIdentitiesExpression().split(' ')[0].split('/')[1]) # Hack to get hsp length, Int
hsp_identity = float(identities) / hsp_length # Percent identity over HSP length
                            total_alignment_length = hsp_length / (target_len + .0)  # Alignment length over target
hsp_to_query_ratio = hsp_length / (query_len + .0) # Can be greater than 1
if query == target:
continue # At the very least, it's a self-match, at worst, there's already a node in the network
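                            # Keep the HSP as an edge only if it passes all four user-set cutoffs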
if (evalue <= p.evalue) and (hsp_to_query_ratio >= p.hsp_to_query_ratio) \
and (hsp_identity >= p.hsp_identity) and (hsp_length >= p.hsp_length):
# Going to add a node anyway, might as well add it once
diG.add_node(target, target_dict)
                                if evalue == 0:
                                    # -ln(0) is undefined, so cap the edge weight at 500.0
                                    diG.add_edge(query, target, weight=500.0, length=hsp_length, identity=hsp_identity, label='BLAST')
                                else:
                                    expect = math.log(evalue) * -1  # Edge weight is the negative natural log of the E-value
                                    diG.add_edge(query, target, weight=expect, length=hsp_length, identity=hsp_identity, label='BLAST')
n += 1
sys.stdout.write("\r{0:.4f}".format((float(n) / blast_queries) * 100))
sys.stdout.flush()
logger(logfile_fh, "\nThere were {} queries without a match.".format(NoMatches))
logger(logfile_fh, "Finished processing BLAST file.")
        # Remove nodes with no edges from the directed graph.
        # This speeds things up by dropping nodes that are not relevant:
        # queries that had matches, but none strong enough to pass the cutoffs above.
singles = [node for node, degree in diG.degree_iter() if degree == 0]
logger(logfile_fh, "There were {} queries that were singletons.".format(len(singles)))
diG.remove_nodes_from(singles)
Insufficient_Match_fh = open(p.output_fn + '-PoorMatches.list', 'w')
formatted_singles = "\n".join(singles)
print >> Insufficient_Match_fh, formatted_singles
Insufficient_Match_fh.close()
logger(logfile_fh, "Removed singletons.")
logger(logfile_fh, "Converting directed graph to undirected.")
# Convert graph to undirected - can't run community analysis without
        uGraph = diG.to_undirected()
non_singleton_graph_node_count = uGraph.number_of_nodes()
logger(logfile_fh, "Writing raw graphml file to disk.")
        # In case downstream analysis fails... won't have to repeat
nx.write_graphml(uGraph, "{0}.tmp.graphml".format(p.output_fn))
logger(logfile_fh, "Attempting to close and remove BlaSTorage DB. If it fails, kill job and restart. It will resume."
" This is likely to happen if your BLAST file is >= 2 GB.")
st.close()
os.remove(db)
logger(logfile_fh, "Closing finished.")
# Sadly needed if BlaSTorage fails to close, halting the program. There is no other way to obtain the number of input
# queries unless the BLAST file is reloaded, so we either force the user to supply the number or the fasta file used
# for the BLAST, or just `grep -c '>'` the original fasta file and be done with it.
if (p.num_inputs):
input_queries = int(p.num_inputs)
else:
if input_queries == 0:
error("There are no input queries or the number of input queries has not been specified. This can happen if"
" the BlaSTorage library failed previously.")
def find_partition(graph):
    """Run Louvain community detection and attach each node's community id as the 'orig_partition' attribute."""
    g = graph
    partition = community.best_partition(g)  # {node: community_id}
    nx.set_node_attributes(g, 'orig_partition', partition)
    return g, partition
logger(logfile_fh, "Identifying partitions...")
# Find communities
coGraph, parts = find_partition(uGraph)
new_parts = {}
cluster_membership = {}
out_part_mem_fh = open(p.output_fn + '-Partition-Membership.tab', 'w')
for sequences, partitions in parts.iteritems():
    print >> out_part_mem_fh, '{0}\t{1}'.format(sequences, partitions)
    if partitions not in new_parts:  # The keys are partition numbers
        new_parts[partitions] = 1
    else:
        new_parts[partitions] += 1
    if partitions not in cluster_membership:
        cluster_membership[partitions] = [sequences]
    else:
        cluster_membership[partitions].append(sequences)
out_part_mem_fh.close()
if p.partitioner != "%":
logger(logfile_fh, "Partitioner enabled due to non-default options.")
logger(logfile_fh, "Identifying fraction information...")
part_site_comp_fh = csv.writer(open(p.output_fn + '-Partition-Site-Composition.csv', 'w'))
partition_sites = {}
    # '%' in the user-supplied partitioner code stands in for the sequence name; build the extractor once
    get_characteristic = eval("lambda x: " + p.partitioner.replace('%', 'x').replace("\\x", '%'))
    for contigs, partitions in parts.iteritems():
        partition_num = str(partitions)
        characteristic = get_characteristic(contigs)
        if partition_num not in partition_sites:
            partition_sites[partition_num] = {}
        if characteristic not in partition_sites[partition_num]:
            partition_sites[partition_num][characteristic] = 0
        partition_sites[partition_num][characteristic] += 1
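    # Collect every characteristic seen in any cluster so each CSV row has the same columns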
characteristics = {}
for cluster_id in partition_sites:
for characteristic in partition_sites[cluster_id]:
characteristics[characteristic] = None
characteristics = sorted(characteristics)
part_site_comp_fh.writerow(["Cluster"] + characteristics + ["Total Contigs"])
for cluster_id in sorted(partition_sites):
row = [cluster_id]
total_contigs = sum(partition_sites[cluster_id].values())
for characteristic in characteristics:
row.append(partition_sites[cluster_id].get(characteristic, 0))
# If reference sequence present, remove count from total
if 'REF' in characteristic:
total_contigs -= partition_sites[cluster_id].get(characteristic, 0)
row.append(total_contigs)
part_site_comp_fh.writerow(row)
cutoff = []
if p.use_reads:
    print "Working on ace file {}".format(p.read_fn)
contig_read_dict = {}
contig_read_len_dict = {}
from Bio.Sequencing import Ace
    with open(p.read_fn, 'rU') as ace_fh:
for contig in Ace.parse(ace_fh):
"""rd (reads) - read with name, sequence, etc
qa (read qual) - which parts used as consensus
ds - file name of read's chromatogram file
af - loc of read within contig
bs (base segment) - which read chosen at consensus at each pos
rt (transient read tags) - generated by crossmatch and phrap
ct (consensus tag)
wa (whole assembly tag) - hosts assembly program name, version, etc
wr
reads - info about read supporting ace contig
contig - holds info about contig from ace record"""
contig_name = "{}".format(contig.name) # contig00001
if not contig_name in contig_read_dict:
contig_read_dict[contig_name] = []
if not contig_name in contig_read_len_dict:
contig_read_len_dict[contig_name] = []
for read_id, read in enumerate(contig.reads):
                # Using enumerate to get an int id while parsing through the list of the contig's reads. The list isn't
                # necessarily in order, meaning a read's position in one block won't correspond to its location (in the
                # list) in another block
read_name = read.rd.name.split('.')[0] # Functionally equivalent to contig.af[read_id].name
read_length = len(read.rd.sequence)
if not read_name in contig_read_dict[contig_name]:
contig_read_dict[contig_name].append(read_name)
contig_read_len_dict[contig_name].append(read_length)
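    # contig_read_dict now maps each contig to its unique read names; contig_read_len_dict to the individual read lengths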
cluster_membership = OrderedDict(sorted(cluster_membership.items(), key=lambda t: len(t[1]), reverse=True)) # Sorted by value
cluster_read_abundances = OrderedDict()
cluster_read_length_abundances = OrderedDict()
    for cluster, contigs in cluster_membership.iteritems():  # Ordered above, but need another dict to keep it that way
        # Might as well filter the list here and now. Filtering the reads later would have been pointless, since
        # N reads will never be sufficient to generate N contigs in order to be included in the network
for contig in contigs:
try:
num_reads = len(contig_read_dict[contig]) # Get length of contig's read list
read_lengths = sum(contig_read_len_dict[contig]) # Get sum of all read lengths
            except KeyError:  # References will not be in the metagenome's ace files, obviously
                print "Skipping {}".format(contig)
                continue  # Skip this contig but keep counting the rest of the cluster's contigs
if not cluster in cluster_read_abundances:
cluster_read_abundances[cluster] = 0
if not cluster in cluster_read_length_abundances:
cluster_read_length_abundances[cluster] = 0
cluster_read_abundances[cluster] += num_reads
cluster_read_length_abundances[cluster] += read_lengths
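    # Write per-partition contig and read counts; partitions whose read total meets the cutoff are kept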
out_part_size_fh = open(p.output_fn + '-Partition-Contig-Read-Sizes.tab', 'w')
for keys, values in new_parts.iteritems(): # partition: count
try:
cluster_reads = cluster_read_abundances[keys]
print >> out_part_size_fh, '{0}\t{1}\t{2}'.format(keys, values, cluster_reads) # print partition \t count
            if cluster_reads >= p.read_co_cutoff:
                cutoff.append(keys)
        except KeyError:  # All clusters are in new_parts, but not all clusters have read counts
            print "Unable to find {}. It has {} contigs".format(keys, values)
out_part_size_fh.close()
if p.partitioner != "%":
part_site_read_abund_fh = csv.writer(open(p.output_fn + '-Partition-Site-Read-Abundances.csv', 'w'))
partition_read_abundance_sites = {}
        get_characteristic = eval("lambda x: " + p.partitioner.replace('%', 'x').replace("\\x", '%'))
        for contigs, partitions in parts.iteritems():
            partition_num = str(partitions)
            try:
                num_reads = len(contig_read_dict[contigs])
            except KeyError:  # References will not be in the metagenome's ace files, obviously
                print "Skipping {}".format(contigs)
                num_reads = 0
            characteristic = get_characteristic(contigs)
            if partition_num not in partition_read_abundance_sites:
                partition_read_abundance_sites[partition_num] = {}
            if characteristic not in partition_read_abundance_sites[partition_num]:
                partition_read_abundance_sites[partition_num][characteristic] = 0
            partition_read_abundance_sites[partition_num][characteristic] += num_reads
characteristics = {}
for cluster_id in partition_read_abundance_sites:
for characteristic in partition_read_abundance_sites[cluster_id]:
characteristics[characteristic] = None
characteristics = sorted(characteristics)
part_site_read_abund_fh.writerow(["Cluster"] + characteristics + ["Total Reads", "Total Contigs"])
read_composition_array = []
for cluster_id in sorted(partition_read_abundance_sites):
row = [cluster_id]
total_reads = sum(partition_read_abundance_sites[cluster_id].values())
total_contigs = new_parts[int(cluster_id)]
list_to_append = []
for characteristic in characteristics:
row.append(partition_read_abundance_sites[cluster_id].get(characteristic, 0))
                if total_contigs >= p.contig_co_cutoff:
list_to_append.append(partition_read_abundance_sites[cluster_id].get(characteristic, 0))
row.append(total_reads)
row.append(total_contigs)
            if list_to_append:  # Don't want to add empty arrays
read_composition_array.append(list_to_append)
part_site_read_abund_fh.writerow(row)
# http://stackoverflow.com/questions/19872530/python-sort-lists-based-on-their-sum
read_composition_array.sort(key=sum) # So easy it makes me want to cry
# If not using reads
else:
out_part_size_fh = open(p.output_fn + '-Partition-Contig-Sizes.tab', 'w')
for keys, values in new_parts.iteritems(): # partition: count
print >> out_part_size_fh, '{0}\t{1}'.format(keys, values) # print partition \t count
if values >= p.contig_co_cutoff:
cutoff.append(keys)
logger(logfile_fh, "Removing clusters containing less than {} members".format(p.contig_co_cutoff))
# Create a subgraph that includes only those above a specified cutoff value
subGraph = coGraph.subgraph([n for n, attrdict in coGraph.node.items() if attrdict['orig_partition'] in cutoff])
logger(logfile_fh, "Writing final network to disk...")
nx.write_graphml(subGraph, "{0}.graphml".format(p.output_fn))
logger(logfile_fh, "Network written to disk")
if p.tmp_remove:
    logger(logfile_fh, "Removing temporary network file...")
    os.remove(graph_tmp_out)
if p.mpc_fig or p.dr_fig:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import rcParams
if p.mpc_fig:
logger(logfile_fh, "Generating Membership Per Community Figure")
    rcParams['ytick.direction'] = 'out'
    font = {'family': 'sans-serif',
            'weight': 'normal',
            'size': 10}
    lines = {'linewidth': 0.5}
    matplotlib.rc('font', **font)
    matplotlib.rc('lines', **lines)
Data = [value for (key, value) in new_parts.iteritems() if key in cutoff]
Data.sort(reverse=True)
# Removes the "long tail" often associated with rare species
Data_NoFew = filter(lambda few: few >= 2, Data)
fig = plt.figure(figsize=(8, 5))
ax = fig.add_subplot(1, 1, 1)
N = len(Data_NoFew)
ind = np.arange(N)
width = 0.8
margin = 0.1
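    # Color communities that meet the contig cutoff blue and the rest red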
colors = []
for data in Data_NoFew:
if data >= p.contig_co_cutoff:
colors.append('b')
else:
colors.append('r')
ax.bar(ind + margin, Data_NoFew, width, color=colors, align='edge', alpha=.8, zorder=1, edgecolor="none", linewidth=0) # Removed label=
ax.set_ylabel('Members/Community')
ax.set_xlabel('Viral Group Rank')
ax.legend(loc='upper left', frameon=False)
fig.savefig(p.output_fn + "-Members-in-Each-Community.png", dpi=600, bbox_inches='tight')
if p.dr_fig:
logger(logfile_fh, "Generating NonSingleton Data Represented Figure.")
import matplotlib.cm as cm
from matplotlib.colors import Normalize
font = {'family': 'sans-serif',
'weight': 'normal',
'size': 10}
lines = {'linewidth': 0.5}
matplotlib.rc('font', **font)
matplotlib.rc('lines', **lines)
fig = plt.figure(figsize=(7, 4))
ax = fig.add_subplot(1, 1, 1)
# new_parts = partition#, counts
Data = [value for (key, value) in new_parts.iteritems()]
N = 15 # Change to 16 if including 16 cutoff range
ind = np.arange(N)
width = 0.8
Total_Contigs = int(input_queries)
# Will create a list with values of the counts of each of the partitions (done above)
Total_Contigs_In_Network_list = [value for value in Data if value >= 2]
Total_Contigs_In_Network = sum(Total_Contigs_In_Network_list)
Total_Contigs_In_Gt1 = non_singleton_graph_node_count
data_represented_membership_size_cutoffs = [2, 3, 4, 5, 10, 15, 20, 30, 40, 50, 100, 200, 300, 400, 500]
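    # For each minimum community size, compute the % of non-singleton contigs held in communities at least that large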
Data_Represented = []
for cutoff in data_represented_membership_size_cutoffs:
Data_prerepresented = []
for community_sizes in Data:
if community_sizes >= cutoff:
Data_prerepresented.append(community_sizes)
# Calculate % of data represented by the new communities that are above the cutoff
to_append = (sum(Data_prerepresented) / (Total_Contigs_In_Gt1 + .0)) * 100
Data_Represented.append(to_append)
my_cmap = cm.get_cmap('jet')
my_norm = Normalize(vmin=min(Data_Represented), vmax=max(Data_Represented))
ax.bar(ind, Data_Represented, width, color=my_cmap(my_norm(Data_Represented)), align='edge', alpha=1, zorder=1)
xlabels = ["{0}".format(x) for x in data_represented_membership_size_cutoffs]
ax.set_xlabel('Members/Group')
ax.set_ylabel('Contigs Represented (%)')
ax.set_xlim(0)
ax.set_ylim(0)
ax.set_xticks(ind + (width / 2)) # ind+width
    xtickNames = ax.set_xticklabels(xlabels)
plt.setp(xtickNames, rotation=0, fontsize=10)
sm = plt.cm.ScalarMappable(cmap=my_cmap, norm=Normalize(vmin=min(Data_Represented), vmax=max(Data_Represented)))
sm._A = []
plt.colorbar(sm)
fig.savefig(p.output_fn + "-NoSingles-Contig-Data-Represented.png", dpi=600, bbox_inches='tight')
if p.dr_fig:
logger(logfile_fh, "Generating Total Data Represented Figure.")
import matplotlib.cm as cm
from matplotlib.colors import Normalize
font = {'family': 'sans-serif',
'weight': 'normal',
'size': 10}
lines = {'linewidth': 0.5}
matplotlib.rc('font', **font)
matplotlib.rc('lines', **lines)
fig = plt.figure(figsize=(7, 4))
ax = fig.add_subplot(1, 1, 1)
Data = [value for (key, value) in new_parts.iteritems()]
N = 15 # Change to 16 if including 16 cutoff range
ind = np.arange(N)
width = 0.8
Total_Contigs = int(input_queries) + .0
# Will create a list with values of the counts of each of the partitions (done above)
Total_Contigs_In_Network_list = [value for value in Data if value >= 1]
Total_Contigs_In_Network = sum(Total_Contigs_In_Network_list)
data_represented_membership_size_cutoffs = [2, 3, 4, 5, 10, 15, 20, 30, 40, 50, 100, 200, 300, 400, 500]
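    # Same as above, but the denominator is all input queries rather than only the non-singleton network contigs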
Data_Represented = []
for cutoff in data_represented_membership_size_cutoffs:
Data_prerepresented = []
for community_sizes in Data:
if community_sizes >= cutoff:
Data_prerepresented.append(community_sizes)
# Calculate % of data represented by the new communities that are above the cutoff
to_append = (sum(Data_prerepresented) / (Total_Contigs + .0)) * 100
Data_Represented.append(to_append)
my_cmap = cm.get_cmap('jet')
my_norm = Normalize(vmin=min(Data_Represented), vmax=max(Data_Represented))
ax.bar(ind, Data_Represented, width, color=my_cmap(my_norm(Data_Represented)),
align='edge', alpha=1, zorder=1)
xlabels = ["{0}".format(x) for x in data_represented_membership_size_cutoffs]
ax.set_xlabel('Members/Group')
ax.set_ylabel('Contigs Represented (%)')
ax.set_xlim(0)
ax.set_ylim(0)
ax.set_xticks(ind + (width / 2)) # ind+width
    xtickNames = ax.set_xticklabels(xlabels)
plt.setp(xtickNames, rotation=0, fontsize=10)
sm = plt.cm.ScalarMappable(cmap=my_cmap, norm=Normalize(vmin=min(Data_Represented), vmax=max(Data_Represented)))
sm._A = []
plt.colorbar(sm)
fig.savefig(p.output_fn + "-Total-Data-Represented.png", dpi=600, bbox_inches='tight')
logger(logfile_fh, "Program Complete.")
logfile_fh.close()