forked from jgurtowski/ectools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
graphCellStats.py
114 lines (82 loc) · 3.28 KB
/
graphCellStats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python
#Takes as input schtats output files,
#one per SMRTcell and graphs them.
#file names are expected to be from the deplex
#script
import math
import os
import sys
from itertools import chain, imap, cycle
from operator import itemgetter

import matplotlib as mpl
mpl.use('Agg')  # backend must be selected before pyplot is imported
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from numpy import arange

from args import parseArgs, getHelpStr, CLArgument
from args import argflag
from nucio import fileIterator, lineItemIterator
# Command-line interface: usage text plus the option table handed to the
# project-local `args` helpers (CLArgument / parseArgs / getHelpStr).
description = ("Usage: graphCellStats.py [options] title in.schtats [in2.schtats ...]\n\n"
               "Graph SMRTCell Stats")

# Each row is passed positionally to CLArgument._make.
# NOTE(review): the field order looks like (name, flag, type, default, help);
# confirm against the CLArgument definition in args.py.
argument_list = [["lengreater", "lengreater", int, 10000,
                  ("The y-axis will be of reads greater than this "
                   "argument. Make sure that all schtats outputs "
                   "have this increment: ex: #>10000 "
                   "Default: 10000 ")],
                 ["counts", "counts", argflag, False,
                  ("Graph counts instead of bases ie. number of reads")],
                 ["out", "out", str, "cellstats.pdf",
                  ("Output filename. Default: 'cellstats.pdf'")]]

arguments = [CLArgument._make(spec) for spec in argument_list]

# No arguments at all: print the help text and stop.
if len(sys.argv) <= 1:
    sys.exit(getHelpStr(description, arguments) + "\n")

(p_arg_map, args_remaining) = parseArgs(sys.argv[1:], arguments)

# The usage line requires a title AND at least one schtats file; a title on
# its own would otherwise fall through and produce an empty plot.
if len(args_remaining) < 2:
    sys.exit(getHelpStr(description, arguments) + "\n")

title = args_remaining[0]
infiles = args_remaining[1:]


def _cell_name(path):
    """Return the cell label for an input path.

    The label is the first two "_"-separated fields of the file name up to
    its first ".". Only the basename is inspected, so leading directories
    (including "./" or names containing "_" or ".") cannot leak into the
    label.
    """
    stem = os.path.basename(path).split(".")[0]
    return "_".join(stem.split("_")[:2])


cellnames = [_cell_name(f) for f in infiles]


def fit_gen(filename):
    """Open `filename` as an iterator of tokenised lines via nucio."""
    return fileIterator(filename, lineItemIterator)


# Built eagerly (a list, not a lazy map) so it can be zipped with cellnames.
file_iterators = [fit_gen(f) for f in infiles]
def getBasesFromLineArr(arr):
    """Pull the base-count field out of one tokenised schtats line.

    Args:
        arr: Sequence of string tokens for a single line; may be empty.

    Returns:
        * the text after "=" in token 6 when the first token starts with
          "n=" (presumably the total-bases field of the summary line --
          confirm against schtats output);
        * the text after "=" in token 1 when the first token is the "#>N"
          entry for N == p_arg_map["lengreater"];
        * None for empty lines and every other line.

    The value is returned as a string; callers convert it.
    """
    if not arr:
        return None

    head = arr[0]
    if head.startswith("n="):
        return arr[6].split("=")[1]

    marker = "#>%d" % p_arg_map["lengreater"]
    # A bare prefix test would let "#>1000" also claim the "#>10000..." line,
    # so the threshold's digits must stop exactly where the marker stops.
    if head.startswith(marker) and not head[len(marker):len(marker) + 1].isdigit():
        return arr[1].split("=")[1]

    return None
def getCountsFromLineArr(arr):
    """Pull the read-count field out of one tokenised schtats line.

    Args:
        arr: Sequence of string tokens for a single line; may be empty.

    Returns:
        The text after the first "=" of token 0 when that token either
        starts with "n=" or is the "#>N" entry for
        N == p_arg_map["lengreater"]; None for empty lines and every other
        line. The value is returned as a string; callers convert it.
    """
    if not arr:
        return None

    head = arr[0]
    if head.startswith("n="):
        return head.split("=")[1]

    marker = "#>%d" % p_arg_map["lengreater"]
    # A bare prefix test would let "#>1000" also claim the "#>10000..." line,
    # so the threshold's digits must stop exactly where the marker stops.
    if head.startswith(marker) and not head[len(marker):len(marker) + 1].isdigit():
        return head.split("=")[1]

    return None
def intlog(x):
    """Return the natural logarithm of the integer written in string `x`."""
    return math.log(int(x))


# Choose the per-line extractor matching the quantity being graphed.
dgetter = getCountsFromLineArr if p_arg_map["counts"] else getBasesFromLineArr

# One row per usable input file: [log(first value), log(second value), cell].
# The values are taken in file order; the plot below treats the first as the
# overall total (x) and the second as the amount above the threshold (y).
data = []
for cellname, it in zip(cellnames, file_iterators):
    found = [value for value in (dgetter(arr) for arr in it) if value]
    if len(found) != 2:
        # Missing or duplicated lines would otherwise surface later as an
        # opaque tuple-unpacking error.
        sys.stderr.write(
            "Skipping %s: expected 2 values (total and >%d), found %d\n"
            % (cellname, p_arg_map["lengreater"], len(found)))
        continue
    try:
        d = [intlog(value) for value in found]
    except ValueError:
        # math.log rejects 0 (e.g. no reads above the threshold) and int()
        # rejects non-numeric text; neither can be placed on a log axis.
        sys.stderr.write(
            "Skipping %s: cannot take the log of %s\n"
            % (cellname, ", ".join(found)))
        continue
    d.append(cellname)
    data.append(d)

mpl.rc('xtick', labelsize=6)
mpl.rc('ytick', labelsize=6)

# Every colour/marker pairing exactly once, cycled so that any number of cell
# groups receives a style (no duplicates within the first 42 groups).
colors = "bgrcmyk"
markers = "ox+*sv"
styles = cycle([(color, marker) for marker in markers for color in colors])

# Only cells that contributed data are drawn, keeping handles and labels
# aligned in the legend.
cellset = sorted(set(row[2] for row in data))
cmap = dict(zip(cellset, styles))

h = []
for cellgroup in cellset:
    groupdata = [row for row in data if row[2] == cellgroup]
    (alld, dgreater, _cells) = zip(*groupdata)
    (color, marker) = cmap[cellgroup]
    h.append(plt.scatter(alld, dgreater, marker=marker, c=color))

if h:
    plt.legend(h, cellset, loc='upper left', fontsize=4, scatterpoints=1)

if p_arg_map["counts"]:
    plt.xlabel("Log (Total Number of Reads)")
    plt.ylabel("Log (Total Number of Reads > %d)" % p_arg_map["lengreater"])
else:
    plt.xlabel("Log (Total Cell Bases)")
    plt.ylabel("Log (Bases > %d )" % p_arg_map["lengreater"])
plt.suptitle(title)

# Open the PDF only once the figure is ready, and always close it, so a
# failure while plotting cannot leave an unfinished output file behind.
pp = PdfPages(p_arg_map["out"])
try:
    plt.savefig(pp, format="pdf")
finally:
    pp.close()