#!/broad/software/free/Linux/redhat_5_x86_64/pkgs/python_2.7.1-sqlite3-rtrees/bin/python2.7
#doesn't work with:
#!/home/unix/cgdeboer/bin/python3
#import sys; sys.argv= "decomposeGBTracks.py -a ICA -i test_samples.txt -o test_ICA -c /home/unix/cgdeboer/genomes/sc/20110203_R64/chrom.sizes -x 2 -v -v -v".split();
import argparse
parser = argparse.ArgumentParser(description='This program takes GB tracks as input and decomposes them using either PCA, ICA, or NMF, outputting the components.')
parser.add_argument('-i',dest='inFP', metavar='<inFile>',help='Input file containing a list of files, one per line with the following format, tab separated: \n<ID>\t<filePath>\t<gaussianSmoothingSD>\t<defaultValue>\t<doLog?>\n where <doLog> is the number you want added to the data before log transform, or a negative number otherwise', required=True);
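# An illustrative -i input list (tab-separated; the IDs and file paths below are hypothetical,
# not files shipped with this script):
#	H3K4me3	/path/to/H3K4me3.bw	25	0	1
#	MNase	/path/to/MNase.bw	50	0	-1
# Here, H3K4me3 would be smoothed with a 25 bp SD Gaussian, missing values filled with 0, and
# values log10(x+1) transformed; MNase would be smoothed with a 50 bp SD and left unlogged
# (negative <doLog>).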
parser.add_argument('-c',dest='chrsFile', metavar='<chrom.sizes>',help='A chrom.sizes file containing the sizes for all the chromosomes you want output', required=True);
parser.add_argument('-o',dest='outFPre', metavar='<outFilePre>',help='Prefix for output files.\n Multiple files will be created, including a bigWig track per component, component weight tables, and diagnostic plots', required=False);
parser.add_argument('-l',dest='logFP', metavar='<logFile>',help='Where to output errors/warnings [default=stderr]', required=False);
parser.add_argument('-s',dest='sample', metavar='<sampleFrac>',help='The fraction of data points to use [default=0.01]', required=False, default = 0.01);
parser.add_argument('-a',dest='approach', metavar='<approach>',help='Approach to use; one of {PCA,NMF,ICA} [default=PCA]', required=False, default = 'PCA');
parser.add_argument('-t',dest='iterations', metavar='<numIterations>',help='The maximum number of iterations for NMF/ICA [default=1000]', required=False, default = 1000);
parser.add_argument('-r',dest='scaleTo', metavar='<scaleRangeTo>',help='How to scale the range of each track (none|SD|mean|max|median) [default=SD]', required=False, default="SD");
parser.add_argument('-x',dest='components', metavar='<numComponents>',help='The number of components to output [for PCA, defaults to all significant; required for NMF and ICA]', required=False,default=-1);
parser.add_argument('-k',dest='chunks', metavar='<chunkSize>',help='The size of the chunks (in bp) to read/write when outputting [default=500000]', default=500000, required=False);
parser.add_argument('-d',dest='dim', metavar='<graphDim>',help='Inches per graph [default=3]', required=False, default = 3);
parser.add_argument('-e',dest='eliminateMissing', action='count',help='eliminate base pairs with missing values; often, these represent 0s, so this is not always appropriate', required=False, default=0);
parser.add_argument('-v',dest='verbose', action='count',help='Verbose output?', required=False, default=0);
args = parser.parse_args();
import os
import itertools
import warnings
import subprocess
import MYUTILS
import numpy as np
import scipy as sp
from scipy.ndimage.filters import gaussian_filter;
#from scipy import linalg
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn import decomposition
import sys
from bx.intervals.io import GenomicIntervalReader
from bx.bbi.bigwig_file import BigWigFile
args.sample = float(args.sample);
args.dim = float(args.dim);
args.iterations = int(args.iterations);
args.components = int(args.components);
args.chunks = int(args.chunks);
args.scaleTo = args.scaleTo.upper();
if args.approach!="PCA" and args.components==-1:
raise Exception("components has no default for NMF/ICA - must specify with -x");
if (args.logFP is not None):
logFile=MYUTILS.smartGZOpen(args.logFP,'w');
sys.stderr=logFile;
IDs =[];
files = [];
smoothings = [];
defaultVal = [];
doLog = [];
inFile=MYUTILS.smartGZOpen(args.inFP,'r');
for line in inFile:
if line is None or line == "" or line.rstrip()=="" or line[0]=="#": continue
data=line.rstrip().split("\t");
if len(data)!=5: raise Exception("Incorrect number of fields in input: %s\n"%(line));
IDs.append(data[0]);
files.append(data[1]);
smoothings.append(float(data[2]));
defaultVal.append(float(data[3]));
doLog.append(float(data[4]));
inFile.close();
#get loci of interest and their sizes
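# The chrom.sizes file is the standard two-column UCSC format, <chromName><TAB><sizeInBp>,
# one chromosome per line, e.g. (illustrative yeast values):
#	chrI	230218
#	chrII	813184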
chromSizesFile = MYUTILS.smartGZOpen(args.chrsFile,'r');
chromSizes = {};
chrOrder = [];
for line in chromSizesFile:
if line is None or line == "" or line[0]=="#": continue
data=line.rstrip().split("\t");
chromSizes[data[0]]=int(data[1]);
chrOrder.append(data[0]);
#determine what positions will be used in the data
useThese = {};
totalLength = 0
for chr in chrOrder:
#sample positions
useThese[chr] = np.random.random_sample((chromSizes[chr]))<args.sample;
totalLength = totalLength + np.sum(useThese[chr]);
#make a matrix of the data
allData = np.empty([totalLength,len(IDs)]);
if args.eliminateMissing>0:
	keepThese = np.ones([totalLength]).astype(bool); # only those for which data was observed in all tracks
for i in range(0,len(IDs)):
#input GB tracks
curBW = BigWigFile(open(files[i]))
curTot = 0;
if args.verbose>1: sys.stderr.write("Inputting data for %s.\n"%(IDs[i]));
	for chr in list(chrOrder): # iterate over a copy, since chromosomes missing from a track are removed from chrOrder below
if args.verbose>1: sys.stderr.write(" Inputting data for %s.\n"%(chr));
if args.verbose>2: sys.stderr.write(" Getting data from BW.\n");
values = curBW.get_as_array( chr, 0, chromSizes[chr] )
if values is None:
sys.stderr.write("%s is missing %s... skipping it for all\n"%(IDs[i],chr));
chrOrder.remove(chr)
allData = np.delete(allData, [range(curTot, (curTot+np.sum(useThese[chr])))],0);
if args.eliminateMissing>0:
keepThese = np.delete(keepThese, [range(curTot, (curTot+np.sum(useThese[chr])))],0);
totalLength = totalLength - np.sum(useThese[chr]);
del useThese[chr]
del chromSizes[chr]
continue
if args.verbose>2: sys.stderr.write(" Checking for missing data.\n");
if args.eliminateMissing>0:
#keepThese[curTot:(curTot+sum(useThese[chr]))] = np.logical_and(keepThese[curTot:(curTot+sum(useThese[chr]))], np.logical_not(np.isnan( values ))[useThese[chr]]);
keepThese[np.add(curTot,np.nonzero(np.isnan( values[useThese[chr]])))] = False;
#print(np.add(curTot,np.nonzero(np.isnan( values[useThese[chr]]))))
#fill in blanks; these are only used for smoothing anyways as keepThese will filter them out.
if args.verbose>2: sys.stderr.write(" Filling in missing data.\n");
invalid = np.isnan( values )
values[ invalid ] = defaultVal[i];
#log transform
if doLog[i]>=0:
if args.verbose>2: sys.stderr.write(" Log transforming data.\n");
values =np.log10(values + doLog[i])
#smooth data
if smoothings[i]!=0:
#reflect = at edge of array, the data will be mirrored
#truncate = don't incorporate data more than X away - probably a great speed increase since the distrib goes out to infinity
if args.verbose>2: sys.stderr.write(" Smoothing data.\n");
values = gaussian_filter(values, smoothings[i], mode='reflect', truncate=4.0)
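			# Note: with truncate=4.0, each base's smoothed value only draws on data within roughly
			# +/- 4*<gaussianSmoothingSD> bp (e.g. an SD of 50 bp uses a ~400 bp window).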
#sample data
if args.verbose>2: sys.stderr.write(" Sampling data.\n");
values = values [useThese[chr]]
#append data;
if args.verbose>2: sys.stderr.write(" Appending data.\n");
allData[curTot:(curTot+len(values)),i] = values;
curTot = curTot + len(values);
if args.eliminateMissing>0:
	if args.verbose>0: sys.stderr.write("Sampled %i positions; keeping the %i that had data in all tracks.\n"%(totalLength,np.sum(keepThese)));
	totalLength=np.sum(keepThese);
	allData = allData[keepThese,:];
elif args.verbose>0:
	sys.stderr.write("Sampled %i positions.\n"%(totalLength));
dataMedians = np.median(allData,axis=0);
sys.stderr.write("Decomposing data based on %i sampled values\n"%(allData.shape[0]))
#scale allData such that the tracks are on a similar scale
if args.scaleTo=="MEAN":
scaleTo = np.mean(allData,axis=0);
elif args.scaleTo=="MAX":
scaleTo = np.max(allData,axis=0);
elif args.scaleTo=="MEDIAN":
scaleTo = np.median(allData,axis=0);
elif args.scaleTo=="NONE":
scaleTo = [1]*len(IDs);
elif args.scaleTo=="SD":
scaleTo = np.std(allData,axis=0);
else:
raise Exception("Unrecognized scale: %s\n"%(args.scaleTo));
if args.approach=="ICA":
np.divide(allData, np.array([scaleTo,]*allData.shape[0]), allData);# -> allData
myDecomp = decomposition.FastICA(n_components=args.components, whiten=True, max_iter=args.iterations);
myDecomp.fit(allData);
W = myDecomp.mixing_;
W = np.transpose(W);
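	# mixing_ has shape (tracks x components); transposing gives (components x tracks) so that,
	# as for NMF/PCA below, W[j,i] is the weight of component j in track i.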
compFile = open("%s_ica_mix.txt"%(args.outFPre),"w");
compFile.write("track\tcomponent_"+"\tcomponent_".join(map(str, range(0,args.components)))+"\n");
for i in range(0,len(IDs)):
compFile.write(IDs[i]+"\t"+"\t".join(map(str,W[:,i]))+"\n");
compFile.close()
elif args.approach=="NMF":
np.divide(allData, np.array([scaleTo,]*allData.shape[0]), allData);# -> allData
#TODO: check for negative values and die if they are present, telling user which data have negative values
#n_components = number of components to keep
#init= how to initialize
	#sparseness {data|components|None}: where to enforce sparsity
	#beta: degree of sparseness (larger is more sparse, default=1)
#eta: degree of correctness to maintain; smaller=more error; default=0.1
#tol: tolerance in stopping
#max_iter: number of iterations
#random_state: seed
myDecomp = decomposition.NMF(sparseness='components', n_components=args.components, max_iter=args.iterations);
myDecomp.fit(allData);
#output stats to do with the fit
sys.stderr.write("Done NMF: reconstruction error = %g\n"%(myDecomp.reconstruction_err_)); #Frobenius norm of the matrix differences between the training data and the reconstructed data; something like a sqrt(sum of squared error)
# not really sure what the reconstruction error means in real terms since it depends on the amplitudes of the tracks
	W = myDecomp.components_ #non-negative components of the data; these are the per-track weights of each component
	# components_ is a (components x tracks) matrix: in the factorization V =~ W*H (V being the sampled input GB tracks), it plays the role of H, though it is stored in the local variable W here
	# Either way, its entries indicate how much each component contributes to each track.
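	# Illustrative sanity check (not performed here): np.dot(myDecomp.transform(allData), W)
	# should approximately reconstruct the scaled allData matrix.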
compFile = open("%s_component_weights.txt"%(args.outFPre),"w");
compFile.write("track\tcomponent_"+"\tcomponent_".join(map(str, range(0,args.components)))+"\n");
for i in range(0,len(IDs)):
compFile.write(IDs[i]+"\t"+"\t".join(map(str,W[:,i]))+"\n");
compFile.close()
else: #PCA
#perform PCA to find axis of maximum variation
#center and scale data
dataMeans = np.mean(allData,axis=0);
allData = np.divide(np.subtract(allData,[dataMeans,]*totalLength), [scaleTo,]*totalLength);
myDecomp = decomposition.PCA()
myDecomp.fit(allData);
#myDecomp.components_
if args.components<=0:
args.components = myDecomp.n_components_;
W = myDecomp.components_[0:args.components,:];
sys.stderr.write("Done PCA: num components = %i. first PC explains %f%% of the variance \n"%(myDecomp.n_components_, myDecomp.explained_variance_ratio_[0]*100))
#PVE plot
plt.figure(figsize=(8,8),dpi=300);
plt.bar(range(1,len(myDecomp.explained_variance_ratio_)+1),myDecomp.explained_variance_ratio_, 0.5, color='b',label=None);
plt.xlabel('Component');
plt.ylabel('PVE');
plt.savefig('%s_PCA-PVE.png'%(args.outFPre))
plt.savefig('%s_PCA-PVE.pdf'%(args.outFPre))
transformedData = myDecomp.transform(allData);
componentNames = np.char.mod('comp_%d',range(1,transformedData.shape[1]+1));
colour = "black";
gridSizeC = [np.min((32767,args.components*args.dim)), np.min((32767,args.components*args.dim))];
plt.figure(figsize=gridSizeC,dpi=300);
for x in range(0,args.components):
for y in range(0,args.components): # each pair of inputs
splot = plt.subplot(args.components,args.components,x*args.components+y+1)
xUp = args.components-x-1;
plt.scatter(W[y,:], W[xUp,:],color = colour, alpha=1)
for i, lab in enumerate(IDs):
plt.gca().annotate(lab,(W[y,i],W[xUp,i]));
if y==0:
plt.ylabel(componentNames[xUp]);
if x==(args.components-1):
plt.xlabel(componentNames[y]);
plt.savefig('%s_track_weight_scatters.png'%(args.outFPre))
plt.savefig('%s_track_weight_scatters.pdf'%(args.outFPre))
colour = "black";
plt.figure(figsize=gridSizeC,dpi=300);
for x in range(0,args.components):
for y in range(0,args.components): # each pair of inputs
xUp = args.components-x-1;
splot = plt.subplot(args.components,args.components,x*args.components+y+1)
if xUp==y:
plt.hist(transformedData[:,y],100,facecolor = colour);
else:
plt.scatter(transformedData[:,y], transformedData[:,xUp],color = colour, alpha=0.1);
if y==0:
plt.ylabel(componentNames[xUp]);
if x==(args.components-1):
plt.xlabel(componentNames[y]);
plt.savefig('%s_all_comp_scatter.png'%(args.outFPre))
#plot first two components
plt.figure(figsize=(8,8),dpi=300);
plt.scatter(transformedData[:,0],transformedData[:,1],color = "b", alpha=0.1)
plt.xlabel(componentNames[0]);
plt.ylabel(componentNames[1]);
plt.savefig('%s_top2Components.png'%(args.outFPre))
corMat = np.empty([allData.shape[1],transformedData.shape[1]]);
for i in range(0,allData.shape[1]):
for j in range(0,transformedData.shape[1]):
corMat[i,j] = np.corrcoef(allData[:,i],transformedData[:,j])[1,0];
plt.figure(figsize=(10,8),dpi=300);
plt.pcolor(corMat, cmap=plt.cm.RdBu, vmin=-np.max(np.abs(corMat)), vmax=np.max(np.abs(corMat)));
plt.axis([0, transformedData.shape[1], 0, allData.shape[1]]) #x x, y y
plt.colorbar()
ax = plt.gca()
ax.set_xticks(np.arange(corMat.shape[1])+0.5, minor=False)
ax.set_yticks(np.arange(corMat.shape[0])+0.5, minor=False)
#ax.invert_yaxis() # because labels
#ax.xaxis.tick_top()
ax.set_yticklabels(IDs, minor=False)
ax.set_xticklabels(componentNames, minor=False)
plt.xticks(rotation=90)
#plt.gca().tight_layout()
plt.gcf().subplots_adjust(left=0.3,bottom=0.15)
plt.savefig('%s_track_corrs.png'%(args.outFPre))
plt.savefig('%s_track_corrs.pdf'%(args.outFPre))
allData=None;
transformedData=None;
allInBWs = [];
for i in range(0,len(IDs)):
#input GB tracks
curBW = BigWigFile(open(files[i]))
allInBWs.append(curBW);
sys.stderr.write("Making output streams\n");
outStreams = [];
for i in range(0, args.components):
if args.verbose>1: sys.stderr.write("Making wig output file: %s_comp%i.wig.gz.\n"%(args.outFPre,i+1));
outStreams.append( MYUTILS.smartGZOpen("%s_comp%i.wig.gz"%(args.outFPre,i+1),"w") );
for i in range(0, args.components):
outStreams[i].write("track type=wiggle_0\n")
for chr in chrOrder:
sys.stderr.write("Outputting components for %s:\n"%(chr));
last = 0;
final = chromSizes[chr];
while last!=final: # this breaks it up into chunks so that I'm not piping entire (human) chromosomes at once
curLast = np.min([last+args.chunks,final]);
chrData = np.empty([curLast-last,len(IDs)]);
if args.verbose>0: sys.stderr.write(" %s: Section %i - %i:\n"%(chr, last,curLast));
for i in range(0,len(IDs)):
if args.verbose>1: sys.stderr.write(" Input %i: %s.\n"%(i,IDs[i]));
values = allInBWs[i].get_as_array( chr, last, curLast )
#fill in blanks
invalid = np.isnan( values )
values[ invalid ] = dataMedians[i]; # at this stage, replace missing values with median
#log transform
if doLog[i]>=0:
values =np.log10(values + doLog[i])
#smooth data
if smoothings[i]!=0:
#reflect = at edge of array, the data will be mirrored
#truncate = don't incorporate data more than X away - probably a great speed increase since the distrib goes out to infinity
values = gaussian_filter(values, smoothings[i], mode='reflect', truncate=4.0)
#add data;
chrData[:,i] = np.divide(values,scaleTo[i]); # put on same scale as before
transed = myDecomp.transform(chrData);
for i in range(0,args.components):
if args.verbose>1: sys.stderr.write(" Printing component %i.\n"%(i+1));
try:
if last==0:
outStreams[i].write("fixedStep chrom=%s start=1 step=1\n"%(chr))
outStreams[i].write("\n".join(map(str,transed[:,i])));
outStreams[i].write("\n");
outStreams[i].flush();
except IOError as e:
sys.stderr.write("IOError on %s, component %i\n"%(chr,i))
sys.stderr.write("If this is a broken pipe, try reducing -k <chunks> from %i or increasing the amount of available RAM\n"%(args.chunks))
raise(e);
last=curLast;
sys.stderr.write("Output all data.\n");
for i in range(0,args.components):
outStreams[i].close()
toBW = subprocess.Popen(["wigToBigWig","%s_comp%i.wig.gz"%(args.outFPre,i+1),args.chrsFile,"%s_comp%i.bw"%(args.outFPre,i+1)])
temp = toBW.communicate()
if temp[0] is not None:
sys.stderr.write("component %i : %s"%(i+1,temp[0]));
if temp[1] is not None:
sys.stderr.write("component %i : %s"%(i+1,temp[1]));
if temp[0] is None and temp[1] is None and os.path.isfile("%s_comp%i.bw"%(args.outFPre,i+1)): # if no errors, delete the original
os.remove("%s_comp%i.wig.gz"%(args.outFPre,i+1))
else:
sys.stderr.write("Left %s_comp%i.bw because of error "%(args.outFPre,i+1));
if args.verbose>0: sys.stderr.write("Successfully closed wigToBigWig processes.\n");
if (args.logFP is not None):
logFile.close();