/
qcbplotting.py
481 lines (341 loc) · 15.9 KB
/
qcbplotting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
"""
Winnie Leung
QCB Analysis Codes
Functions include:
Get count tables for each condition
PCA, IsoMap, LDA plots with Correlation coefficients table
Table of stats with mean and std of each group
Scatter plots with t-test and p-values
2D Scatter plots with 95% CI
3D Scatter plots
"""
from pathlib import Path
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
import scipy.odr
import seaborn as sns
import sklearn.preprocessing as preprocessing
from scipy import stats
def get_condition_counts(df, *features):
"""
:param df:
:param features:
:return:
"""
"""
Function groups df by list of input features and returns a dataframe
with the count of each grouped subset.
Example:
>>> get_condition_counts(df, 'structure_name', 'drug_label')
This gives number of cells for each drug_label per structure
Parameters
:param df: pd.DataFrame
:param features: str
:return: pd.dataframe with counts, grouped by the list of inputs
"""
conditions = df.groupby(list(features)).size().reset_index(name='Count')
return conditions
def get_corr_coeff(struc_subset, T): # T is 1-D list of one of PC
"""
Function calculates correlation coefficient between every column of struc_subset and a column vector of T.
:param struc_subset: pd.dataframe
:param T: 1-D list of one of the components from PCA or LDA
:return: list of correlation coefficients
"""
corr_coeff = []
col_list = list(struc_subset)
for col in col_list:
corr = np.corrcoef(struc_subset[col], T)[1, 0] # just corrcoeff returns a 2x2 matrix (diagonal 1's)
corr_coeff.append(corr)
return corr_coeff
def scale_features_df(df):
"""
Function for pre-processing dataframe before PCA. It uses sklearn's preprocessing standard scaler to remove
the mean and scale to unit variance for all the features (will have mean of 0 and standard deviation of 1).
Returns scaled dataset.
:param df: pd.dataframe
:return: scaled pd.dataframe
"""
scaled = preprocessing.StandardScaler().fit_transform(df)
scaled = pd.DataFrame(scaled, columns=df.columns)
return scaled
def plot_T(T, color_selection, structure, mapping, graphtype=None, addtotitle=None, drug_lab=None, savegraphs=False,
savedir=Path('')):
"""
Function will take plot histogram, 2D, or 3D graph of dimensionality reduction results from PCA or LDA.
Graph type will depend on dimensionality of transformed T (dimensionality is number of classes - 1).
For example, histogram will be plotted if T is reduced to 1-D from having 2 classes (e.g. 2 drug groups).
Function will tag strings to the graph title to distinguish between graphs if they're supplied.
:param T: transformed array from sklearn's lda or pca fit_transform function. Each row represents 1 cell. Columns
are that cell's value in component 1, 2, 3, etc.
:param color_selection: list of strings denoting plotting color with length equals to number of drug_labels in that
structure group
:param structure: string variable of structure being analyzed. Must match format as appear in dataframe.
:param mapping: List of drug_label strings
:param graphtype: string to tag onto graph title. E.g. 'PCA', 'LDA', etc
:param addtotitle: string to tag onto graph title telling what is being plotted. e.g. "DNA Features",
"Cell Features", "Structure Features"
:param drug_lab: series of drug_label names.
:param savegraphs: boolean. Default False will not save graphs. True will save out graphs to png.
:param savedir: Path object of directory to save out graphs to
:return: pd.dataframe same length as the number of cells in the structure group. Each row of T_lab would have
the drug group that the cell belongs too, as well as its values in C1, 2, 3, as applicable
"""
dimensionality = T.shape[1]
column_data = {f'C{i + 1}': T[:, i] for i in range(dimensionality)}
T_lab = pd.DataFrame({'drug_label': drug_lab,
**column_data}) # ** for expands dictionary
fig = plt.figure()
projection = '3d' if dimensionality == 3 else None
ax = fig.add_subplot(111, projection=projection)
if addtotitle is None:
title = f'{graphtype}__{structure}'
else:
title = f'{graphtype} of {structure}__{addtotitle}'
ax.set_title(title)
ax.set_xlabel('Component 1')
if dimensionality > 1:
ax.set_ylabel('Component 2')
if dimensionality > 2:
ax.set_zlabel('Component 3')
for drug in drug_lab.unique():
xyz = T_lab.groupby('drug_label').get_group(drug)
index = mapping.index(drug)
color = color_selection[index]
if dimensionality == 1:
ax.hist(*[xyz[c].values for c in xyz.columns if c.startswith('C')],
label=drug, alpha=0.5)
continue
else:
ax.scatter(*[xyz[c].values for c in xyz.columns if c.startswith('C')],
c=color, label=drug, alpha=0.75)
# Plot legend
plt.legend()
if savegraphs:
fig.set_size_inches(10, 6)
fig.savefig(savedir / f'{title}.png')
else:
# Plot
plt.show()
return T_lab
def get_corr_table(struc_subset, T_lab, sort_by='Abs(C1)'):
"""
Function takes in struc_subset dataframe and reference T_lab dataframe to compute correlation coefficients
between each transformed component to each feature column in the original, untransformed data.
There is one correlation coefficient per feature column to each transformed component.
:param struc_subset: pd.dataframe
:param T_lab: pd.dataframe of transformed, dimension-reduced T labeled with drug group for each cell
:param sort_by: string of column title to sort correlation table descending by
:return: pd.dataframe of correlation table for each feature to each component.
Sorted descending by column specified.
"""
dimensionality = T_lab.shape[1] - 1 # subtracted one because first column is added drug_label
c_table = {}
c_table.update(
{'C1_corr': get_corr_coeff(struc_subset, T_lab['C1'])}) # same correlation using original scaled or unscaled
if dimensionality > 1:
c_table.update({'C2_corr': get_corr_coeff(struc_subset, T_lab['C2'])})
if dimensionality > 2:
c_table.update({'C3_corr': get_corr_coeff(struc_subset, T_lab['C3'])})
corr_data_abs = {f'Abs(C{i + 1})': [abs(x) for x in c_table[f'C{i + 1}_corr']] for i in range(dimensionality)}
corr_data = {f'C{i + 1}': c_table[f'C{i + 1}_corr'] for i in range(dimensionality)}
corr_table = pd.DataFrame({'Feature': list(struc_subset),
**corr_data_abs,
**corr_data})
corr_table_sorted = corr_table.sort_values(by=[sort_by],
ascending=False)
return corr_table_sorted
def get_plot_order(df, control, by='drug_label'):
"""
Function will take df dataframe, control group string name, and group_by parameter and return a list of
strings denoting plot order with control being first element
:param df: pd.dataframe
:param control: string name of control group
:param by: string denoting feature name to group by
:return: list of strings denoting plot order with control being first element
"""
plot_order = [control]
for drug in list(df[by].unique()):
if drug not in plot_order:
plot_order.append(drug)
return plot_order
def plot_scatter_ci(df, structure, features, control='Vehicle', groupby='drug_label',
addtotitle=None, plotallstruc=False, plot_order="",
savegraphs=False, savedir=Path(''), y_lim=None):
"""
Plot scatter plots with seaborn module for each category with 95% confidence interval.
:param df: pd.dataframe
:param structure: string name of structure currently being analyzed
:param features: list of features to plot each graph in
:param control:string name of control group, if applicable
:param groupby:string name as parameter/category to group df by
:param addtotitle: string tag to the title
:param plotallstruc: boolean. Default False for looking at features within one structure group.
True would give option to plot all structures on the same graph
:param plot_order: list of strings denoting plot_order of graphs
:param savegraphs: boolean. Default False will not save graphs. True will save out graphs to png.
:param savedir: Path object of directory to save out graphs to
:param y_lim: tuple. parameter with ymin and ymax for normalizing scale on all graphs
:return: None
"""
if plot_order == "":
plot_order = get_plot_order(df, control, by=groupby)
# Set up the matplotlib figure
sns.set(style="darkgrid")
for feature in features:
fig = plt.figure()
ax = fig.add_subplot(111)
# To make sure all plots are on same scale, hard-code y-limit
if y_lim is not None:
plt.ylim(*y_lim)
if addtotitle is None:
if plotallstruc:
title = f'{feature}'
else:
title = f'{structure}__{feature}'
else:
if plotallstruc:
title = f'{feature}__{addtotitle}'
else:
title = f'{structure}__{feature}__{addtotitle}'
ax.set_title(title)
ax = sns.pointplot(x=groupby, y=feature, data=df, ci=95, size=3,
color='k', join=False, order=plot_order,
estimator=np.mean)
# Make sure pointplot on top
plt.setp(ax.lines, zorder=100)
plt.setp(ax.collections, zorder=100, label="")
sns.stripplot(x=groupby, y=feature, data=df, jitter=True,
palette="Pastel1", edgecolor='k', size=5,
order=plot_order)
if savegraphs:
fig.set_size_inches(10, 6)
fig.savefig(savedir / f'{title}.png')
plt.show()
def plot_scatter_2d(df, structure, mapping, color_selection, plot_foi, *doi,
x_lab='dna_volume', odrreg=False, addtotitle=None,
savegraphs=False, savedir=Path(''), y_lim=None):
"""
Function will plot 2D scatter plots with y-axis being each of the features provided and x-axis being dna_volume by
default or user's input. If skipped plotting any drug group, will print message with error to console.
:param df: pd.dataframe
:param structure: string name of structure currently being analyzed
:param mapping: List of drug_label strings
:param color_selection: list of string denoting plotting color with length equals to number of drug_labels
in that structure group
:param plot_foi: list of feature of interest to plot. Generates one plot per foi
:param doi: string extension of drug of interest to plot
:param x_lab: string name to go on x-axis. Default "dna_volume"
:param odrreg: boolean. Default False. True would plot an odrreg line with R^2 value and slope
:param addtotitle: optional. string to tag to graph title
:param savegraphs: boolean. Default False will not save graphs. True will save out graphs to png.
:param savedir: Path object of directory to save out graphs to
:param y_lim: tuple. parameter with ymin and ymax for normalizing scale on all graphs
:return: None
"""
if isinstance(doi[0], list):
doi = doi[0]
for foi in plot_foi:
# To make sure all plots are on same scale, hard-code y-limit
if y_lim is not None:
plt.ylim(*y_lim)
for drug in doi:
index = mapping.index(drug)
fig = plt.figure()
ax = fig.add_subplot(111)
y_lab = foi
if addtotitle is None:
title = f'{structure}__{foi}__{drug}'
else:
title = f'{structure}__{foi}__{addtotitle}__{drug}'
ax.set_title(title)
ax.set_xlabel(f'{x_lab}')
ax.set_ylabel(f'{y_lab}')
try:
drug_group = df.groupby('drug_label').get_group(drug)
color = color_selection[index]
ax.scatter(drug_group[x_lab], drug_group[y_lab],
c=color, label=drug)
x = drug_group[x_lab]
y = drug_group[y_lab]
x_norm = x / np.mean(x)
y_norm = y / np.mean(y)
slope, intercept, r_value, p_value, std_err = stats.linregress(x_norm, y_norm)
r_sq = r_value ** 2
if odrreg:
# Model for fitting
linear_model = scipy.odr.Model(linear_f)
# Real Data Object
data = scipy.odr.Data(x, y)
# Set up ODR with model and data
odr = scipy.odr.ODR(data, linear_model, beta0=[0, 1])
odr.set_job(fit_type=0)
out = odr.run()
# Generate fitted data
y_fit = linear_f(out.beta, x)
# y_fit = linear_f(out.beta, x_norm)
# ax.plot(x, y_fit*np.mean(y), c='k', label='ODR')
odrslope = out.beta[0]
ax.plot(x, y_fit, c='k', label=f'ODR. $R^2$: {r_sq:.3f}, slope={odrslope:.3f}')
plt.legend()
except Exception as err:
print(f'Skipped plotting {drug}: ({err})')
pass
if savegraphs:
fig.set_size_inches(10, 6)
fig.savefig(savedir / f'{title}.png')
def linear_f(p, x):
"""
Function takes p and x to get slope, constant, and array of x-values to generate a linear function
:param p: slope and constant
:param x: series of x values
:return: series of y values linearly mapped to input x values
"""
m, c = p
print(type(p))
return m * x + c
def plot_scatter_3d(df, structure, mapping, color_selection, plot_foi,
x_lab='dna_volume', y_lab='cell_volume', savegraphs=False,
savedir=Path('')):
"""
Function takes in dataframe df and list of features in plot_foi to plot 3D plots against dna_volume and cell_volume
with the specified structure group and color_selection mapped to each drug_label from mapping list
:param df: pd.dataframe
:param structure: string name of structure currently being analyzed
:param mapping: List of drug_label strings
:param color_selection: list of string denoting plotting color with length equals to number of drug_labels in that
structure group
:param plot_foi: list of features to plot
:param x_lab: string name of feature to plot on x-axis. Default value is 'dna_volume'
:param y_lab: string name of feature to plot on y-axis. Default value is 'cell_volume'
:param savegraphs: boolean. Default False will not save graphs. True will save out graphs to png.
:param savedir: Path object of directory to save out graphs to
:return: None
"""
for foi in plot_foi:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
z_lab = foi
title = f'{structure}__{foi}'
ax.set_title(title)
ax.set_xlabel(f'{x_lab}')
ax.set_ylabel(f'{y_lab}')
ax.set_zlabel(f'{z_lab}')
for drug in mapping:
index = mapping.index(drug)
try:
drug_group = df.groupby('drug_label').get_group(drug)
color = color_selection[index]
ax.scatter(drug_group[x_lab], drug_group[y_lab],
drug_group[z_lab], c=color, label=drug)
except Exception as err:
print(f'Skipped plotting {drug}: ({err})')
pass
if savegraphs:
fig.set_size_inches(10, 6)
fig.savefig(savedir / f'{title}.png')
# Plot
plt.legend()
plt.show()