def run_ref_pca(path='dflt', data='dflt', save=True, label=True):
    """ plot new reference samples on a 2 component PCA together with bulk reference data

    path: folder handed to assemble_ref_dat to build the new reference
        dataframe, 'dflt' resolves to the 'newQC' default output folder
    data: 'dflt' loads the pre-assembled bulk reference file, any other string
        is read as the path of a tab-delimited file in the same layout, and
        anything else is used directly as the bulk dataframe (samples in columns)
    save: when True the figure is written as 'pca.png' to the default output folder
    label: when True every new reference point is annotated with its column name
    """
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='newQC')
    refdf = assemble_ref_dat(path)

    if isinstance(data, str):
        if data == 'dflt':
            dat_file = '/Users/WRB/Dropbox/Areas of Focus/_Genometry/Analysis Projects/reference_ref_pca.csv'
        else:
            # a custom string previously left dat_file undefined
            dat_file = data
        bulk_data = pd.read_csv(dat_file, delimiter='\t', index_col=0)
    else:
        bulk_data = data

    # drop B01 columns, best effort: non-string column labels are left untouched
    try:
        newcols = [x for x in refdf.columns if 'B01' not in x]
        refdf = refdf[newcols]
    except Exception:
        pass
    # count AFTER filtering so the highlighted tail is exactly the new samples
    ns = len(refdf.columns)

    alldata = pd.concat([bulk_data, refdf], axis=1)
    pca = decomposition.PCA(n_components=2).fit_transform(alldata.T)
    # the new samples were concatenated last; explicit start index also
    # handles ns == 0 (a plain pca[-0:] would select every row)
    new_pts = pca[len(pca) - ns:]

    fig, ax = plt.subplots()
    ax.scatter(pca[:, 0], pca[:, 1], c='grey')
    ax.scatter(new_pts[:, 0], new_pts[:, 1], c='red')
    if label is True:
        for l, x, y in zip(refdf.columns.values, new_pts[:, 0], new_pts[:, 1]):
            plt.annotate(l, xy=(x, y), xytext=(-2, 2), textcoords='offset points',
                         ha='right', va='bottom')
    ax.set_title('REF RNA PCA')
    if save is True:
        outpath = gt.dflt_outpath(fldr_name=None, fn='pca.png')
        plt.savefig(outpath)
def make_plots_from_list(d, h, welllist, cats='nd', outpath='dflt', test=False):
    """ plot one skyline per entry of a well list

    each well id (can be 3 char) is expanded to its matching replicates using
    the provided cats, and a skyline of those replicate columns is saved under
    outpath with the title '<plate>-<name>-<dose>'
    test=True stops after the first image
    """
    plate_name = d.name
    if outpath == 'dflt':
        outpath = gt.dflt_outpath()
    for well in welllist:
        reps = gt.get_well_reps(h, well, cats, df=True)
        # a lookup result equal to the string 'empty' is skipped; the truth test
        # of a dataframe comparison raises ValueError, which means there are reps
        try:
            nothing_found = bool(reps == 'empty')
        except ValueError:
            nothing_found = False
        if nothing_found:
            continue
        first_rep = reps.iloc[0]
        condition = (first_rep['name'] + '-' + first_rep['dose']).replace('.', ',')
        title = plate_name + '-' + condition
        # only keep replicate ids that are actually present in the data
        present = [wid for wid in reps.index.values if wid in d.columns]
        new_skyline(d[present], title=title, outpath=os.path.join(outpath, title))
        if test is True:
            print('test mode, exiting after one image')
            break
def analyze_plate(d, h, cats):
    """ run the standard set of output figures for one dataset (assumed to have doses)

    d, h: data and header of the plate
    cats: category codes used to assemble the consensus; the dose specific
        step at the end only runs when 'd' is among them
    """
    # should worry about organizing output too

    # consensus (called with save=True, sc=True), then name + dose labels
    cons_data, cons_hdr = pa.assemble_consensus(d, h, cats, save=True, sc=True)
    cons_hdr = gt.gen_label(cons_hdr, 'nd')
    cons_data.name = d.name

    # tSNE simple first pass, two parameters
    dim_reduct.tsne2(cons_data, cons_hdr, px=10, lr=[10, 150], inter=True)

    # general dendrogram (only, no heatmap)
    dim_reduct.make_dendrogram(cons_data, labels=cons_hdr.label, outpath=True)

    # correlation matrix of the sorted combined zs by name and dose
    # can then follow up to plot the sweep or the clustered
    plottools.plot_correlation_matrix(cons_data, cons_hdr, title='dflt', sort=True,
                                      outpath=True, sparselabel=True, grid=True,
                                      labels=cons_hdr.name)

    if 'd' not in cats:
        return

    # landmark conc plotting is currently switched off, its inputs are still
    # prepared here (the plot_landmark_concs call itself is disabled)
    newcats = cats.replace('d', '')
    outpath = gt.dflt_outpath(fldr_name='landmark concs')

    # call combo function to find genes that move and plot those
    # dose-response plots (30 per plate)
    plottools.plot_ex_genes(d, h, n=10, mode='med')
def make_barview_range(edf, argdict, across='dose', label=False, outpath=False):
    """ with enrichment score results, plot barviews across the range of conditions

    edf: enrichment result dataframe including the header columns
    argdict: header selection (column: value) picking the samples to show,
        its values also form the figure title; it is not modified
    across: header column whose sorted unique values get one panel each,
        default dose
    label: when True each panel is labelled with its condition value
    outpath: False leaves the figure open, True saves into the default 'foo'
        folder, a string saves into that folder; the file is named
        '<title>_enrich.png' and the figure is closed after saving
    """
    cond_range = sorted(gt.hsub(edf, argdict)[across].unique())
    print(argdict.values())
    mytitle = ' '.join(argdict.values())
    # squeeze=False keeps an indexable axes array even for a single condition
    fig, axarr = plt.subplots(1, len(cond_range), sharey='row', squeeze=False)
    for my_ax, cond in zip(axarr[0], cond_range):
        # work on a copy so the caller's dictionary does not pick up the
        # condition of the last panel
        new_argdict = dict(argdict)
        if across is not None:
            new_argdict[across] = cond
        if label is True:
            make_barview(edf, new_argdict, ax=my_ax, label=cond)
        else:
            make_barview(edf, new_argdict, ax=my_ax)
    plt.suptitle(mytitle)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    if outpath is not False and outpath is not None:
        if outpath is True:
            outpath = gt.dflt_outpath(fldr_name='foo')
        myoutpath = os.path.join(outpath, mytitle + '_enrich.png')
        plt.savefig(myoutpath)
        plt.close()
def bulk_test_enrich(sig, df, h, outpath=False):
    """ score every column of a dataframe against a signature (tuple of up/down)

    returns a dataframe with one row per column holding the 'scaled',
    'absolute', 'up' and 'dn' scores, sorted by 'scaled' descending. when a
    header h is passed (not None) its rows are inner-merged in front of the
    scores. outpath=False skips saving, 'dflt' writes '<df.name>_enrichment.xlsx'
    to the default output location, any other value is used as the excel path
    """
    up, dn = sig[0], sig[1]
    scores = {col: test_enrichment(df[col], up, dn) for col in df.columns}
    edf = pd.DataFrame(scores).T

    # local scaling: positives relative to the largest score, negatives
    # relative to the most negative one, everything else becomes 0
    top = edf['absolute'].max()
    bottom = edf['absolute'].min()

    def _scaled(score):
        if score > 0:
            value = score / top
        elif score < 0:
            value = score / (-1 * bottom)
        else:
            value = 0
        # keep three decimals
        return float('{:.3f}'.format(value))

    edf['scaled'] = edf['absolute'].apply(_scaled)
    edf = edf[['scaled', 'absolute', 'up', 'dn']]

    # optionally merge results with sample header obj
    if h is not None:
        edf = pd.merge(h, edf, left_index=True, right_index=True, how='inner')
    edf = edf.sort_values('scaled', ascending=False)

    if outpath is not False:
        if outpath == 'dflt':
            outpath = gt.dflt_outpath(fn=df.name + '_enrichment.xlsx')
        edf.to_excel(outpath)
    return edf
def plot_gene(sample_set, h=None, name='dflt', outpath='dflt', close=True, width=8):
    """ basic plot gene function, one grey point per sample in column order

    if a header is provided color coding is applied on top:
    blue = veh, red = poscon
    with close=True the figure is saved as '<name>.png' in outpath and closed,
    otherwise the axis is returned
    NOTE(review): the helper 'order' column is written onto the caller's header
    and counts from 1, while the grey points are placed from 0 - confirm the
    one position offset of the colored points is intended
    """
    plot_name = sample_set.name if name == 'dflt' else name
    if outpath == 'dflt':
        outpath = gt.dflt_outpath(fldr_name='dflt')

    n_samples = len(list(sample_set))
    ax = format_concentration_plot(n_samples, ptype=check_plottype(sample_set), width=width)
    ax.scatter(range(n_samples), sample_set.values, color='grey')
    ax.set_title(plot_name)

    if h is not None:
        h['order'] = np.arange(1, len(h) + 1)
        # overlay the two control types at their header positions
        for sample_type, color in (('vehicle', 'blue'), ('poscon', 'red')):
            sub_vals, sub_hdr = gt.dsub(sample_set, h, {'type': sample_type})
            ax.scatter(sub_hdr.order, sub_vals.values, color=color)

    if close is not True:
        return ax
    plt.savefig(os.path.join(outpath, plot_name + '.png'))
    plt.close()
def check_data(path='dflt'):
    """ a better final map checker

    for every .gct file in path (default: the 'finaldata' output folder) find
    the first .txt/.xlsx map whose path contains the gct short name and check
    that every data well is listed in the map as vehicle, poscon or test.
    one line is printed per file; files without a map are reported and skipped
    """
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='finaldata')
    flist = gt.get_flist(path, ext='.gct')
    maplist = gt.get_flist(path, ext='.txt')
    maplist.extend(gt.get_flist(path, ext='.xlsx'))
    for f in flist:
        shn = gt.get_shn(f).split('.')[0]
        mapfile = next((x for x in maplist if shn in x), None)
        if mapfile is None:
            print(f'error with map file {shn}')
            # without this the check ran against an undefined map, or against
            # the map of the previous plate
            continue
        g = gct.Gct(f)
        g.get_headers()
        g.get_wells()
        datwells = g.wells
        mymap = gct.extractmap(mapfile)
        mapwells = gt.hsub(mymap, {'type': ['vehicle', 'poscon', 'test']})['well'].values
        # data wells that the map does not account for
        res = set(datwells) - set(mapwells)
        if len(res) == 0:
            print(f'{shn} ok, {380-len(datwells)} failed wells')
        else:
            print(f'eror with map/data {shn}, {len(datwells)}/{len(mapwells)}')
def gen_euclideans(df, labels='dflt', rot=None, tick_denom=1, test=False):
    """ save the euclidean matrix of df over a grid of display settings

    every combination of label font size (8, 5) and upper trim boundary
    (100, 75, 50, 30, 15 percent of the rounded maximum distance) is written
    to the 'matrices' output folder as
    '<name>_euclidean_ul<fraction>-fs<fontsize>.png'
    test=True ends the interpreter via sys.exit after the first image
    """
    outdir = gt.dflt_outpath(fldr_name='matrices')
    try:
        name = df.name
    except:
        # no name set, fall back to the prefix of the first column id
        name = df.columns[0].split(':')[0]

    # distances are computed once and handed to every plot call
    distances = get_euclidean(df)
    max_dist = round(distances.max())

    for font_size in (8, 5):
        for fraction in (1, .75, .5, .3, .15):
            upper_cap = int(round(max_dist * fraction))
            ax = plot_euclidean(df, labels=labels, upper=upper_cap, fontsize=font_size,
                                dat=distances, tick_denom=tick_denom, rot=rot)
            fraction_tag = str(fraction).replace(".", ",")
            outpath = os.path.join(
                outdir, name + f'_euclidean_ul{fraction_tag}-fs{font_size}.png')
            plt.savefig(outpath)
            plt.close()
            if test is True:
                sys.exit('test mode, quitting after one')
def check_final(path='dflt'):
    """ check numbers of row/columns, number of fails and decimal places of
    final data files

    every .gct file in path (default: the 'finaldata' output folder) is
    checked against its map, expected next to it as '<base>.txt' or, when only
    that exists, '<base>.xlsx'. one summary line is printed per file; a missing
    map is reported as 'no map!!' with a False result
    """
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='finaldata')
    f_list = gt.get_flist(path, ext='.gct')
    for file in f_list:
        g = gct.Gct(file)
        g.get_headers()
        base = g.file.split('.')[0]
        txt = base + '.txt'
        # building the name can never fail, so the xlsx fallback has to be
        # decided by looking at what is on disk
        if not os.path.exists(txt) and os.path.exists(base + '.xlsx'):
            txt = base + '.xlsx'
        # placeholder so the summary line still prints when the map is missing
        dplaces = 'n/a'
        try:
            failed = sub_check_failed(g, txt)
            print(failed)
            fails, fail_result = failed
            result = sub_check_lines(g) and sub_check_columns(g) and fail_result
            dplaces = sub_check_decimal(g)
        except FileNotFoundError:
            result = False
            fails = 'no map!!'
        print('{} - {} - {} failed wells - {} dplaces'.format(g.shortname, result, fails, dplaces))
def compare_plate_genes(flist, genelist, numrows=3, type=True, plate=False, title='dflt', outpath='dflt', remove=True):
    """ plot the listed genes for every gct file in flist and join the images

    flist: list of gct paths, or one string that is split into '.gct' paths
    genelist: genes (row ids) to plot for every file
    plate: False draws the per-sample plot with sample types, True the
        plate layout plot
    numrows: rows of the combined grid, False stacks the images instead
    title: optional prefix for the folder receiving the combined images
    outpath: folder for the single images, default the 'tmp_imgs' folder
    remove: when True the single image folder is deleted afterwards
    (type is accepted but not used here)
    -- should add in grid support ---
    """
    if isinstance(flist, str):
        flist = gt.splitpaths(flist, '.gct')
    if outpath == 'dflt':
        outpath = gt.dflt_outpath(fldr_name='tmp_imgs')

    # select the plotting routine once instead of per gene
    if plate is False:
        def draw(values, header, plot_name):
            plot_gene_wtypes(values, header, name=plot_name, outpath=outpath)
    elif plate is True:
        def draw(values, header, plot_name):
            make_plateplot(values, name=plot_name, outpath=outpath)
    else:
        draw = None

    for f in flist:
        d, h = gct.extractgct(f)
        if draw is None:
            continue
        for g in genelist:
            draw(d.loc[g], h, d.name + '-' + g)

    combined_folder = 'combined_imgs' if title == 'dflt' else title + ' combined_imgs'
    combined_outpath = gt.dflt_outpath(fldr_name=combined_folder)

    if numrows is False:
        imgs.bulk_stack(outpath, outpath=combined_outpath, delim='-', idx=1, pad=.05)
    else:
        imgs.bulk_stack_grid(outpath, outpath=combined_outpath, numrows=numrows,
                             delim='-', idx=1, pad=.05)
    if remove is True:
        shutil.rmtree(outpath)
def html_scatter(fig, ax, h, x_col, y_col, title, labels='dflt', outpath='dflt'):
    """ add hover tooltips to a scatter of two header columns and save it as html

    the points of h[x_col] vs h[y_col] are drawn nearly transparent
    (alpha 0.001) on the passed axis and only serve as tooltip anchors.
    labels default to h.label, outpath to the 'foo' default folder;
    the result is written to '<outpath>/<title>.html'
    """
    outpath = gt.check_dfltarg(outpath, gt.dflt_outpath('foo'))
    labels = gt.check_dfltarg(labels, h.label)
    x_values = h[x_col].tolist()
    y_values = h[y_col].tolist()
    anchors = ax.scatter(x_values, y_values, alpha=0.001)
    mpld3.plugins.connect(fig, mpld3.plugins.PointLabelTooltip(anchors, labels=labels))
    mpld3.save_html(fig, os.path.join(outpath, title + '.html'))
def distribute_qc(path='dflt'):
    """ copy the QC png images into one folder per image type

    path: folder to search, 'dflt' resolves to the 'newQC' default output folder
    the images are collected below the 'QCprocessing' default output folder in
    the subfolders calibs, flogps, escore, cellid-nolabel, cellid-label and
    euclidean, which are created when missing
    """
    # a custom path used to leave the search folder undefined
    inpath = gt.dflt_outpath(fldr_name='newQC') if path == 'dflt' else path
    outpath = gt.dflt_outpath(fldr_name='QCprocessing')
    # (search term, destination subfolder)
    targets = [
        ('finalqc/*calibplot', 'calibs'),
        ('finalqc/*FLOGP', 'flogps'),
        ('escore_summary*/', 'escore'),
        ('*cell_line/*cellid_nolabel/*-*cellid_circle', 'cellid-nolabel'),
        ('*cell_line/*-*cellid_circle', 'cellid-label'),
        ('-*euclidean', 'euclidean'),
    ]
    for term, folder in targets:
        fold = os.path.join(outpath, folder)
        os.makedirs(fold, exist_ok=True)
        # wildcard on both sides of the term, png files only
        srch = '*'.join(['', term, '']) + '.png'
        for file in gt.globit(inpath, srch):
            shutil.copy(file, fold)
def plot_kmeans_clusters(df, cat_dict):
    """ plot one sorted correlation matrix per entry of a label dictionary

    this is designed to be used with the dictionary output of 'kmeans_clusters'
    df: dataset, df.name is used for the file names
    cat_dict: category: labels, the labels are used to sort and sparsely label
        the samples; every matrix is saved as '<df.name>_<category>_clusters.png'
        at the default output location
    """
    outpath = gt.dflt_outpath(fn=df.name)
    for cat, labels in cat_dict.items():
        myoutpath = outpath + f'_{cat}_clusters.png'
        # the header is a required positional argument of the matrix plot;
        # None is enough here because explicit labels are passed
        plottools.plot_correlation_matrix(df, None, labels=labels, sort=True,
                                          outpath=myoutpath, sparselabel=True)
def run_methods(df, h, outpath='dflt', hdr='dflt', hue='name', mets='dflt', shape=None, labels='dflt', scaling=None):
    """ run a selection of alternate dimension reduction techniques

    df: data; when the first column id contains ':' the frame is transposed
        so that samples end up in rows
    h: header, receives '<method>_x' / '<method>_y' columns for every method run
    outpath: folder for the scatter plots, default the 'dim_reduct' folder
    hdr: title prefix, default df.name or the prefix of the first column id
    hue, shape, labels: passed to seaborn_scatter, labels default to h.label
    mets: methods to run out of 'PCA', 'ISO', 'MDS', 'LLE' (default all four)
    scaling: None, 'maxabs', 'robust' or 'std'; anything else only prints
        a message and continues unscaled
    """
    method_results = dict()
    n_neighbors = 15
    try:
        hdr = gt.check_dfltarg(hdr, df.name)
    except Exception:
        hdr = df.columns[0].split(':')[0]
    outpath = gt.check_dfltarg(outpath, gt.dflt_outpath(fldr_name='dim_reduct'))
    labels = gt.check_dfltarg(labels, h.label)
    mets = gt.check_dfltarg(mets, ['PCA', 'ISO', 'MDS', 'LLE'])

    if ':' in df.columns.values[0]:
        df = df.T

    if scaling is not None:
        if scaling == 'maxabs':
            df = MaxAbsScaler().fit_transform(df)
        elif scaling == 'robust':
            df = RobustScaler().fit_transform(df)
        elif scaling == 'std':
            df = StandardScaler(with_mean=False).fit_transform(df)
        else:
            print('error with scaling')

    if 'PCA' in mets:
        # projection on to the first 2 principal components
        method_results['PCA'] = decomposition.PCA(n_components=2).fit_transform(df)
    if 'ISO' in mets:
        # isomap projection; n_neighbors by keyword, current scikit-learn
        # releases no longer accept it positionally
        method_results['ISO'] = manifold.Isomap(
            n_neighbors=n_neighbors, n_components=2).fit_transform(df)
    if 'MDS' in mets:
        # MDS embedding
        method_results['MDS'] = manifold.MDS(
            n_components=2, n_init=1, max_iter=100).fit_transform(df)
    if 'LLE' in mets:
        # modified locally linear embedding, keyword for the same reason as isomap
        method_results['LLE'] = manifold.LocallyLinearEmbedding(
            n_neighbors=n_neighbors, n_components=2, method='modified').fit_transform(df)

    for met, dat in method_results.items():
        xcol, ycol = f'{met}_x', f'{met}_y'
        # coordinates are stored on the header so they can be plotted with it
        h[xcol] = dat[:, 0]
        h[ycol] = dat[:, 1]
        title = hdr + ' ' + met
        seaborn_scatter(h, title, outpath, hue=hue, x=xcol, y=ycol, labels=labels,
                        shape=shape, legend='brief')
def plot_lmconcs(flist, stack=False, test=False):
    """ plot lm concs with full output for all files in list, w/ optional joins

    flist: list of gct paths, or one string that is split into '.gct' paths
    stack: when True the images in the 'landmark concs' folder are joined
        vertically after all files are plotted
    test: handed through to plot_landmark_concs
    """
    gct_paths = gt.splitpaths(flist, '.gct') if isinstance(flist, str) else flist
    for gct_path in gct_paths:
        data, header = gct.extractgct(gct_path)
        outpath = gt.dflt_outpath(fldr_name='landmark concs')
        plottools.plot_landmark_concs(data, header, genes='all', labels='wells',
                                      outpath=outpath, test=test)
    if stack is True:
        imgs.bulk_stack(outpath, orient='vert', delim='_', idx=2)
def resize_qc(path='dflt'):
    """ run through folder contents and subdirs and conduct appropriate image resizing

    path: folder searched recursively for png files, 'dflt' resolves to the
        'QCprocessing' default output folder
    every image is handed, as both input and output path, to the resize
    routine matching the first of these markers found in its path: 'calibplot',
    '_es1', 'FLOGP', 'euclidean', 'cellid'; other images are left alone
    """
    # compare by value, identity of string literals is an implementation detail
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='QCprocessing')
    fl = glob.glob(path + '/**/*.png', recursive=True)
    for f in fl:
        if 'calibplot' in f:
            calplot(f, f)
        elif '_es1' in f:
            escoresum(f, f)
        elif 'FLOGP' in f:
            flogp(f, f)
        elif 'euclidean' in f:
            euclidean(f, f)
        elif 'cellid' in f:
            cellid(f, f)
def make_dendrogram(df, labels='dflt', orient='top', outpath=True, trunc=False, res=False):
    """ ward linkage dendrogram of the columns of df (scipy)

    labels: leaf labels, 'dflt' uses the column ids
    orient: dendrogram orientation
    trunc: anything but False folds up the last detailed splits
        (truncate_mode 'lastp' with contracted branches shown)
    outpath: True saves '<name>_dend.png' at the default output location,
        any other value except False is used as the save path; the figure is
        closed after saving and nothing is returned
    res: only used with outpath=False, where the axis is returned for
        res=False and the dendrogram result otherwise
    """
    linked = linkage(df.T, 'ward')
    fig, ax = plt.subplots()
    if isinstance(labels, str) and labels == 'dflt':
        labels = df.columns

    dend_args = dict(orientation=orient, labels=labels, distance_sort='descending',
                     show_leaf_counts=True, leaf_rotation=90)
    if trunc is not False:
        dend_args.update(truncate_mode='lastp', show_contracted=True)
    dend = dendrogram(linked, **dend_args)

    name = gt.dflt_name(df)
    fig.suptitle(f'{name} n={len(df.columns)} dendrogram', fontweight='bold')
    plt.tight_layout()

    if outpath is True:
        target = gt.dflt_outpath(fn=name + '_dend.png')
    elif outpath != False:
        target = outpath
    else:
        # nothing to save, hand back the open plot
        return ax if res is False else dend
    plt.savefig(target)
    plt.close()
def predict_cells(input, save=False):
    """ predict the cell line of vehicle samples with v1.0 of the SVM classifier

    input can be
      - a directory: all '*_Qctrl_n*' files in it, or '*QNORM*' files when
        there are none
      - a single file path, or a list of file paths
      - an already loaded (data, header) pair
      - a pandas Series, which is classified directly and the single
        prediction (None on error) is returned
    for files and pairs the vehicle wells are consolidated to one median
    profile per batch, every prediction is printed, and a dataframe indexed by
    '<name>-<batch>' with the column 'cell' is returned (None when a dataset
    has no vehicle rows). when save is True the dataframe is also written to
    'cell_predictions.csv' at the default output location
    """
    # NOTE(review): pickle.load runs arbitrary code from the file, this is
    # only acceptable while the classifier path is a trusted local file
    with open('/Users/WRB/Dropbox/bin/python/celllineclassifier.p', 'rb') as file:
        clf = pickle.load(file)

    if isinstance(input, pd.Series):
        try:
            return clf.predict([input])[0]
        except Exception:
            print('error with series prediction')
            return None

    if isinstance(input, str):
        if os.path.isdir(input):
            vlist = gt.globit(input, '*_Qctrl_n*')
            if len(vlist) == 0:
                vlist = gt.globit(input, '*QNORM*')
        else:
            vlist = [input]
    else:
        vlist = input

    # a loaded (data, header) pair is processed once; the former fallback
    # assigned in the wrong direction and failed with a NameError
    is_pair = (isinstance(vlist, (list, tuple)) and len(vlist) == 2
               and isinstance(vlist[0], pd.DataFrame))

    res_table = pd.DataFrame()
    for f in ([None] if is_pair else vlist):
        if is_pair:
            d, h = vlist[0], vlist[1]
        else:
            d, h = gct.extractgct(f)
        ds, hs = gt.dsub(d, h, {'type': 'vehicle'})
        if len(ds) == 0:
            print('error, maybe using ZS file? use QNORM instead')
            return None
        if is_pair:
            # no file name to derive a label from, use the dataframe name
            d_name = getattr(d, 'name', None)
            shn_base = d_name if isinstance(d_name, str) else 'data'
        else:
            shn_base = gt.get_shn(f)
        for b in hs.batch.unique():
            dsb, hsb = gt.dsub(ds, hs, {'batch': b})
            # consensus of the batch: per row median across its vehicle wells
            med = dsb.median(axis=1).values
            shn = shn_base + '-' + b
            res = clf.predict([med])[0]
            res_table.loc[shn, 'cell'] = res
            print(f'{shn} - {res}')
    if save is True:
        res_table.to_csv(gt.dflt_outpath(fn='cell_predictions.csv'), sep='\t')
    return res_table
def save_sig(sig, gs=False, name='dflt', path='dflt'):
    """ pass a signature tuple of up/down and name/path to save those lists

    sig: (up, dn) pair of gene lists
    gs: when True both lists are converted to gene symbols before saving
    name: file name stem, default 'mysig'; ':' is replaced by '-'
    path: destination folder, default the 'foo' default output folder
    writes '<name>_up.grp' and '<name>_dn.grp'
    """
    # value comparison, identity of string literals is an implementation detail
    if name == 'dflt':
        name = 'mysig'
    try:
        name = name.replace(':', '-')
    except AttributeError:
        # names without a replace method are used unchanged
        pass
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='foo')
    up, dn = sig[0], sig[1]
    if gs is True:
        up = convert_to_symbols(up)
        dn = convert_to_symbols(dn)
    gt.savelist(up, os.path.join(path, name + '_up.grp'))
    gt.savelist(dn, os.path.join(path, name + '_dn.grp'))
def plate_comparison(flist, scat=True, corr=True, dat=False):
    """ compare per-gene summary statistics between a flexible number of plates

    for every gct file the median, standard deviation, coefficient of
    variation (std / mean) and mean of each gene are collected, one column per
    plate. for each of the four tables, written to the 'comparisons' folder:
      scat: pairwise scatterplots of the plates ('<table name>scatter.png')
      dat: the table itself ('<table name>.xlsx')
      corr: euclidean matrix of the plates ('<table name>matrix.png')
    flist is a list of gct paths or one string that is split into '.gct' paths
    """
    if isinstance(flist, str):
        flist = gt.splitpaths(flist, '.gct')
    outpath = gt.dflt_outpath(fldr_name='comparisons')

    plates = cll.OrderedDict()
    for position, path in enumerate(flist):
        df, h = gct.extractgct(path)
        plates[gt.get_shn(path)] = df
        if position == 0:
            # genes of the first plate define the rows of all tables
            baseindex = df.index

    tables = []
    for table_name in ('median gene values', 'gene standard deviations',
                       'gene coefficient of variation', 'gene average'):
        table = pd.DataFrame(index=baseindex)
        table.name = table_name
        tables.append(table)
    medians, stdev, cv, average = tables

    for plate_name, plate in plates.items():
        spread = plate.std(axis=1)
        center = plate.mean(axis=1)
        medians[plate_name] = plate.median(axis=1)
        stdev[plate_name] = spread
        cv[plate_name] = spread / center
        average[plate_name] = center

    for dset in tables:
        if scat is True:
            sns.pairplot(dset)
            plt.tight_layout()
            plt.suptitle(dset.name)
            plt.savefig(os.path.join(outpath, dset.name + 'scatter.png'))
            plt.close()
        if dat is True:
            dset.to_excel(os.path.join(outpath, dset.name + '.xlsx'))
        if corr is True:
            ax = plottools.plot_euclidean(dset, dset.columns)
            ax.set_title(dset.name)
            plt.tight_layout()
            plt.savefig(os.path.join(outpath, dset.name + 'matrix.png'))
            plt.close()
def get_and_save_sig(inst, t, gs=False, name='dflt', path='dflt'):
    """ generate a signature with get_sig(inst, t) and save it automatically

    gs: when True the lists are saved (and returned) as gene symbols
    name: file name stem, default inst.name; ':' is replaced by '-'
    path: destination folder, default the 'foo' default output folder
    writes '<name>_up.grp' and '<name>_dn.grp' and returns the (up, dn) tuple
    """
    # value comparison, identity of string literals is an implementation detail
    if name == 'dflt':
        name = inst.name
    try:
        name = name.replace(':', '-')
    except AttributeError:
        # names without a replace method are used unchanged
        pass
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='foo')
    up, dn = get_sig(inst, t)
    if gs is True:
        up = convert_to_symbols(up)
        dn = convert_to_symbols(dn)
    gt.savelist(up, os.path.join(path, name + '_up.grp'))
    gt.savelist(dn, os.path.join(path, name + '_dn.grp'))
    return (up, dn)
def summarize_csvs(path):
    """ summarize levels 1 and 10 of every csv file in a folder

    path: folder containing the csv files, None resolves to the 'csv'
        default output folder
    per file: plate mean of 'Analyte 10' and 'Analyte 1', the 'Analyte 10'
    value of well B1 (posamp) and the mean 'Analyte 10' of wells A2 and B2 (ref)
    files that fail are reported with 'error with <file>' and keep whatever
    was collected before the failure. the table is written tab separated,
    without decimals, to 'csv_summary.txt' in the same folder
    """
    if path is None:
        path = gt.dflt_outpath(fldr_name='csv')
    results = cll.defaultdict(dict)
    for file in gt.get_flist(path, '.csv'):
        try:
            c = Gcsv(file)
            d = c.build_dframe()
            summary = results[c.shortname]
            summary['plate-L10'] = d['Analyte 10'].mean(axis=0)
            # .ix no longer exists in pandas, wells are selected by label
            summary['Pos-L10'] = d.loc['B1', 'Analyte 10']
            summary['Ref-L10'] = d.loc[['A2', 'B2'], 'Analyte 10'].mean()
            summary['plate-L1'] = d['Analyte 1'].mean(axis=0)
        except Exception:
            print('error with ' + file)
    res = pd.DataFrame(results).T
    outpath = os.path.join(path, 'csv_summary.txt')
    res.to_csv(outpath, sep='\t', float_format='%.0f')
def combine_fails(path='dflt', ret=False, summ=False, sep=False, thresh=1):
    """ combine all '*QC_fail*' files of a folder into one summary table

    path: folder to search, 'dflt' resolves to the 'newQC' default output folder
    sep: with the default False, rows repeating the 'Batch' header text are
        dropped in addition to rows with a blank batch
    summ: when True print the PERT_DESC and Batch groups with more than
        thresh failures
    ret: when True the combined dataframe is returned
    the combined table is always written to 'QCfail_summary.txt' in path
    """
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='newQC')
    fail_files = gt.globit(path, '*QC_fail*')

    # batch entries that mark a row as filler rather than a failure
    filler = [' ', 'Batch'] if sep == False else [' ']

    tables = []
    for fail_file in fail_files:
        table = pd.read_csv(fail_file, sep='\t', skiprows=1)
        table = table.drop([col for col in table.columns if 'Unnamed' in col], axis=1)
        table = table.dropna()
        for marker in filler:
            # best effort, files without a usable Batch column are kept as is
            try:
                table = table[table['Batch'] != marker]
            except:
                pass
        tables.append(table)

    data = pd.concat(tables, axis=0)
    data.to_csv(os.path.join(path, 'QCfail_summary.txt'), sep='\t')

    if summ is True:
        for column in ('PERT_DESC', 'Batch'):
            counts = data.groupby(column).size()
            print(counts[counts > thresh])
        # this subsets down to show how many doses totally fail (3 reps each) per name
        # g = f.groupby(['PERT_DESC', 'DOSE']).size()
        # res = g[g > 2].groupby('PERT_DESC').size().sort_values(ascending=False)
    if ret is True:
        return data
def plate_map_vis(myseries, cmap='dflt', path='dflt'):
    """ plot a categorical series as a plate map with one color per category

    every unique value of the series is translated to an integer (order of
    first appearance) and the integers are plotted with plot_plateplot, the
    value-to-integer dictionary is passed along as legend
    cmap: 'dflt' picks 'tab10' for fewer than 10 categories and 'tab20'
        otherwise, any other value is passed through with ncats=None
    path: output folder, joined with the series name; 'dflt' uses the
        default output location for that name
    """
    categories = myseries.unique()
    num_cats = len(categories)
    cat2num = dict(zip(categories, range(num_cats)))
    data = myseries.apply(lambda x: cat2num[x])
    # stays None for a custom colormap (was undefined in that case)
    maxcats = None
    if cmap == 'dflt':
        if num_cats < 10:
            cmap, maxcats = 'tab10', 10
        else:
            # was assigned to a misspelled name, leaving cmap at 'dflt'
            cmap, maxcats = 'tab20', 20
    if path == 'dflt':
        outpath = gt.dflt_outpath(fn=myseries.name)
    else:
        outpath = os.path.join(path, myseries.name)
    plottools.plot_plateplot(data, outpath=outpath, label=data.name, ncats=maxcats,
                             cmap=cmap, clrbar=cat2num)
def separate_subset_folders(path, mylist, down=False, dest='dflt'):
    """ copy the folders matching the terms in mylist from path to a destination

    path: source prefix, the search pattern is appended to it as plain text
        so it is expected to end with a path separator
    mylist: search terms; with down=False folders whose name starts with the
        term directly in path are matched, with down=True folders named
        exactly like the term one level further down
    dest: destination folder, 'dflt' resolves to the default output folder
    only the first matching folder per term is copied, as-is and under its own
    name; a folder that already exists in dest is left alone.
    terms without a matching folder are printed
    """
    # value comparison, identity of string literals is an implementation detail
    if dest == 'dflt':
        dest = gt.dflt_outpath()
    for st in mylist:
        if down is False:
            fl = glob.glob(path + st + '*')
        elif down is True:
            fl = glob.glob(path + '*/' + st)
        cplist = [x for x in fl if os.path.isdir(x)]
        if not cplist:
            print(st, ' not found')
            continue
        dirpath = cplist[0]
        try:
            shutil.copytree(dirpath, os.path.join(dest, os.path.basename(dirpath)))
        except FileExistsError:
            # already transferred earlier
            pass
def plot_cohorts(vdict, outpath='dflt', mode='sep', dtype='auto', maxx='dflt', title='foo', label=True, incr=1, size=20):
    """ plot several named sets of values on one plot, one color per set

    vdict: dictionary with name : [values], e.g. promiscuity values or a
        concentration range breakdown
    outpath: folder for '<title>.png', default the default output folder
    mode: 'sep' gives every value its own x position, 'tog' stacks all values
        of a cohort on one x position
    dtype: plot type handed to format_concentration_plot, 'auto' decides
        between gct and zs from the first cohort
    maxx: width of the x range; it is recomputed for the modes 'sep' and 'tog'
    title: plot title and file name, 'foo' tries the name (or first column)
        of the first cohort
    label: when True the cohort name is written below each cohort
    incr: x distance between neighboring positions
    size: marker size
    """
    if outpath == 'dflt':
        outpath = gt.dflt_outpath()
    # set the color pallet and spacing/sizing levels (figsize tuned to these)
    cmap = plt.get_cmap('tab10')
    # parameters controlling the optional labels below each cohort
    txt_args = {'fontsize': 8, 'rotation': 90, 'fontweight': 'bold'}
    # set initial color counters and x starting position
    ci = 0
    x_pos = 1

    # largest absolute value over all cohorts; the fallbacks cover cohorts
    # passed as plain lists, arrays/series and dataframes, in that order
    try:
        maxv = round(
            max([max(abs(max(v)), abs(min(v))) for v in vdict.values()]))
    except Exception:
        try:
            maxv = round(max([max(abs(v)) for v in vdict.values()]))
        except Exception:
            try:
                maxv = round(max([abs(v).max() for v in vdict.values()]))
            except Exception:
                maxv = round(max([abs(v).max().max() for v in vdict.values()]))
    maxv += maxv * 0.1

    # calc x range with length of vector corrected by reps, plus spacing btwn
    # (modes are compared by value, not by identity of the string objects)
    if mode == 'sep':
        try:
            maxx = sum([len(x.columns) for x in vdict.values()]) * incr + incr
        except Exception:
            maxx = sum([len(x) for x in vdict.values()]) * incr + incr
    elif mode == 'tog':
        maxx = len(vdict.keys()) * incr + incr

    # the first value set is used to check for plot formatting
    myvals = next(iter(vdict.values()))

    # create and baseline format plot
    if dtype == 'auto':
        dtype = check_plottype(myvals)
    # widen the y range when z-scores exceed the standard window
    if dtype == 'zs' and maxv > 10:
        maxy = round(maxv + 1)
        ax = format_concentration_plot(maxx, ptype=dtype, maxy=maxy)
    else:
        ax = format_concentration_plot(maxx, ptype=dtype)
    ax.set_xlim([0, maxx + 1])
    ax.set_xlabel('')

    # determine title
    if title == 'foo':
        try:
            title = myvals.name
        except Exception:
            try:
                title = myvals.columns[0]
            except Exception:
                title = 'foo'
    ax.set_title(title)

    # vertical position of the cohort labels, relative to the lower y limit
    if dtype == 'gct':
        y_label = min(ax.get_ylim()) * 0.75
    elif dtype == 'zs':
        y_label = min(ax.get_ylim()) * 1.2
    else:
        # other plot types used to leave the label position undefined
        y_label = min(ax.get_ylim())

    for n, vals in vdict.items():
        # increment through colors in cmap
        color = cmap(ci)
        ci += 1
        if ci > 9:
            ci = 0
        # catch to handle empty values, they still take up one position
        if len(vals) == 0:
            xlength = 1
            vals = []
        else:
            xlength = len(vals)
        # set x coordinates for values
        if mode == 'sep':
            x_vals = [x_pos + (x * incr) for x in range(xlength)]
            x_pos = max(x_vals) + incr
        elif mode == 'tog':
            x_vals = [x_pos] * xlength
            x_pos += incr
        # plot the current vals with specified color and size; entries that
        # cannot be plotted get an invisible placeholder point
        try:
            ax.scatter(x_vals, vals, color=color, s=size)
        except Exception:
            ax.scatter(x_vals, [0], color='white', s=size)
        # then add label for each cohort below
        if label is True:
            if len(x_vals) > 1:
                x_label = (x_vals[0] + x_vals[-1]) / 2
            else:
                x_label = x_vals[0] - 1
            ax.text(x_label, y_label, n, color=color, **txt_args)

    plt.savefig(os.path.join(outpath, title + '.png'), bbox_inches='tight')
    plt.close()
def make_dotplot(vctr, wdict=None, title='dflt', outpath='dflt', legend=False, width=5):
    """ dot plot of a series with optional highlighted cohorts

    vctr: series to plot in index order, its name labels the y axis
    wdict: optional well dictionary of highlighted cohorts with name: [wells];
        when none of the wells is found in the index, the plate prefix of the
        first index entry is added to all wells ('<plate>:<well>')
    title: plot title and file name, default '<plate> - <series name>'
    outpath: folder for '<title>.png', default the 'output figs' folder
    legend: anything but False adds a legend with one entry per cohort
    width: handed to format_concentration_plot
    """
    if outpath == 'dflt':
        outpath = gt.dflt_outpath(fldr_name='output figs')
    cmap = plt.get_cmap('tab10')
    xrange = len(list(vctr))
    dtype = check_plottype(vctr.iloc[2])
    ax = format_concentration_plot(xrange, ptype=dtype, width=width)

    # set additional title and axis
    ax.set_ylabel(vctr.name, fontsize=12)
    if title == 'dflt':
        title = vctr.index.values[0].split(':')[0] + ' - ' + vctr.name
    ax.set_title(title)
    awells = list(vctr.index)

    # plot primary data
    plt.plot(vctr.values, color='silver', marker='o', ls='', markersize=5, mew=0)

    # everything cohort related is skipped without a dictionary, the default
    # wdict=None used to fail on the first access
    if wdict is not None:
        allwells = [w for wells in wdict.values() for w in wells]
        if not any([w in vctr.index for w in allwells]):
            print('well dictionary not aligned, attempting patch')
            pname = vctr.index.values[0].split(':')[0]
            wdict = {name: [pname + ':' + w for w in wells]
                     for name, wells in wdict.items()}

        ci = 0
        mycolors, mynames = [], []
        for name, wells in wdict.items():
            color = cmap(ci)
            mycolors.append(color)
            mynames.append(name)
            # cycle through the ten colors of the map
            ci += 1
            if ci > 9:
                ci = 0
            mywells = [x for x in wells if x in vctr.index]
            if len(mywells) == 0:
                print('no wells left')
            sety = vctr[mywells]
            setx = [awells.index(w) for w in mywells]
            plt.plot(setx, sety, color=color, marker='o', ls='', markersize=5, mew=0)

        if legend is not False:
            leg_dict = dict(zip(mynames, mycolors))
            # create a patch (proxy artist) for every color
            patches = [
                mpatches.Patch(color=mycolors[i], label=text)
                for i, text in enumerate(leg_dict.keys())
            ]
            # put those patches as legend-handles into the legend
            lgd = plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2,
                             borderaxespad=0.)

    filename = title + '.png'
    plt.savefig(os.path.join(outpath, filename))
    plt.close()
def plot_landmark_concs(df, h, maxy=12, cats='n', labels='dflt', genes='test100', outpath='dflt', title='dflt', dosenum='dflt', test=False):
    """ plot many or all landmarks across concentrations, one figure per breakdown group

    should pass in a subset dataframe and header which should be the consensus
    ZS file. only 'test' type samples are used. the data is broken down by
    cats and, per group, a single line per gene is plotted for the ZS across
    all concentrations (sorted by dose)
    maxy: handed to format_concentration_plot
    labels: 'dflt' for just incr numbers, 'wells' for address, 'dose' for
        the doses, or a list of tick labels
    genes: gene selection resolved by gt.get_genes
    outpath: folder for '<title>.png', default the default output folder
    title: 'dflt' uses the dataframe name (or the plate of the group) without
        a trailing '_sub'; one ' - <attr>' part per category is always
        appended so the groups do not overwrite each other
    dosenum: number of x ticks, 'dflt' uses the number of unique doses
    test: when True stop after the first group
    NOTE(review): the category attribute appended to the title is
    hs[cat].values[0][0], i.e. the first element of the first value -
    confirm this is not meant to be the whole value
    """
    if outpath == 'dflt':
        outpath = gt.dflt_outpath()
    df, h = gt.dsub(df, h, {'type': 'test'})

    # sanity check: more than one sample for the first name/dose means the
    # data was not collapsed to consensus
    names = h.name.dropna().unique()
    doses = gt.hsub(h, {'name': names[0]})['dose'].dropna().unique()
    first_condition = gt.hsub(h, {'name': names[0], 'dose': doses[0]})
    if len(first_condition) > 1:
        print('dataframe not collapsed to consensus, bogus lm concs')
        # show the offending samples (indexing the second name failed for
        # single-name data)
        print(first_condition.head())

    for ds, hs in pa.breakdown(df, h, cats, dic=False):
        hs.sort_values('dose', ascending=True, inplace=True)
        ds = ds[hs.index]
        xrange = len(hs.dose.unique()) - 2
        ax = format_concentration_plot(xrange, maxy=maxy, width=4)
        # booleans instead of 'on'/'off': any non-empty string is truthy, so
        # 'off' would switch the top ticks on
        ax.tick_params(axis='x', bottom=True, top=False, labelbottom=True)
        if dosenum == 'dflt':
            dose_range = range(len(hs.dose.unique()))
        else:
            dose_range = range(dosenum)
        ax.set_xticks(dose_range)
        if isinstance(labels, str) and labels == 'dflt':
            ax.set_xticklabels([str(x + 1) for x in dose_range])
        elif isinstance(labels, str) and labels == 'wells':
            # temporary labels
            ax.set_xticklabels(hs.index, rotation=45)
        elif isinstance(labels, str) and labels == 'dose':
            ax.set_xticklabels(hs['dose'].unique(), rotation=45)
        else:
            try:
                ax.set_xticklabels(labels)
            except Exception:
                print('problem with x range labels')

        # set title and name
        if title == 'dflt':
            try:
                mytitle = df.name
            except AttributeError:
                mytitle = hs['plate'].values[0]
            # remove the suffix only; str.strip('_sub') would strip any of
            # the characters '_', 's', 'u', 'b' from both ends
            if mytitle.endswith('_sub'):
                mytitle = mytitle[:-len('_sub')]
        else:
            mytitle = title
        suffix = ''
        for c in cats:
            cat = gt.cats_lookup(c)
            attr = hs[cat].values[0][0]
            suffix += f' - {attr}'
        mytitle += suffix
        ax.set_title(mytitle, fontsize=14)

        for g in gt.get_genes(genes, df=df):
            data = ds.loc[g, :]
            ax.plot(data.values, linewidth=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(outpath, mytitle + '.png'))
        plt.close()
        if test is True:
            print('stopping after one iteration')
            break
def plot_correlation_matrix(df, h, ptype='corr', title='dflt', labels='dflt', sort=False, lower=0.25, upper=1.0, outpath=False, cmap='dflt', sparselabel=False, grid=False):
    """ plots pearson correlation matrix between columns of the passed in dataframe

    df: data with samples in columns
    h: header, only used for the default labels (may be None)
    ptype: 'corr' for pearson correlation, 'euclid' for euclidean distance
        (reversed colors, upper limit derived from lower and the max distance)
    title: plot title, 'dflt' uses df.name or the prefix of the first column
    labels: 'dflt' uses h.label (column ids when not available), None gives
        empty labels, otherwise one label per column
    sort: when True the samples are sorted by their label
    lower, upper: values are clipped to this range, the 'lower' argument trims
        bottom of graph, so that there's less noise at the bottom end
    outpath: False keeps the figure open, True saves '<title>_corr.png' at
        the default output location, a string is used as path ('.png' is
        appended when missing); the figure is closed after saving
    cmap: 'dflt' for the blue-red map, a value containing 'rev' for the
        reversed one, otherwise passed to imshow
    sparselabel: only prints one category label per section/cluster, this
        requires (and forces) sorting
    grid: when True draw black lines on the minor ticks
    _should improve label handling
    """
    fig, ax = plt.subplots()
    fig.set_size_inches(8.5, 8.5)

    # one exclusive chain: with separate ifs the default labels were
    # overwritten by the string 'dflt' itself
    if isinstance(labels, str) and labels == 'dflt':
        try:
            mylabels = h.label.values
        except AttributeError:
            mylabels = df.columns
    elif labels is None:
        mylabels = len(df.columns) * ['']
    else:
        mylabels = labels

    if sparselabel is True:
        sort = True
    if sort is True:
        print(f'{len(df.columns)} columns, {len(mylabels)} labels')
        keyd = dict(zip(df.columns, mylabels))
        neword = sorted(df.columns, key=lambda x: keyd[x])
        mylabels = [keyd[x] for x in neword]
        df = df[neword]

    if ptype == 'corr':
        corr = df.corr()
    elif ptype == 'euclid':
        cmap = 'rev'
        corr = get_euclidean(df, df=True)
        max_dist = corr.max().max()
        upper = (1 - lower) * max_dist
        lower = 0
    if lower is not None:
        corr = corr.clip(lower=lower)
    if upper is not None:
        corr = corr.clip(upper=upper)

    if cmap == 'dflt':
        cmap = blue_red_cmap()
    elif 'rev' in cmap:
        cmap = blue_red_cmap(['red', 'white', 'cornflowerblue'])
    cax = ax.imshow(corr, interpolation='nearest', cmap=cmap)

    n_cols = len(df.columns)
    if sparselabel is False:
        # one label per sample, minor ticks sit between the cells
        minor = np.arange(0.5, n_cols, 1)
        # x axis
        ax.set_xticks(np.arange(0.5, n_cols, 1), minor=False)
        ax.xaxis.set_tick_params(size=0)
        ax.set_xticks(minor, minor=True)
        # y axis
        ax.set_yticks(np.arange(0, n_cols, 1), minor=False)
        ax.yaxis.set_tick_params(size=0)
        ax.set_yticks(minor, minor=True)
        ax.set_xticklabels(mylabels, rotation=45, ha='right')
        ax.set_yticklabels(mylabels)
    elif sparselabel is True:
        # one label per block of equal labels: major tick in the middle of
        # the block, minor tick at its end
        unq_labels = sorted(list(set(mylabels)))
        cntr = cll.Counter(mylabels)
        mylabels, i, major, minor = [], -.5, [], []
        for cat in unq_labels:
            chunk_size = cntr[cat]
            major.append(i + chunk_size / 2)
            minor.append(i + chunk_size)
            mylabels.append(cat)
            i += chunk_size
        ax.set_xticks(major, minor=False)
        ax.xaxis.set_tick_params(size=0)
        ax.set_xticks(minor, minor=True)
        ax.set_yticks(major, minor=False)
        ax.yaxis.set_tick_params(size=0)
        ax.set_yticks(minor, minor=True)
        ax.set_xticklabels(mylabels, rotation=45, ha='right')
        ax.set_yticklabels(mylabels)

    if grid is True:
        ax.grid(which='minor', axis='both', color='black')
    if title == 'dflt':
        try:
            title = df.name
        except AttributeError:
            title = df.columns[0].split(':')[0]
    ax.set_title(f'{title} - n={len(df.columns)} corr matrix', style='oblique')
    plt.tight_layout()

    if outpath is not False:
        if outpath is True:
            plt.savefig(gt.dflt_outpath(fn=title + '_corr.png'))
        else:
            if '.png' in outpath:
                plt.savefig(outpath)
            else:
                plt.savefig(outpath + '.png')
        plt.close()
def plot_plateplot(vctr, name='dflt', outpath='dflt', label='dflt', cmap='inferno', ncats=None, clrbar=True):
    """ plot the values of a Series object as a 384 well plate (16 rows x 24 columns)

    vctr: series with one value per well in plate order; when it cannot be
        reshaped to 16 x 24 a message is printed and nothing is saved
    name: file name, default '<plate>-<series name>' with the plate taken
        from the first index entry
    outpath: folder for '<name>.png', default the default output folder
    label: plot title, default the name
    cmap: colormap used for the wells
    ncats: when given the colors are scaled from 0 to ncats instead of
        auto-adjusting to the values
    clrbar: True adds a colorbar with five ticks from min to max value,
        a dictionary plots a separate legend with keys as the name and values
        as the converted integer used to plot the map, False or None adds neither
    """
    if name == 'dflt':
        name = vctr.index.values[0].split(':')[0] + '-' + vctr.name
    if outpath == 'dflt':
        outpath = gt.dflt_outpath()
    if label == 'dflt':
        label = name

    fig, ax = plt.subplots()
    # set additional title and axis
    ax.set_title(label, y=1.1, fontsize=16)
    row_labels = list(string.ascii_uppercase[0:16])
    ax.set_yticks(list(np.arange(16)))
    ax.set_yticklabels(row_labels, fontsize=8)
    col_labels = list(np.arange(1, 25))
    ax.set_xticks(list(np.arange(0, 24)))
    ax.set_xticklabels(col_labels, fontsize=9)
    ax.tick_params(labelright=True, labeltop=True)
    # this sets the tick length to zero, but leaves labels
    plt.tick_params(axis=u'both', which=u'both', length=0)

    # reshape array and plot
    try:
        d = vctr.values.reshape(16, 24)
    except (ValueError, AttributeError):
        print('error in reshape')
        # do not leave the empty figure behind
        plt.close(fig)
        return
    if ncats is not None:
        im = plt.imshow(d, interpolation='nearest', cmap=cmap, vmin=0, vmax=ncats)
    else:
        im = plt.imshow(d, interpolation='nearest', cmap=cmap)

    lgd = None
    if clrbar is True:
        # use matplotlib axes1 to keep colorbars in line with figs
        divider = make_axes_locatable(ax)
        cax = divider.append_axes('right', size='5%', pad=0.3)
        cbar = plt.colorbar(im, cax=cax)
        cbar.ax.tick_params(labelsize=9)
        # simplify colorbar to 5 points including max/min
        mx, mn = vctr.max(), vctr.min()
        mid = (mx + mn) / 2
        svth = mid + ((mx - mid) / 2)
        twth = (mid - ((mx - mid) / 2))
        tick_values = [mn, twth, mid, svth, mx]
        tick_labels = ['{:.1f}'.format(x) for x in tick_values]
        cbar.set_ticks(tick_values)
        cbar.set_ticklabels(tick_labels)
    elif clrbar is not None and clrbar is not False:
        # get the colors of the values, according to the
        # colormap used by imshow
        leg_dict = clrbar
        colors = [im.cmap(im.norm(value)) for value in leg_dict.values()]
        # create a patch (proxy artist) for every color
        patches = [
            mpatches.Patch(color=colors[i], label=text)
            for i, text in enumerate(leg_dict.keys())
        ]
        # put those patches as legend-handles into the legend
        lgd = plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2,
                         borderaxespad=0.)

    outpath = os.path.join(outpath, name + '.png')
    if lgd is not None:
        # include the legend outside the axes in the saved area, plain save
        # as fallback
        try:
            fig.savefig(outpath, bbox_extra_artists=(lgd, ), bbox_inches='tight')
        except Exception:
            plt.savefig(outpath)
    else:
        plt.savefig(outpath)
    plt.close()