def grab_ranks(path, feat, hilo=1, t=40):
    """ survey a folder of rank gct files for the wells in which the given gene (feat)
    is top ranked by sorted z-score, and generate an overall summary list.
    default rank output is in descending order, highest z-score = rank 1
    hilo: 1 = high, upregulated genes (default rank order)
          0 = low, downregulated genes """
    outpath = os.path.join(path, '_rank_summary.txt')
    flist = gt.get_flist(path, 'ranks.gct')
    # set dummy starting point for low rank
    lowest = 500
    # create blank template dataframe
    summary = pd.DataFrame()
    for f in flist:
        d, h = gct.extractgct(f)
        # flip rank order as needed for downregulated genes (978 landmarks total)
        if hilo == 0:
            d = 978 - d
        # get column ids for ranks below threshold
        wells = d.columns[d.loc[feat] < t]
        # extract portion of dataframe
        ranks = d.loc[feat, wells]
        ranks = pd.DataFrame(ranks)
        # assign plate column to each well id entry
        ranks['plate'] = gt.get_shn(f).split('-')[0]
        # concat portion to overall dataframe
        summary = pd.concat([summary, ranks])
        # check and store the lowest rank
        newlow = d.loc[feat].min()
        if newlow < lowest:
            lowest = newlow
    # re-shuffle the column order
    summary['well'] = summary.index.values
    summary = summary[['plate', 'well', feat]]
    print('\n', feat, int(lowest))
    summary.to_csv(outpath, sep='\t', index=False)
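# Usage sketch (path and probe id are hypothetical): survey a folder of
# *ranks.gct files for wells where the gene ranks in the top 40, writing
# _rank_summary.txt into that folder; hilo=0 would survey downregulation instead.
# grab_ranks('/path/to/rank_files', '205067_at', hilo=1, t=40)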
def get_s2n_genes(g, c1, c2):
    """ compute per-gene signal-to-noise between two groups of wells (c1, c2) in a gct file """
    d, h = gct.extractgct(g)
    c1w = gt.hsub(h, {'well': c1}).index
    c2w = gt.hsub(h, {'well': c2}).index
    d1 = d[c1w]
    d2 = d[c2w]
    res = sig_to_noise(d1, d2)
    return res
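# sig_to_noise is defined elsewhere; purely as a reference point, a minimal
# sketch of the classic Golub signal-to-noise statistic per gene, assuming d1
# and d2 are genes-x-wells DataFrames (an assumption, not this module's code):
def _s2n_sketch(d1, d2):
    # per-gene means and standard deviations within each well group
    mu1, mu2 = d1.mean(axis=1), d2.mean(axis=1)
    sd1, sd2 = d1.std(axis=1), d2.std(axis=1)
    # difference of group means scaled by the summed deviations
    return (mu1 - mu2) / (sd1 + sd2)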
def run_plate_analysis(mode='ind', cats='nd', path='dflt'):
    """ runs standard analysis on either each plate individually 'ind' or all
    together 'comb'. most useful for plates with doses. the default path will
    be newQC on the desktop """
    path = gt.check_dfltarg(path, os.path.join(gt.check_desktop(), 'newQC'))
    fl = gt.globit(path, '*ZSVCQNORM*')
    print(fl)
    if mode == 'comb':
        dl, hl = [], []
        for i, f in enumerate(fl):
            d, h = gct.extractgct(f)
            if i == 0:
                try:
                    pname = d.name + '+'
                except:
                    pname = h.addr[0].split(':')[0] + '+'
            if len(h.batch.unique()) > 1:
                # fix sample labels for plate/batch
                h.plate = h.plate + h.batch
            # define labels (should I add plate?)
            h = gt.gen_label(h, cats)
            dl.append(d)
            hl.append(h)
        try:
            d = pd.concat(dl, axis=1)
            d.name = pname
        except ValueError:
            sys.exit('no gct file plates to analyze')
        h = pd.concat(hl, axis=0)
        analyze_plate(d, h, cats)
    elif mode == 'ind':
        for f in fl:
            d, h = gct.extractgct(f)
            # define labels (should I add plate?)
            h = gt.gen_label(h, cats)
            analyze_plate(d, h, cats)
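# Usage sketch: analyze each ZSVCQNORM plate found under the default
# Desktop/newQC folder individually, labeling samples by name and dose ('nd');
# mode='comb' instead concatenates all plates into one combined analysis.
# run_plate_analysis(mode='ind', cats='nd')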
def run_granularity(path):
    flist = gt.get_flist(path, '.gct')
    c = cll.Counter()
    for f in flist:
        d, h = gct.extractgct(f)
        survey_granularity(d, c)
    c = pd.Series(c, name='count')
    c.sort_values(ascending=False, inplace=True)
    c = c[c > 1]
    c.to_excel(os.path.join(path, 'counter.xlsx'))
def assemble_ref_dat(path):
    """ gather together all reference RNA wells within the given path """
    fl = gt.globit(path, '*_ref_n*')
    dl, hl = [], []
    for f in fl:
        dr, hr = gct.extractgct(f)
        dr, hr = gt.dsub(dr, hr, {'well': ['A02', 'B02']})
        dr = round(dr, 2)
        dl.append(dr)
        hl.append(hr)
    alldata = pd.concat(dl, axis=1)
    return alldata
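# Usage sketch (hypothetical path): pool the A02/B02 reference RNA wells from
# every matching file under a project folder into one genes-x-wells dataframe.
# refs = assemble_ref_dat('/path/to/project')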
def plot_lmconcs(flist, stack=False, test=False):
    """ plot lm concs with full output for all files in list, w/ optional joins """
    if isinstance(flist, str):
        flist = gt.splitpaths(flist, '.gct')
    for f in flist:
        d, h = gct.extractgct(f)
        # ds, hs = gt.dsub(d, h, {'name': ['5-Iodotubercidin', 'ERK5-IN-1']})
        outpath = gt.dflt_outpath(fldr_name='landmark concs')
        plottools.plot_landmark_concs(d, h, genes='all', labels='wells',
                                      outpath=outpath, test=test)
    if stack is True:
        imgs.bulk_stack(outpath, orient='vert', delim='_', idx=2)
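# Usage sketch: render landmark-gene concentration plots for each listed gct
# file, then stack the resulting images vertically into combined panels.
# plot_lmconcs(['/path/plate1.gct', '/path/plate2.gct'], stack=True)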
def compare_plate_genes(flist, genelist, numrows=3, type=True, plate=False,
                        title='dflt', outpath='dflt', remove=True):
    """ plots the listed genes across the dataframes provided in the list of file
    paths flist. the plots are generated and then combined using imgs bulk stack;
    orient is the direction of the joined images. type will include sample type
    color coding -- should add in grid support -- """
    if isinstance(flist, str):
        flist = gt.splitpaths(flist, '.gct')
    if outpath == 'dflt':
        outpath = gt.dflt_outpath(fldr_name='tmp_imgs')
    for f in flist:
        d, h = gct.extractgct(f)
        for g in genelist:
            if plate is False:
                plot_gene_wtypes(d.loc[g], h, name=d.name + '-' + g, outpath=outpath)
            elif plate is True:
                make_plateplot(d.loc[g], name=d.name + '-' + g, outpath=outpath)
    if title != 'dflt':
        combined_outpath = gt.dflt_outpath(fldr_name=title + ' combined_imgs')
    else:
        combined_outpath = gt.dflt_outpath(fldr_name='combined_imgs')
    if numrows is False:
        imgs.bulk_stack(outpath, outpath=combined_outpath, delim='-', idx=1, pad=.05)
    else:
        imgs.bulk_stack_grid(outpath, outpath=combined_outpath, numrows=numrows,
                             delim='-', idx=1, pad=.05)
    if remove is True:
        shutil.rmtree(outpath)
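# Usage sketch (paths and probe ids are hypothetical): plot two genes across
# several plates and join the per-gene images into a grid two rows tall.
# compare_plate_genes(['/path/a.gct', '/path/b.gct'],
#                     ['205067_at', '201453_x_at'], numrows=2)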
def predict_cells(input, save=False):
    """ can accept a directory and loop through files, or one dataframe/series
    at a time. uses v1.0 of the SVM classifier to consolidate reps to consensus
    and return a prediction. when save is True a dataframe will be saved """
    with open('/Users/WRB/Dropbox/bin/python/celllineclassifier.p', 'rb') as file:
        clf = pickle.load(file)
    if isinstance(input, str):
        if os.path.isdir(input):
            vlist = gt.globit(input, '*_Qctrl_n*')
            if len(vlist) == 0:
                vlist = gt.globit(input, '*QNORM*')
        else:
            vlist = [input]
    elif isinstance(input, pd.Series):
        try:
            res = clf.predict([input])[0]
        except:
            print('error with series prediction')
            res = None
        return res
    else:
        vlist = input
    res_table = pd.DataFrame()
    for f in vlist:
        try:
            d, h = gct.extractgct(f)
        except:
            # input was already a [d, h] pair rather than file paths
            d, h = vlist[0], vlist[1]
        ds, hs = gt.dsub(d, h, {'type': 'vehicle'})
        if len(ds) == 0:
            print('error, maybe using ZS file? use QNORM instead')
            return None
        for b in hs.batch.unique():
            dsb, hsb = gt.dsub(ds, hs, {'batch': b})
            med = dsb.median(axis=1).values
            shn = gt.get_shn(f) + '-' + b
            res = clf.predict([med])[0]
            res_table.loc[shn, 'cell'] = res
            print(f'{shn} - {res}')
    if save is True:
        res_table.to_csv(gt.dflt_outpath(fn='cell_predictions.csv'), sep='\t')
    return res_table
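# Usage sketch (hypothetical folder): predict the cell line of each plate from
# its vehicle-well medians, batch by batch, saving cell_predictions.csv.
# preds = predict_cells('/path/to/newQC', save=True)
# A single pandas Series of landmark values can also be passed for one prediction.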
def plate_comparison(flist, scat=True, corr=True, dat=False):
    """ for a range of gct files consolidate median, mean, std, cv from each and
    then create pairwise scatterplots for each. flexible in number of plates """
    if isinstance(flist, str):
        flist = gt.splitpaths(flist, '.gct')
    outpath = gt.dflt_outpath(fldr_name='comparisons')
    ddict = cll.OrderedDict()
    hdict = cll.OrderedDict()
    for i, f in enumerate(flist):
        name = gt.get_shn(f)
        df, h = gct.extractgct(f)
        ddict[name], hdict[name] = df, h
        if i == 0:
            baseindex = df.index
            medians = pd.DataFrame(index=baseindex)
            medians.name = 'median gene values'
            stdev = pd.DataFrame(index=baseindex)
            stdev.name = 'gene standard deviations'
            cv = pd.DataFrame(index=baseindex)
            cv.name = 'gene coefficient of variation'
            average = pd.DataFrame(index=baseindex)
            average.name = 'gene average'
    for n, d in ddict.items():
        medians[n] = d.median(axis=1)
        stdev[n] = d.std(axis=1)
        cv[n] = d.std(axis=1) / d.mean(axis=1)
        average[n] = d.mean(axis=1)
    for dset in [medians, stdev, cv, average]:
        if scat is True:
            sns.pairplot(dset)
            plt.tight_layout()
            plt.suptitle(dset.name)
            plt.savefig(os.path.join(outpath, dset.name + 'scatter.png'))
            plt.close()
        if dat is True:
            dset.to_excel(os.path.join(outpath, dset.name + '.xlsx'))
        if corr is True:
            ax = plottools.plot_euclidean(dset, dset.columns)
            ax.set_title(dset.name)
            plt.tight_layout()
            plt.savefig(os.path.join(outpath, dset.name + 'matrix.png'))
            plt.close()
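# Usage sketch: compare per-gene medians, standard deviations, cv and means
# across three plates via pairwise scatterplots and distance matrices.
# plate_comparison(['/path/p1.gct', '/path/p2.gct', '/path/p3.gct'],
#                  scat=True, corr=True, dat=False)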
def get_vehicle_matrix(path='dflt', batch='all', delim=':', getcells=False):
    """ for the path load all files and collapse vehicles, plot matrix.
    batch can be 'all', or 'A' to just take the first batch only.
    getcells will re-predict cells """
    path = gt.check_dfltarg(path, os.path.join(gt.check_desktop(), 'newQC'))
    flv = gt.globit(path, '*Qctrl*')
    if len(flv) == 0:
        flv = gt.globit(path, '*_QNORM_*')
        # should put in a check to extract from regular qnorms
    dlist, hlist = [], []
    for f in flv:
        d, h = gct.extractgct(f)
        h['plate'] = h['plate'].apply(lambda x: x[:6])
        d, h = gt.dsub(d, h, {'type': 'vehicle'})
        if batch == 'all':
            for b in h.batch.unique():
                ds, hs = gt.dsub(d, h, {'batch': b})
                med = ds.median(axis=1)
                hs = gt.gen_label(hs, 'pb', delim=delim)
                dlist.append(med)
                hlist.append(hs.iloc[0])
        elif batch == 'A':
            ds, hs = gt.dsub(d, h, {'batch': 'A'})
            med = ds.median(axis=1)
            hs = gt.gen_label(hs, 'pb', delim=delim)
            dlist.append(med)
            hlist.append(hs.iloc[0])
        else:
            med = d.median(axis=1)
            hs = gt.gen_label(h, 'p', delim=delim)
            dlist.append(med)
            hlist.append(hs.iloc[0])
    vdf = pd.concat(dlist, axis=1)
    vh = pd.DataFrame(hlist)
    vdf.columns = vh.label
    if getcells is True:
        vh['cell2'] = vh.label.apply(lambda x: predict_cells(vdf[x]))
        vh['label'] = vh.label + delim + vh.cell2
        vdf.columns = vh.label
    return vdf, vh
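# Usage sketch: build a genes-x-plates matrix of vehicle-well medians from the
# default Desktop/newQC folder, one labeled column per plate/batch.
# vdf, vh = get_vehicle_matrix(batch='all', getcells=False)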
def get_zscore(fpath, save=True, my_mad=None):
    """ merged from separate zscore file. can either save the resulting file or
    return the data. the first fpath argument can be a file path or a [d, h]
    object already """
    # basic setup
    if isinstance(fpath, str):
        g = gct.Gct(fpath)
        g.get_headers()
        df, h = gct.extractgct(fpath)
    else:
        try:
            df = fpath[0]
            h = fpath[1]
        except:
            print('error with path')
    zsd = cll.defaultdict(dict)
    pname = gt.get_shn(fpath)
    for b in h['batch'].dropna().unique():
        if b == 'na':
            continue
        print('running zscore for {} batch {}'.format(pname, b))
        vw = gt.hsub(h, {'batch': b, 'type': 'vehicle'}).index.values
        if len(vw) == 0:
            break
        veh = df[vw]
        # get median value across vehicle populations
        med = veh.median(axis=1)
        # populate the absolute deviation values per gene
        ad = cll.defaultdict(list)
        for v in veh.columns:
            for f in veh.index:
                ad[f].append(abs(med[f] - veh[v][f]))
        # assemble the median absolute deviation (MAD) per gene, with a 0.1 floor
        mad = {}
        for k, v in ad.items():
            r = statistics.median(v)
            if 0 < r < 0.1:
                r = 0.1
            mad[k] = r
        # using the above, progress through test and poscon wells
        # to calculate sample zscores
        tw = list(h[(h['batch'] == b) & (h['type'] == 'test')].index.values)
        pw = list(h[(h['batch'] == b) & (h['type'] == 'poscon')].index.values)
        wells = tw + pw
        for w in df[wells].columns:
            for feat in df.index:
                if my_mad is not None and mad[feat] < my_mad:
                    zs = (df[w][feat] - med[feat]) / (my_mad * 1.486)
                elif mad[feat] == 0:
                    zs = 0
                else:
                    # 1.486 scales the MAD to approximate the standard deviation
                    zs = (df[w][feat] - med[feat]) / (mad[feat] * 1.486)
                zsd[w][feat] = '{0:.3f}'.format(zs)
    # transform into dataframe, set index, null nonsense
    zsdf = pd.DataFrame(zsd)
    hs = h.loc[zsdf.columns]
    zsdf = zsdf.replace(['inf', '-inf'], np.nan).fillna('nan')
    if save is True:
        outpath = '{}_ZS.gct'.format(fpath.split('_', 1)[0])
        gct.save_headergct(zsdf, hs, outpath)
    else:
        return zsdf, hs
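# A minimal standalone illustration of the robust z-score computed above
# (numpy assumed imported as np, as elsewhere in this module; the 1.486 factor
# approximates 1.4826, the usual MAD-to-standard-deviation consistency constant):
def _robust_z(sample_value, vehicle_values):
    # center on the vehicle median rather than the mean
    med = np.median(vehicle_values)
    # median absolute deviation (MAD) of the vehicle population
    mad = np.median(np.abs(np.asarray(vehicle_values) - med))
    # scale the MAD so it estimates the standard deviation under normality
    return (sample_value - med) / (mad * 1.486)

# e.g. _robust_z(9.5, [6.9, 7.1, 7.0, 7.2, 6.8]) -> roughly 16.8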
def assemble_consensus(df, h, cats, sc=True, legend='brief', plot=False, skyl=False,
                       n=None, save=False, ret=True, test=False):
    """ tool to assemble a replicate zscore consensus. pass df, header and the
    breakdown categories ('nd' for instance) and the consolidated df and header
    are returned. can pass in multiple gct files as a single string via F6 shortcut.
    ccs will calculate the zscore correlation of replicates and insert that into the header df.
    plot will use seaborn pairplot to visualize the calculated rep correlations above.
    skyl controls skyline plot generation, can be True to plot all ind reps plus consensus.
    n argument is a limiter to only consider treatments with enough replicates,
    including into the consensus gct!!
    save will save the consensus gct file """
    if isinstance(df, str):
        df, h = gct.extractgct(df)
    # otherwise df and h are assumed to be passed in directly
    outpath = gt.dflt_outpath(fldr_name='output figs')
    try:
        pname = df.name
    except:
        pname = h.addr[0].split(':')[0]
    outpath = os.path.join(outpath, pname)
    try:
        os.mkdir(outpath)
    except:
        pass
    subs = breakdown(df, h, cats, dic=False)
    con_data = pd.DataFrame(index=df.index)
    addnl = []
    addnl.extend(['corr', 'all ccs'])
    addnl.extend(['prom', 'all proms', 'porder'])
    if addnl != []:
        con_header = pd.DataFrame(index=list(h.columns.values) + addnl)
    else:
        con_header = pd.DataFrame(index=h.columns)
    for ds, hs in subs:
        if n is not None:
            if len(ds.columns) < n:
                print('not enough reps', hs.iloc[0])
                continue
        c = consensus(ds, name='first')
        con_data = pd.concat([con_data, c], axis=1)
        new_annot = hs.iloc[0, :].copy().T
        new_annot.well = hs['well'].values
        new_annot.addr = hs['addr'].values
        corrs = []
        for i in range(len(ds.columns)):
            for j in range(1 + i, len(ds.columns)):
                corrs.append(round(ds.iloc[:, i].corr(ds.iloc[:, j], method='pearson'), 2))
        if len(corrs) == 0:
            new_annot['corr'] = np.nan
            new_annot['all ccs'] = np.nan
        elif len(corrs) == 1:
            new_annot['corr'] = round(corrs[0], 2)
            new_annot['all ccs'] = corrs
        else:
            new_annot['corr'] = round(np.percentile(corrs, 75), 2)
            new_annot['all ccs'] = corrs
        corrs = [decimal.Decimal(x) for x in corrs]
        new_annot['corr'] = pd.to_numeric(new_annot['corr'])
        proms = abs(ds).sum(axis=0).round().values
        porder = hs['well'].values
        new_annot['prom'] = round(np.percentile(proms, 75))
        new_annot['all proms'] = proms
        new_annot['porder'] = porder
        if plot is True:
            ds.columns = [x + ' - ' + hs.loc[x]['batch'] for x in ds.columns]
            ax = sns.pairplot(ds)
            myoutpath = os.path.join(outpath, 'rep zs scatter')
            try:
                os.mkdir(myoutpath)
            except:
                pass
            plt.savefig(os.path.join(myoutpath, h.plate[0] + '-' + ds.name + '.png'))
            plt.close()
        con_header = pd.concat([con_header, new_annot], axis=1)
        if skyl is True:
            myoutpath = os.path.join(outpath, 'skyline')
            try:
                os.mkdir(myoutpath)
            except:
                pass
            try:
                name = hs.iloc[0]['name'] + '-' + str(hs.iloc[0]['dose']) + '-' + hs.iloc[0]['batch']
            except:
                name = hs.iloc[0]['name'] + '-' + hs.iloc[0]['batch']
            name = name.replace('.', ',')
            title = pname + '-' + name
            myoutpath = os.path.join(myoutpath, title)
            skyline.new_skyline(ds, title=title, outpath=myoutpath)
        if test is True:
            break
    con_header = con_header.T
    if sc is True:
        try:
            pname = df.name
        except:
            pname = h.addr[0].split(':')[0]
        title = pname + ' sc plot'
        outpath = gt.dflt_outpath(fn=pname + '_scplot.png')
        kwargs = {'x': 'corr', 'y': 'prom', 'data': con_header}
        kwargs.update({'alpha': .75, 'style': 'type', 'legend': legend})
        if 'd' in cats:
            kwargs['hue'] = 'name'
            kwargs['size'] = 'dose'
            kwargs['sizes'] = (40, 400)  # this is experimental
        else:
            kwargs['sizes'] = (50)
            kwargs['hue'] = 'name'
        g = sns.relplot(**kwargs)
        g.fig.suptitle(title)
        g.fig.set_size_inches(7, 5)
        if legend is not None:
            for lh in g._legend.legendHandles:
                lh.set_alpha(.75)
        g.savefig(outpath, bbox_inches='tight')
        plt.close()
        con_header = gt.gen_label(con_header, 'nb')
        newfig, newax = dim_reduct.seaborn_scatter(con_header, title, outpath,
                                                   x='corr', y='prom', ptype='ax', save=False)
        dim_reduct.html_scatter(newfig, newax, con_header, 'corr', 'prom', title)
        plt.close()
    con_data.name = df.name
    if save is True:
        gct.save_headergct(con_data, con_header, gt.dflt_outpath(fn=df.name + '_consensus.gct'))
    if ret is True:
        return con_data, con_header
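# Usage sketch (hypothetical file): collapse replicates of a z-score gct into a
# name/dose consensus; the returned header gains 'corr' (replicate correlation)
# and 'prom' (promiscuity) columns, and sc=True also draws the corr-vs-prom plot.
# con_d, con_h = assemble_consensus('/path/plate_ZS.gct', None, 'nd', sc=True)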