def html_scatter(fig, ax, h, x_col, y_col, title, labels='dflt', outpath='dflt'):
    """Save an interactive mpld3 HTML scatter of h[x_col] vs h[y_col] with hover labels.

    labels defaults to h.label, outpath to the project default folder;
    the figure is written as '<title>.html' inside outpath."""
    outpath = gt.check_dfltarg(outpath, gt.dflt_outpath('foo'))
    labels = gt.check_dfltarg(labels, h.label)
    # near-invisible points: the plot exists only to anchor the hover tooltips
    pts = ax.scatter(h[x_col].tolist(), h[y_col].tolist(), alpha=0.001)
    hover = mpld3.plugins.PointLabelTooltip(pts, labels=labels)
    mpld3.plugins.connect(fig, hover)
    mpld3.save_html(fig, os.path.join(outpath, title + '.html'))
def run_methods(df, h, outpath='dflt', hdr='dflt', hue='name', mets='dflt', shape=None, labels='dflt', scaling=None):
    """Run a selection of alternate dimension reduction techniques and plot each.

    df      : data matrix (transposed automatically if columns look like 'plate:well')
    h       : header dataframe; gains '<MET>_x'/'<MET>_y' coordinate columns per method
    outpath : output folder (default: project 'dim_reduct' folder)
    hdr     : plot-title prefix (default: df.name, falling back to the plate prefix
              of the first column name)
    mets    : any of 'PCA', 'ISO', 'MDS', 'LLE' (default: all four)
    scaling : optional pre-scaling: 'maxabs', 'robust' or 'std'
    """
    method_results = dict()
    n_neighbors = 15
    try:
        hdr = gt.check_dfltarg(hdr, df.name)
    except AttributeError:
        # df has no .name attribute; derive a title from the first column's plate prefix
        hdr = df.columns[0].split(':')[0]
    outpath = gt.check_dfltarg(outpath, gt.dflt_outpath(fldr_name='dim_reduct'))
    labels = gt.check_dfltarg(labels, h.label)
    mets = gt.check_dfltarg(mets, ['PCA', 'ISO', 'MDS', 'LLE'])
    # columns like 'plate:well' mean samples are in columns -> transpose to rows
    if ':' in df.columns.values[0]:
        df = df.T
    if scaling is not None:
        if scaling == 'maxabs':
            df = MaxAbsScaler().fit_transform(df)
        elif scaling == 'robust':
            df = RobustScaler().fit_transform(df)
        elif scaling == 'std':
            df = StandardScaler(with_mean=False).fit_transform(df)
        else:
            print('error with scaling')
    if 'PCA' in mets:
        # Projection on to the first 2 principal components
        method_results['PCA'] = decomposition.PCA(n_components=2).fit_transform(df)
    if 'ISO' in mets:
        # Isomap projection (n_neighbors passed by keyword: positional is removed in modern sklearn)
        method_results['ISO'] = manifold.Isomap(n_neighbors=n_neighbors, n_components=2).fit_transform(df)
    if 'MDS' in mets:
        # MDS embedding
        method_results['MDS'] = manifold.MDS(n_components=2, n_init=1, max_iter=100).fit_transform(df)
    if 'LLE' in mets:
        # Modified locally linear embedding
        method_results['LLE'] = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=2,
                                                                method='modified').fit_transform(df)
    for met, dat in method_results.items():
        xcol, ycol = f'{met}_x', f'{met}_y'
        h[xcol] = dat[:, 0]
        h[ycol] = dat[:, 1]
        title = hdr + ' ' + met
        seaborn_scatter(h, title, outpath, hue=hue, x=xcol, y=ycol, labels=labels, shape=shape, legend='brief')
def enrich_matrix(df, h, labels='dflt', type='scaled'):
    """ from a passed dataframe, calculate pairwise enrichment
    auto thresholds of features above zscore limited by number """
    zs_cutoff = 2
    num_cutoff = 50
    labels = gt.check_dfltarg(labels, h.label)
    result = pd.DataFrame()
    for sample, _ in zip(df.columns, h.label.values):
        # signature: features beyond the zscore cutoff, capped per direction
        up, dn = sigs.get_sig(df[sample], zs_cutoff, numlim=num_cutoff)
        # test this sample's signature against every sample in df
        hits = sigs.bulk_test_enrich((up, dn), df, h)
        result = pd.concat([result, hits[type]], axis=1)
    # square matrix: sample labels on both axes
    result.columns = h.label
    result.set_index(h.label, inplace=True)
    return result
def run_plate_analysis(mode='ind', cats='nd', path='dflt'):
    """ runs standard analysis on either each plate individually 'ind' or all
    together 'comb'. most useful for plates with doses. the default path will
    be newQC on the desktop

    mode : 'ind' analyzes each ZSVCQNORM gct file separately, 'comb' pools them
    cats : category string passed through to gt.gen_label and analyze_plate
    path : folder searched for '*ZSVCQNORM*' files
    """
    path = gt.check_dfltarg(path, os.path.join(gt.check_desktop(), 'newQC'))
    fl = gt.globit(path, '*ZSVCQNORM*')
    print(fl)
    if mode == 'comb':
        dl, hl = [], []
        for i, f in enumerate(fl):
            d, h = gct.extractgct(f)
            if i == 0:
                # name the pooled dataset after the first plate
                try:
                    pname = d.name + '+'
                except AttributeError:
                    # no .name on the dataframe; fall back to the plate prefix of the first address
                    pname = h.addr[0].split(':')[0] + '+'
            if len(h.batch.unique()) > 1:
                # fix sample labels for plate/batch so wells stay unique across batches
                h.plate = h.plate + h.batch
            # define labels (should I add plate?)
            h = gt.gen_label(h, cats)
            dl.append(d)
            hl.append(h)
        try:
            d = pd.concat(dl, axis=1)
            d.name = pname
        except ValueError:
            sys.exit('no gct file plates to analyze')
        h = pd.concat(hl, axis=0)
        analyze_plate(d, h, cats)
    elif mode == 'ind':
        for f in fl:
            d, h = gct.extractgct(f)
            # define labels (should I add plate?)
            h = gt.gen_label(h, cats)
            analyze_plate(d, h, cats)
def get_vehicle_matrix(path='dflt', batch='all', delim=':', getcells=False):
    """ for the path load all files and collapse vehicles, plot matrix
    batches can be 'all' or 'A' only to just take the first one.
    getcells will re-predict cells

    Returns (vdf, vh): one median vehicle profile per plate/batch and the
    matching one-row-per-profile header dataframe.
    """
    path = gt.check_dfltarg(path, os.path.join(gt.check_desktop(), 'newQC'))
    flv = gt.globit(path, '*Qctrl*')
    if len(flv) == 0:
        # should put in a check to extract from regular qnorms
        flv = gt.globit(path, '*_QNORM_*')
    dlist, hlist = [], []
    for f in flv:
        d, h = gct.extractgct(f)
        # trim plate ids to the 6-char core name
        h['plate'] = h['plate'].apply(lambda x: x[:6])
        d, h = gt.dsub(d, h, {'type': 'vehicle'})
        if batch == 'all':
            # one median profile per batch
            for b in h.batch.unique():
                ds, hs = gt.dsub(d, h, {'batch': b})
                med = ds.median(axis=1)
                hs = gt.gen_label(hs, 'pb', delim=delim)
                dlist.append(med)
                hlist.append(hs.iloc[0])
        elif batch == 'A':
            ds, hs = gt.dsub(d, h, {'batch': 'A'})
            med = ds.median(axis=1)
            hs = gt.gen_label(hs, 'pb', delim=delim)
            dlist.append(med)
            hlist.append(hs.iloc[0])
        else:
            # collapse the whole plate regardless of batch
            med = d.median(axis=1)
            # fix: original referenced undefined 'hs' here (NameError); label the full header h
            hs = gt.gen_label(h, 'p', delim=delim)
            dlist.append(med)
            hlist.append(hs.iloc[0])
    vdf = pd.concat(dlist, axis=1)
    vh = pd.DataFrame(hlist)
    vdf.columns = vh.label
    if getcells is True:
        # re-predict cell identity from each collapsed profile and append to labels
        vh['cell2'] = vh.label.apply(lambda x: predict_cells(vdf[x]))
        vh['label'] = vh.label + delim + vh.cell2
        vdf.columns = vh.label
    return vdf, vh
def find_plot_genes(df, thresh='dflt', lim=75):
    """ open a gct file and make list of genes which have ZS above the threshold,
    now updated to auto determine the list at 97.5% and above, also can input a
    hard value instead

    df     : genes x samples zscore dataframe (expects a .name attribute for printing)
    thresh : hard zscore cutoff, or 'dflt' to auto-pick the 97.5th-percentile max
    lim    : maximum number of genes returned, strongest first
    """
    print(df.name)
    # auto determine threshold, currently 97.5
    thresh = gt.check_dfltarg(thresh, df.quantile(0.975).max())
    # keep rows with at least one value beyond the threshold;
    # .copy() avoids chained-assignment on a masked view (SettingWithCopyWarning)
    subset = df[abs(df) > thresh].dropna(axis=0, how='all').copy()
    # signed extreme per row: the value of largest magnitude, sign preserved
    subset['max'] = subset.apply(lambda x: max(x.min(), x.max(), key=abs), axis=1)
    # order rows by magnitude, strongest first
    subset = subset.reindex(subset['max'].abs().sort_values(ascending=False).index)
    # give brief view of top and bottom vals
    print(subset['max'].head(n=6))
    print(subset['max'].tail(n=6))
    print(len(subset.index.values))
    # slicing already caps at lim and returns everything when shorter
    return subset.index.values[:lim]