import collections as cll
import os
import shutil
import string

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# note: project-internal helpers used below (gt, gct, gcsv, plottools, imgs, and
# functions such as extractmap, builddframegct, calc384well, make_plateplot,
# plot_gene_wtypes) are assumed to be imported/defined elsewhere in the codebase


def openmap(path, ext='all'):
    """ bulk map opening, flexible by file type, but watch out for mismatched
    dimensions of different maps """
    if isinstance(ext, str) and ext == 'all':
        # exts = ['.gct', '.txt', '.xlsx', '.xls']
        exts = ['.gct', '.txt', '.xlsx']
    else:
        # otherwise restrict to the extension(s) passed in
        exts = [ext] if isinstance(ext, str) else list(ext)
    pathlist = []
    if os.path.isdir(path):
        # a directory: gather every map file of each extension
        for extsn in exts:
            pathlist.extend(gt.get_flist(path, ext=extsn))
    else:
        # a single path or a run-on string of paths: split it apart
        pathlist = path
        for extsn in exts:
            pathlist = gt.splitpaths(pathlist, ext=extsn)
    if isinstance(pathlist, str):
        print('only one map')
        combined = extractmap(path)
        return combined
    else:
        combined = []
        plates = [os.path.basename(x) for x in pathlist]
        print(plates)
        for file in pathlist:
            combined.append(extractmap(file))
        combined = pd.concat(combined, axis=0, sort=False)
    if 'plate' not in combined.columns:
        # derive the plate name from a 'plate:well' style index if no plate column exists
        combined['plate'] = combined.index
        combined.plate = combined.plate.apply(lambda x: x.split(':')[0])
    return combined
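
# usage sketch for openmap (the paths below are hypothetical):
#   maps = openmap('/data/maps')                                      # folder of .gct/.txt/.xlsx maps
#   maps = openmap('/data/maps/PCA102.txt /data/maps/PCA103.txt')     # run-on string of map paths
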
def extractgct(pathlist, split=True):
    """ automatically extract and concat dataframe and header files.
    CARE MUST BE TAKEN THAT THE FILES ARE OF THE SAME HEADER/MAP TYPE!
    the split argument will parse a single string of run-on gct paths and
    separate it into a list of individual paths """
    pathlist = gt.splitpaths(pathlist, ext='.gct')
    if not isinstance(pathlist, list):
        pathlist = [pathlist]
    if len(pathlist) == 1:
        if os.path.isdir(pathlist[0]):
            print('directory, getting all gcts')
            pathlist = gt.get_flist(pathlist[0], ext='.gct')
        else:
            # single gct file, no concatenation needed
            d, h = builddframegct(pathlist[0])
            return d, h
    dlist, hlist = [], []
    for path in pathlist:
        print(path)
        d, h = builddframegct(path)
        dlist.append(d)
        hlist.append(h)
    # data joins column-wise (samples), headers join row-wise
    d = pd.concat(dlist, axis=1)
    h = pd.concat(hlist, axis=0)
    print('samples (d/h): ', len(d.columns), len(h.index))
    d.name = dlist[0].name + '+'
    return d, h
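
# usage sketch for extractgct (hypothetical paths):
#   d, h = extractgct('/data/PCA102_ZSVC.gct')                          # single plate
#   d, h = extractgct('/data/PCA102_ZSVC.gct /data/PCA103_ZSVC.gct')    # run-on string, concatenated
#   d, h = extractgct('/data/gct_folder/')                              # every gct in a directory
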
def merge96to384(flist):
    """ pass in four txt/xlsx files named like PCA102_A, PCA102_B, ... and write
    the assembled 384-well plate out to that same directory.
    quadrant layout:
        A B
        C D
    """
    if isinstance(flist, str):
        if '.txt' in flist:
            ext = '.txt'
        elif '.xlsx' in flist:
            ext = '.xlsx'
        else:
            print('map extension error')
        flist = gt.splitpaths(flist, ext=ext)
    fullplate = pd.DataFrame()
    namelist = []
    for b in string.ascii_uppercase[:4]:
        try:
            file = [x for x in flist if f'_{b}' in x][0]
            if ext == '.xlsx':
                m = pd.read_excel(file)
            elif ext == '.txt':
                m = pd.read_csv(file, sep='\t')
            if b == 'A':
                hdrs = m.columns
        except IndexError:
            # no file found for this quadrant; fill it with an empty 96-well block
            print(f'batch {b} not present')
            m = pd.DataFrame(columns=hdrs)
            m['well'] = gt.well_range('A01', 'H12')
            m['type'] = 'empty'
        # translate the 96-well coordinates into their 384-well positions
        m['well'] = m['well'].copy().apply(lambda w: calc384well(w, b))
        m['batch'] = b
        fullplate = pd.concat([fullplate, m], axis=0)
        namelist.append(os.path.split(file)[-1].split('_')[0])
    fullplate.loc[fullplate['type'] == 'empty', 'batch'] = np.nan
    outdir = os.path.split(flist[0])[0]
    fullplate.sort_values('well', inplace=True)
    name = namelist[0]
    if len(set(namelist)) != 1:
        print("base plate names didn't agree")
        name += '_merged'
    fullplate.to_excel(os.path.join(outdir, name + '.xlsx'), index=False)
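
# usage sketch for merge96to384 (hypothetical file names; the four quadrant files
# are expected to share a base name and carry _A/_B/_C/_D suffixes):
#   merge96to384('/maps/PCA102_A.xlsx /maps/PCA102_B.xlsx /maps/PCA102_C.xlsx /maps/PCA102_D.xlsx')
#   # -> writes /maps/PCA102.xlsx with a 'batch' column marking each source quadrant
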
def plot_inv_level(flist, inv_level, scale=False):
    """ create a plate plot of a given invariant gene level across plates, to see
    whether any spatial pattern shows up; input csvs can be either fullqnorm or
    regular qnorm. scale=True expresses the level relative to invariant level 10 """
    flist = gt.splitpaths(flist, ext='.csv')
    for f in flist:
        d = gcsv.open_as_gct(f)
        shn = '_'.join(os.path.split(f)[-1].split('_')[:2])
        if scale is False:
            vctr = d.loc[f'0_INV_{inv_level}']
        elif scale is True:
            # ratio of the chosen invariant level to invariant level 10
            vctr = d.loc[f'0_INV_{inv_level}'] / d.loc['0_INV_10']
        make_plateplot(vctr, name=shn + '-lvl' + str(inv_level))
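
# usage sketch for plot_inv_level (hypothetical csv paths):
#   plot_inv_level('/data/PCA102_qnorm.csv /data/PCA103_qnorm.csv', inv_level=5)
#   plot_inv_level('/data/PCA102_qnorm.csv', inv_level=5, scale=True)   # as a ratio to level 10
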
def plot_lmconcs(flist, stack=False, test=False):
    """ plot landmark concentrations with full output for all files in the list,
    with optional stacking of the resulting images """
    if isinstance(flist, str):
        flist = gt.splitpaths(flist, '.gct')
    for f in flist:
        d, h = gct.extractgct(f)
        # ds, hs = gt.dsub(d, h, {'name': ['5-Iodotubercidin', 'ERK5-IN-1']})
        outpath = gt.dflt_outpath(fldr_name='landmark concs')
        plottools.plot_landmark_concs(d, h, genes='all', labels='wells',
                                      outpath=outpath, test=test)
    if stack is True:
        imgs.bulk_stack(outpath, orient='vert', delim='_', idx=2)
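
# usage sketch for plot_lmconcs (hypothetical paths):
#   plot_lmconcs('/data/PCA102_ZSVC.gct /data/PCA103_ZSVC.gct', stack=True)
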
def compare_plate_genes(flist, genelist, numrows=3, type=True, plate=False,
                        title='dflt', outpath='dflt', remove=True):
    """ plots the listed genes across the dataframes provided in the list of file
    paths flist. the plots will be generated and then combined using imgs bulk stack.
    orient is the direction of the joined images. type will include sample type color
    coding -- should add in grid support --- """
    if isinstance(flist, str):
        flist = gt.splitpaths(flist, '.gct')
    if outpath == 'dflt':
        outpath = gt.dflt_outpath(fldr_name='tmp_imgs')
    for f in flist:
        d, h = gct.extractgct(f)
        for g in genelist:
            if plate is False:
                plot_gene_wtypes(d.loc[g], h, name=d.name + '-' + g, outpath=outpath)
            elif plate is True:
                make_plateplot(d.loc[g], name=d.name + '-' + g, outpath=outpath)
    if title != 'dflt':
        combined_outpath = gt.dflt_outpath(fldr_name=title + ' combined_imgs')
    else:
        combined_outpath = gt.dflt_outpath(fldr_name='combined_imgs')
    if numrows is False:
        imgs.bulk_stack(outpath, outpath=combined_outpath, delim='-', idx=1, pad=.05)
    else:
        imgs.bulk_stack_grid(outpath, outpath=combined_outpath, numrows=numrows,
                             delim='-', idx=1, pad=.05)
    if remove is True:
        shutil.rmtree(outpath)
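
# usage sketch for compare_plate_genes (hypothetical paths and placeholder gene ids):
#   compare_plate_genes('/data/PCA102.gct /data/PCA103.gct', ['GENE1', 'GENE2'],
#                       plate=True, title='panel check')
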
def plate_comparison(flist, scat=True, corr=True, dat=False):
    """ for a range of gct files, consolidate median, mean, std, cv from each
    and then create pairwise scatterplots for each. flexible in number of plates """
    if isinstance(flist, str):
        flist = gt.splitpaths(flist, '.gct')
    outpath = gt.dflt_outpath(fldr_name='comparisons')
    ddict = cll.OrderedDict()
    hdict = cll.OrderedDict()
    for i, f in enumerate(flist):
        name = gt.get_shn(f)
        df, h = gct.extractgct(f)
        ddict[name], hdict[name] = df, h
        if i == 0:
            # use the first plate's gene index as the shared index for the summary tables
            baseindex = df.index
            medians = pd.DataFrame(index=baseindex)
            medians.name = 'median gene values'
            stdev = pd.DataFrame(index=baseindex)
            stdev.name = 'gene standard deviations'
            cv = pd.DataFrame(index=baseindex)
            cv.name = 'gene coefficient of variation'
            average = pd.DataFrame(index=baseindex)
            average.name = 'gene average'
    for n, d in ddict.items():
        medians[n] = d.median(axis=1)
        stdev[n] = d.std(axis=1)
        cv[n] = d.std(axis=1) / d.mean(axis=1)
        average[n] = d.mean(axis=1)
    for dset in [medians, stdev, cv, average]:
        if scat is True:
            sns.pairplot(dset)
            plt.tight_layout()
            plt.suptitle(dset.name)
            plt.savefig(os.path.join(outpath, dset.name + 'scatter.png'))
            plt.close()
        if dat is True:
            dset.to_excel(os.path.join(outpath, dset.name + '.xlsx'))
        if corr is True:
            ax = plottools.plot_euclidean(dset, dset.columns)
            ax.set_title(dset.name)
            plt.tight_layout()
            plt.savefig(os.path.join(outpath, dset.name + 'matrix.png'))
            plt.close()
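
# usage sketch for plate_comparison (hypothetical paths); writes scatter and matrix
# images (plus optional xlsx tables) into the default 'comparisons' output folder:
#   plate_comparison('/data/PCA102.gct /data/PCA103.gct /data/PCA104.gct', dat=True)
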
def load_sig(fpaths):
    """ load an (up, dn) signature tuple from two .grp files, passed either as one
    run-on string or as a list of two paths """
    if isinstance(fpaths, str):
        pathlist = gt.splitpaths(fpaths, ext='.grp')
        up_path = [x for x in pathlist if '_up' in x][0]
        dn_path = [x for x in pathlist if '_dn' in x][0]
    else:
        try:
            # print('signature load {} paths.'.format(len(fpaths)))
            up_path = [x for x in fpaths if '_up' in x][0]
            dn_path = [x for x in fpaths if '_dn' in x][0]
        except IndexError:
            print('error with load sig input')
            # placeholders so the failure surfaces at open() below rather than as a NameError
            up_path, dn_path = 'foo', 'foo'
    with open(up_path, 'r') as file:
        up = file.readlines()
        up = [x.strip() for x in up]
    with open(dn_path, 'r') as file2:
        dn = file2.readlines()
        dn = [x.strip() for x in dn]
    return (up, dn)
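
# usage sketch for load_sig (hypothetical file names; one file per direction,
# tagged '_up' and '_dn'):
#   up_genes, dn_genes = load_sig('/sigs/mysig_up.grp /sigs/mysig_dn.grp')
#   up_genes, dn_genes = load_sig(['/sigs/mysig_up.grp', '/sigs/mysig_dn.grp'])
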