Beispiel #1
0
def openmap(path, ext='all'):
    """Open one or more plate maps and return them as a single dataframe.

    Flexible by file type, but watch out for mismatched dimensions between
    different maps: they are concatenated row-wise with no alignment check.

    Args:
        path: a directory to search for maps, or a path string that is handed
            to ``gt.splitpaths`` (presumably one or several run-on paths --
            confirm against that helper).
        ext: ``'all'`` (default) to look for .gct, .txt and .xlsx maps, a
            single extension string such as ``'.txt'``, or an iterable of
            extension strings.

    Returns:
        The result of ``extractmap`` when only one map path is found,
        otherwise the row-wise concatenation of every extracted map. If the
        concatenated maps have no 'plate' column, one is derived from the
        index by keeping the text before the first ':'.
    """
    # Resolve the extension list for every accepted form of ``ext``;
    # previously anything other than 'all' left ``exts`` unbound.
    if isinstance(ext, str):
        # '.xls' was listed here at one point but is currently disabled.
        exts = ['.gct', '.txt', '.xlsx'] if ext == 'all' else [ext]
    else:
        exts = list(ext)

    if os.path.isdir(path):
        pathlist = []
        for extsn in exts:
            pathlist.extend(gt.get_flist(path, ext=extsn))
    else:
        pathlist = path
        for extsn in exts:
            pathlist = gt.splitpaths(pathlist, ext=extsn)

    # The loading step runs for both branches above. It used to live inside
    # the ``else``, so a directory input collected its files and then
    # returned None.
    if isinstance(pathlist, str):
        print('only one map')
        return extractmap(path)

    plates = [os.path.basename(x) for x in pathlist]
    print(plates)
    combined = pd.concat([extractmap(file) for file in pathlist],
                         axis=0,
                         sort=False)
    if 'plate' not in combined.columns:
        combined['plate'] = combined.index
        combined.plate = combined.plate.apply(lambda x: x.split(':')[0])
    return combined
Beispiel #2
0
def extractgct(pathlist, split=True):
    """Load one or several gct files and return (data, header) dataframes.

    CARE MUST BE TAKEN THAT THE FILES ARE OF THE SAME HEADER/MAP TYPE!

    ``pathlist`` is first passed through ``gt.splitpaths`` with the '.gct'
    extension, so a single string of run-on gct paths is accepted as well as
    a list. A single path that is a directory is expanded to the gct files
    found in it; a single non-directory path is loaded and returned directly.
    With several files the data frames are joined column-wise and the headers
    row-wise, and the combined data frame takes the first frame's name with
    '+' appended.

    The ``split`` argument is accepted but not used by this function.
    """
    paths = gt.splitpaths(pathlist, ext='.gct')
    if not isinstance(paths, list):
        paths = [paths]

    if len(paths) == 1:
        only = paths[0]
        if not os.path.isdir(only):
            # plain single file: no concatenation and no name change
            d, h = builddframegct(only)
            return d, h
        print(f'directory, getting all gcts')
        paths = gt.get_flist(only, ext='.gct')

    frames = []
    headers = []
    for gct_path in paths:
        print(gct_path)
        frame, header = builddframegct(gct_path)
        frames.append(frame)
        headers.append(header)

    # samples are columns of the data and rows of the header
    d = pd.concat(frames, axis=1)
    h = pd.concat(headers, axis=0)
    print('samples (d/h): ', len(d.columns), len(h.index))
    d.name = frames[0].name + '+'

    return d, h
Beispiel #3
0
def merge96to384(flist):
    """Merge up to four 96-well map files into one 384-well map (xlsx).

    Pass in txt/xlsx files named like PCA102_A, PCA102_B, ...; the merged map
    is written into the directory of the first path in ``flist``.
    layout: A   B
            C   D

    Args:
        flist: a list of map file paths, or one string of paths which is
            split with ``gt.splitpaths``. For each batch letter A-D the first
            path containing ``_<letter>`` is used.

    Raises:
        ValueError: if a string input mentions neither '.txt' nor '.xlsx', if
            a selected file has any other extension, or if none of the four
            batches is present.

    Notes:
        A missing batch is filled with placeholder wells A01-H12 of type
        'empty', and its 'batch' value is blanked in the output. Errors
        raised while reading a file that is present propagate to the caller;
        they are no longer reported as a missing batch.
    """
    if isinstance(flist, str):
        if '.txt' in flist:
            ext = '.txt'
        elif '.xlsx' in flist:
            ext = '.xlsx'
        else:
            raise ValueError(
                'map extension error: expected .txt or .xlsx paths')
        flist = gt.splitpaths(flist, ext=ext)
        if isinstance(flist, str):
            # presumably splitpaths can hand back a bare string for a single
            # path -- TODO confirm; normalise so indexing below sees paths
            flist = [flist]

    batches = string.ascii_uppercase[:4]

    # First pass: read every batch that has a file. The reader is chosen per
    # file, so list input (where no common extension was derived) works too.
    loaded = {}
    namelist = []
    for b in batches:
        matches = [x for x in flist if f'_{b}' in x]
        if not matches:
            print(f'batch {b} not present')
            continue
        file = matches[0]
        suffix = os.path.splitext(file)[1].lower()
        if suffix == '.xlsx':
            m = pd.read_excel(file)
        elif suffix == '.txt':
            # tab-delimited text needs read_csv; read_excel has no ``sep``
            m = pd.read_csv(file, sep='\t')
        else:
            raise ValueError(f'map extension error: {file}')
        loaded[b] = m
        # plate name is the file name up to the first underscore
        namelist.append(os.path.split(file)[-1].split('_')[0])

    if not loaded:
        raise ValueError('none of the batches A-D were found in flist')

    # Placeholder quadrants borrow their columns from the first batch that
    # was read (batch A whenever it is present).
    hdrs = next(iter(loaded.values())).columns

    # Second pass: assemble the quadrants in A, B, C, D order.
    frames = []
    for b in batches:
        if b in loaded:
            m = loaded[b]
        else:
            m = pd.DataFrame(columns=hdrs)
            m['well'] = gt.well_range('A01', 'H12')
            m['type'] = 'empty'
        m['well'] = m['well'].copy().apply(lambda w: calc384well(w, b))
        m['batch'] = b
        frames.append(m)

    fullplate = pd.concat(frames, axis=0)

    if 'type' in fullplate.columns:
        fullplate.loc[fullplate['type'] == 'empty', 'batch'] = np.nan

    outdir = os.path.split(flist[0])[0]

    fullplate.sort_values('well', inplace=True)

    name = namelist[0]

    if len(set(namelist)) != 1:
        print("base plate names didn't agree")
        name += '_merged'

    fullplate.to_excel(os.path.join(outdir, name + '.xlsx'), index=False)
Beispiel #4
0
def plot_inv_level(flist, inv_level, scale=False):
    """Draw one plate plot per csv file for the given invariant level.

    Intended for spotting plate patterns; the input can be either fullqnorm
    or reg qnorm. ``flist`` is split into '.csv' paths with
    ``gt.splitpaths``. For each file the row labelled ``0_INV_<inv_level>``
    is plotted, divided by the ``0_INV_10`` row when ``scale`` is True. The
    plot name is the first two '_'-separated parts of the file name followed
    by '-lvl<inv_level>'.
    """
    for csv_path in gt.splitpaths(flist, ext='.csv'):
        d = gcsv.open_as_gct(csv_path)
        name_parts = os.path.basename(csv_path).split('_')
        shn = '_'.join(name_parts[:2])
        # identity checks on purpose: only the literal booleans pick a branch
        if scale is True:
            vctr = d.loc[f'0_INV_{inv_level}'] / d.loc['0_INV_10']
        elif scale is False:
            vctr = d.loc[f'0_INV_{inv_level}']
        plot_name = shn + '-lvl' + str(inv_level)
        make_plateplot(vctr, name=plot_name)
Beispiel #5
0
def plot_lmconcs(flist, stack=False, test=False):
    """Plot landmark concentrations, with full output, for every gct file.

    ``flist`` may be a list of paths or a single string, which is split into
    '.gct' paths. Each file is loaded with ``gct.extractgct`` and passed to
    ``plottools.plot_landmark_concs`` for all genes, labelled by wells. When
    ``stack`` is True the images in the output folder are joined afterwards
    with ``imgs.bulk_stack``.
    """
    paths = gt.splitpaths(flist, '.gct') if isinstance(flist, str) else flist

    for gct_path in paths:
        d, h = gct.extractgct(gct_path)
        outpath = gt.dflt_outpath(fldr_name='landmark concs')
        plot_opts = dict(genes='all',
                         labels='wells',
                         outpath=outpath,
                         test=test)
        plottools.plot_landmark_concs(d, h, **plot_opts)

    # NOTE: outpath is only bound once the loop above has run at least once
    if stack is True:
        imgs.bulk_stack(outpath, orient='vert', delim='_', idx=2)
Beispiel #6
0
def compare_plate_genes(flist,
                        genelist,
                        numrows=3,
                        type=True,
                        plate=False,
                        title='dflt',
                        outpath='dflt',
                        remove=True):
    """Plot the listed genes for every gct file in flist and join the images.

    One image per (file, gene) is written to ``outpath`` (a default 'tmp_imgs'
    folder when left as 'dflt'): a plate plot when ``plate`` is True, a
    per-gene plot via ``plot_gene_wtypes`` when it is False. The images are
    then combined into a 'combined_imgs' folder (prefixed with ``title`` when
    one is given) -- as a grid with ``numrows`` rows, or as a plain stack when
    ``numrows`` is False. With ``remove`` True the per-gene image folder is
    deleted afterwards.

    The ``type`` argument is accepted but not read by this function.
    """
    paths = gt.splitpaths(flist, '.gct') if isinstance(flist, str) else flist
    if outpath == 'dflt':
        img_dir = gt.dflt_outpath(fldr_name='tmp_imgs')
    else:
        img_dir = outpath

    for gct_path in paths:
        d, h = gct.extractgct(gct_path)
        for gene in genelist:
            # only the literal booleans select a plot style
            if plate is True:
                make_plateplot(d.loc[gene],
                               name=d.name + '-' + gene,
                               outpath=img_dir)
            elif plate is False:
                plot_gene_wtypes(d.loc[gene],
                                 h,
                                 name=d.name + '-' + gene,
                                 outpath=img_dir)

    if title == 'dflt':
        combined_name = 'combined_imgs'
    else:
        combined_name = title + ' combined_imgs'
    combined_outpath = gt.dflt_outpath(fldr_name=combined_name)

    # options shared by both stacking helpers
    stack_opts = dict(outpath=combined_outpath, delim='-', idx=1, pad=.05)
    if numrows is False:
        imgs.bulk_stack(img_dir, **stack_opts)
    else:
        imgs.bulk_stack_grid(img_dir, numrows=numrows, **stack_opts)

    if remove is True:
        shutil.rmtree(img_dir)
Beispiel #7
0
def plate_comparison(flist, scat=True, corr=True, dat=False):
    """Summarise per-gene statistics of several gct files and compare plates.

    For every file the row-wise median, standard deviation, coefficient of
    variation (std / mean) and mean are collected into one table per
    statistic, with one column per plate. The number of plates is flexible.
    For each table, depending on the flags:
      scat -- save a seaborn pairplot as '<table name>scatter.png'
      dat  -- save the table as '<table name>.xlsx'
      corr -- save the ``plottools.plot_euclidean`` plot as
              '<table name>matrix.png'
    Everything is written to the default 'comparisons' output folder.
    """
    paths = gt.splitpaths(flist, '.gct') if isinstance(flist, str) else flist
    outpath = gt.dflt_outpath(fldr_name='comparisons')

    ddict, hdict = cll.OrderedDict(), cll.OrderedDict()
    for position, gct_path in enumerate(paths):
        name = gt.get_shn(gct_path)
        df, h = gct.extractgct(gct_path)
        ddict[name] = df
        hdict[name] = h  # headers are collected but not used further here
        if position == 0:
            # the first file's row index is shared by all summary tables
            baseindex = df.index

    titles = [
        'median gene values',
        'gene standard deviations',
        'gene coefficient of variation',
        'gene average',
    ]
    summaries = []
    for table_title in titles:
        table = pd.DataFrame(index=baseindex)
        table.name = table_title
        summaries.append(table)
    medians, stdev, cv, average = summaries

    for n, d in ddict.items():
        medians[n] = d.median(axis=1)
        stdev[n] = d.std(axis=1)
        cv[n] = d.std(axis=1) / d.mean(axis=1)
        average[n] = d.mean(axis=1)

    for dset in summaries:
        if scat is True:
            sns.pairplot(dset)
            plt.tight_layout()
            plt.suptitle(dset.name)
            scatter_file = os.path.join(outpath, dset.name + 'scatter.png')
            plt.savefig(scatter_file)
            plt.close()
        if dat is True:
            table_file = os.path.join(outpath, dset.name + '.xlsx')
            dset.to_excel(table_file)
        if corr is True:
            ax = plottools.plot_euclidean(dset, dset.columns)
            ax.set_title(dset.name)
            plt.tight_layout()
            matrix_file = os.path.join(outpath, dset.name + 'matrix.png')
            plt.savefig(matrix_file)
            plt.close()
Beispiel #8
0
def load_sig(fpaths):
    """Load an up/down signature from a pair of files.

    Args:
        fpaths: either one string, which is split into '.grp' paths with
            ``gt.splitpaths``, or an iterable of paths. The first path
            containing '_up' is read as the up list and the first containing
            '_dn' as the down list.

    Returns:
        A tuple ``(up, dn)`` of lists holding the whitespace-stripped lines
        of each file, in file order.

    Raises:
        ValueError: if ``fpaths`` is not a string or iterable, or if no
            '_up' or no '_dn' path can be found in it.
    """
    if isinstance(fpaths, str):
        pathlist = gt.splitpaths(fpaths, ext='.grp')
        if isinstance(pathlist, str):
            # presumably a lone path can come back as a bare string --
            # TODO confirm; wrap it so the search below sees whole paths
            pathlist = [pathlist]
    else:
        try:
            pathlist = list(fpaths)
        except TypeError:
            raise ValueError(
                'error with load sig input: expected a string or an '
                'iterable of paths') from None

    # One shared lookup for both input forms. A missing file is reported
    # explicitly instead of surfacing later as an unbound variable.
    up_path = next((x for x in pathlist if '_up' in str(x)), None)
    dn_path = next((x for x in pathlist if '_dn' in str(x)), None)
    if up_path is None or dn_path is None:
        raise ValueError(
            "error with load sig input: need one '_up' and one '_dn' path")

    with open(up_path, 'r') as file:
        up = [line.strip() for line in file]

    with open(dn_path, 'r') as file2:
        dn = [line.strip() for line in file2]

    return (up, dn)