Exemple #1
0
def grab_ranks(path, feat, hilo=1, t=40):
    """Summarize the wells in which a given feature is top ranked.

    Every 'ranks.gct' file returned by gt.get_flist(path, 'ranks.gct') is
    loaded, the wells whose rank for *feat* is strictly below *t* are
    collected, and the combined table (columns: plate, well, <feat>) is
    written tab-separated to '<path>/_rank_summary.txt'. The lowest rank seen
    for *feat* across all files is printed.

    Default rank order is descending, i.e. highest zscore = rank 1.

    path: folder to survey; the summary file is written into it
    feat: row label of the feature to look up in each rank table
    hilo: 1 = high, upregulated genes (default rank order);
          0 = low, downregulated genes (rank order is flipped).
          Any value other than 1 flips the order.
    t:    rank threshold, wells with rank < t are kept
    """
    outpath = os.path.join(path, '_rank_summary.txt')
    flist = gt.get_flist(path, 'ranks.gct')
    # dummy starting point for the lowest rank seen
    lowest = 500
    portions = []
    for f in flist:
        d, h = gct.extractgct(f)
        # flip rank order as needed; the previous test (hilo > 1) never fired
        # for the documented hilo=0
        if hilo != 1:
            # NOTE(review): 978 is presumably the number of ranked features;
            # confirm whether 979 - d is wanted to keep ranks 1-based
            d = 978 - d
        feat_ranks = d.loc[feat]
        # column ids for ranks below threshold
        wells = d.columns[feat_ranks < t]
        ranks = pd.DataFrame(d.loc[feat, wells])
        # plate id is the first '-'-delimited token of the file's short name
        ranks['plate'] = gt.get_shn(f).split('-')[0]
        portions.append(ranks)
        # check and store the lowest rank
        newlow = min(feat_ranks)
        if newlow < lowest:
            lowest = newlow
    if portions:
        summary = pd.concat(portions)
    else:
        # no rank files found: still emit a well-formed (empty) table
        summary = pd.DataFrame(columns=['plate', feat])
    # expose the well ids as a column and fix the column order
    summary['well'] = summary.index.values
    summary = summary[['plate', 'well', feat]]
    print('\n', feat, int(lowest))
    summary.to_csv(outpath, sep='\t', index=False)
Exemple #2
0
def get_s2n_genes(g, c1, c2):
    """Run sig_to_noise on two well groups taken from one gct file.

    g:  path handed to gct.extractgct
    c1: 'well' selector for the first group (passed to gt.hsub)
    c2: 'well' selector for the second group (passed to gt.hsub)

    Returns the result of sig_to_noise(first group data, second group data).
    """
    data, header = gct.extractgct(g)
    # resolve each selector to the matching header index (column ids)
    first_cols, second_cols = (
        gt.hsub(header, {'well': selector}).index for selector in (c1, c2))
    first_data = data[first_cols]
    second_data = data[second_cols]
    return sig_to_noise(first_data, second_data)
Exemple #3
0
def run_plate_analysis(mode='ind', cats='nd', path='dflt'):
    """Run analyze_plate on every '*ZSVCQNORM*' file found in path.

    mode: 'ind' analyzes each plate individually, 'comb' concatenates all
          plates (data column-wise, headers row-wise) and analyzes them
          together. Any other value does nothing.
    cats: label categories forwarded to gt.gen_label and analyze_plate
    path: folder to search; the default resolves to 'newQC' on the desktop
          (via gt.check_dfltarg / gt.check_desktop)

    In 'comb' mode the process exits via sys.exit when there is nothing to
    concatenate.
    """
    path = gt.check_dfltarg(path, os.path.join(gt.check_desktop(), 'newQC'))

    fl = gt.globit(path, '*ZSVCQNORM*')

    print(fl)

    if mode == 'comb':
        dl, hl = [], []
        for i, f in enumerate(fl):
            d, h = gct.extractgct(f)
            if i == 0:
                # name the combined set after the first plate; fall back to
                # the plate part of the first address when d has no usable
                # name attribute
                try:
                    pname = d.name + '+'
                except (AttributeError, TypeError):
                    pname = h.addr[0].split(':')[0] + '+'
            if len(h.batch.unique()) > 1:
                # fix sample labels for plate/batch
                h.plate = h.plate + h.batch
            # define labels (should I add plate?)
            h = gt.gen_label(h, cats)
            dl.append(d)
            hl.append(h)
        try:
            d = pd.concat(dl, axis=1)
            d.name = pname
        except ValueError:
            # pd.concat raises ValueError on an empty list
            sys.exit('no gct file plates to analyze')
        h = pd.concat(hl, axis=0)

        analyze_plate(d, h, cats)

    elif mode == 'ind':
        for f in fl:
            d, h = gct.extractgct(f)
            # define labels (should I add plate?)
            h = gt.gen_label(h, cats)

            analyze_plate(d, h, cats)
Exemple #4
0
def run_granularity(path):
    """Tally granularity counts over all '.gct' files in path.

    Each file's data is handed to survey_granularity together with a shared
    Counter (presumably updated in place by that helper; confirm). Entries
    with a count above 1 are written, sorted descending, to
    '<path>/counter.xlsx'.
    """
    tally = cll.Counter()
    for gct_file in gt.get_flist(path, '.gct'):
        data, _header = gct.extractgct(gct_file)
        survey_granularity(data, tally)
    counts = pd.Series(tally, name='count').sort_values(ascending=False)
    # keep only values that occur more than once
    repeated = counts[counts > 1]
    repeated.to_excel(os.path.join(path, 'counter.xlsx'))
Exemple #5
0
def assemble_ref_dat(path):
    """Gather the reference RNA wells from every '*_ref_n*' file in path.

    For each file the data is restricted to wells A02 and B02 (via gt.dsub),
    rounded to 2 decimals, and the pieces are concatenated column-wise.

    Returns the combined data frame.
    """
    ref_frames = []
    for ref_file in gt.globit(path, '*_ref_n*'):
        data, header = gct.extractgct(ref_file)
        data, header = gt.dsub(data, header, {'well': ['A02', 'B02']})
        ref_frames.append(round(data, 2))
    return pd.concat(ref_frames, axis=1)
Exemple #6
0
def plot_lmconcs(flist, stack=False, test=False):
    """Plot landmark concs with full output for all files in list.

    flist: list of gct file paths, or a single string that gt.splitpaths
           splits into '.gct' paths
    stack: when True the images in the output folder are joined vertically
           with imgs.bulk_stack
    test:  forwarded to plottools.plot_landmark_concs
    """
    if isinstance(flist, str):
        flist = gt.splitpaths(flist, '.gct')
    # resolved once up front: it does not depend on the file, and the stack
    # step needs it even when flist is empty (previously an unbound name)
    outpath = gt.dflt_outpath(fldr_name='landmark concs')
    for f in flist:
        d, h = gct.extractgct(f)
        plottools.plot_landmark_concs(d,
                                      h,
                                      genes='all',
                                      labels='wells',
                                      outpath=outpath,
                                      test=test)
    if stack is True:
        imgs.bulk_stack(outpath, orient='vert', delim='_', idx=2)
Exemple #7
0
def compare_plate_genes(flist,
                        genelist,
                        numrows=3,
                        type=True,
                        plate=False,
                        title='dflt',
                        outpath='dflt',
                        remove=True):
    """Plot the listed genes for every gct file, then join the images.

    flist:    list of gct file paths, or one string split by gt.splitpaths
    genelist: row labels of the genes to plot
    numrows:  False joins with imgs.bulk_stack, anything else is passed as
              the row count to imgs.bulk_stack_grid
    type:     currently unused by this function
    plate:    False plots with plot_gene_wtypes, True with make_plateplot
    title:    when not 'dflt' it prefixes the combined image folder name
    outpath:  folder for the individual images; 'dflt' resolves to the
              default 'tmp_imgs' folder
    remove:   when True the individual image folder is deleted afterwards
    """
    if isinstance(flist, str):
        flist = gt.splitpaths(flist, '.gct')
    if outpath == 'dflt':
        outpath = gt.dflt_outpath(fldr_name='tmp_imgs')

    for gct_file in flist:
        data, header = gct.extractgct(gct_file)
        for gene in genelist:
            if plate is True:
                make_plateplot(data.loc[gene],
                               name=data.name + '-' + gene,
                               outpath=outpath)
            elif plate is False:
                plot_gene_wtypes(data.loc[gene],
                                 header,
                                 name=data.name + '-' + gene,
                                 outpath=outpath)

    if title == 'dflt':
        combined_folder = 'combined_imgs'
    else:
        combined_folder = title + ' combined_imgs'
    combined_outpath = gt.dflt_outpath(fldr_name=combined_folder)

    # options shared by both stacking helpers
    stack_opts = {'outpath': combined_outpath, 'delim': '-', 'idx': 1,
                  'pad': .05}
    if numrows is False:
        imgs.bulk_stack(outpath, **stack_opts)
    else:
        imgs.bulk_stack_grid(outpath, numrows=numrows, **stack_opts)

    if remove is True:
        shutil.rmtree(outpath)
Exemple #8
0
def _load_named_gcts(paths):
    """Yield (short name, data, header) for each gct file path in paths."""
    for f in paths:
        d, h = gct.extractgct(f)
        yield gt.get_shn(f), d, h


def predict_cells(input, save=False):
    """Predict cell identity from vehicle wells with the pickled classifier.

    input may be:
      - a directory path: '*_Qctrl_n*' files are used, falling back to
        '*QNORM*' files when none are found
      - a single file path
      - a list of file paths
      - a [d, h] pair (data frame, header frame) that is already loaded
      - a pd.Series holding one profile; a single prediction is returned
        (None when the prediction fails)

    For files / frames, the vehicle wells of each batch are collapsed to
    their per-feature median and one prediction is made per batch.

    save: when True the result table is written tab-separated to the default
          output path as 'cell_predictions.csv'

    Returns a DataFrame indexed by '<name>-<batch>' with a 'cell' column, or
    None when a dataset has no vehicle data.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at a classifier file from a trusted source.
    with open('/Users/WRB/Dropbox/bin/python/celllineclassifier.p',
              'rb') as file:
        clf = pickle.load(file)

    if isinstance(input, pd.Series):
        try:
            return clf.predict([input])[0]
        except Exception:
            print('error with series prediction')
            return None

    is_pair = (isinstance(input, (list, tuple)) and len(input) == 2
               and isinstance(input[0], pd.DataFrame))
    if is_pair:
        # already-loaded data: there is no file to name the result after
        d, h = input
        name = getattr(d, 'name', None)
        if not isinstance(name, str):
            name = 'input'
        datasets = [(name, d, h)]
    else:
        if isinstance(input, str):
            if os.path.isdir(input):
                vlist = gt.globit(input, '*_Qctrl_n*')
                if len(vlist) == 0:
                    vlist = gt.globit(input, '*QNORM*')
            else:
                vlist = [input]
        else:
            vlist = input
        datasets = _load_named_gcts(vlist)

    res_table = pd.DataFrame()
    for base_name, d, h in datasets:
        ds, hs = gt.dsub(d, h, {'type': 'vehicle'})
        if len(ds) == 0:
            print('error, maybe using ZS file? use QNORM instead')
            return None
        for b in hs.batch.unique():
            dsb, hsb = gt.dsub(ds, hs, {'batch': b})
            # consensus profile = per-feature median over the batch's vehicles
            med = dsb.median(axis=1).values
            shn = base_name + '-' + b
            res = clf.predict([med])[0]
            res_table.loc[shn, 'cell'] = res
            print(f'{shn} - {res}')
    if save is True:
        res_table.to_csv(gt.dflt_outpath(fn='cell_predictions.csv'), sep='\t')
    return res_table
Exemple #9
0
def plate_comparison(flist, scat=True, corr=True, dat=False):
    """Compare per-gene summary statistics across a set of gct files.

    For every file the per-row median, standard deviation, coefficient of
    variation (std / mean) and mean are computed; each statistic is gathered
    into one table with a column per plate. Flexible in number of plates.

    flist: list of gct file paths, or one string split by gt.splitpaths
    scat:  when True save a seaborn pairplot of each table
    corr:  when True save the plottools.plot_euclidean matrix of each table
    dat:   when True save each table as an xlsx file

    Output goes to the default 'comparisons' folder. Nothing is produced when
    flist is empty.
    """
    if isinstance(flist, str):
        flist = gt.splitpaths(flist, '.gct')
    outpath = gt.dflt_outpath(fldr_name='comparisons')
    ddict = cll.OrderedDict()
    baseindex = None
    for f in flist:
        name = gt.get_shn(f)
        df, h = gct.extractgct(f)
        ddict[name] = df
        # the first file defines the row index of the summary tables
        if baseindex is None:
            baseindex = df.index
    if baseindex is None:
        print('no gct files to compare')
        return
    medians = pd.DataFrame(index=baseindex)
    medians.name = 'median gene values'
    stdev = pd.DataFrame(index=baseindex)
    stdev.name = 'gene standard deviations'
    cv = pd.DataFrame(index=baseindex)
    cv.name = 'gene coefficient of variation'
    average = pd.DataFrame(index=baseindex)
    average.name = 'gene average'
    for n, d in ddict.items():
        row_mean = d.mean(axis=1)
        row_std = d.std(axis=1)
        medians[n] = d.median(axis=1)
        stdev[n] = row_std
        cv[n] = row_std / row_mean
        average[n] = row_mean
    for dset in [medians, stdev, cv, average]:
        if scat is True:
            sns.pairplot(dset)
            plt.tight_layout()
            plt.suptitle(dset.name)
            plt.savefig(os.path.join(outpath, dset.name + 'scatter.png'))
            plt.close()
        if dat is True:
            dset.to_excel(os.path.join(outpath, dset.name + '.xlsx'))
        if corr is True:
            ax = plottools.plot_euclidean(dset, dset.columns)
            ax.set_title(dset.name)
            plt.tight_layout()
            plt.savefig(os.path.join(outpath, dset.name + 'matrix.png'))
            plt.close()
Exemple #10
0
def get_vehicle_matrix(path='dflt', batch='all', delim=':', getcells=False):
    """Collapse the vehicle wells of every file in path into one matrix.

    Files matching '*Qctrl*' are used, falling back to '*_QNORM_*' when none
    are found. Plate names are truncated to their first 6 characters. Each
    vehicle group is reduced to its per-feature median and becomes one column
    of the result, labelled via gt.gen_label.

    path:     folder to search; the default resolves to 'newQC' on the desktop
    batch:    'all' gives one column per batch of each file ('pb' labels),
              'A' uses only batch A ('pb' labels), any other value collapses
              all vehicles of a file together ('p' labels)
    delim:    delimiter forwarded to gt.gen_label and used when appending the
              predicted cell to the label
    getcells: when True each column is passed to predict_cells; the result is
              stored in header column 'cell2' and appended to the label

    Returns (vdf, vh): the median matrix and the matching header rows.
    """
    path = gt.check_dfltarg(path, os.path.join(gt.check_desktop(), 'newQC'))

    flv = gt.globit(path, '*Qctrl*')
    if len(flv) == 0:
        flv = gt.globit(path, '*_QNORM_*')

    # should put in a check to extract from regular qnorms
    dlist, hlist = [], []
    for f in flv:
        d, h = gct.extractgct(f)
        h['plate'] = h['plate'].apply(lambda x: x[:6])
        d, h = gt.dsub(d, h, {'type': 'vehicle'})
        if batch == 'all':
            batches = h.batch.unique()
        elif batch == 'A':
            batches = ['A']
        else:
            batches = None

        if batches is None:
            # no batch split: label from the whole vehicle header (the old
            # code passed the not-yet-assigned `hs` here)
            med = d.median(axis=1)
            hs = gt.gen_label(h, 'p', delim=delim)
            dlist.append(med)
            hlist.append(hs.iloc[0])
        else:
            for b in batches:
                ds, hs = gt.dsub(d, h, {'batch': b})
                med = ds.median(axis=1)
                hs = gt.gen_label(hs, 'pb', delim=delim)
                dlist.append(med)
                hlist.append(hs.iloc[0])

    vdf = pd.concat(dlist, axis=1)
    vh = pd.DataFrame(hlist)
    # columns must carry the labels before predict_cells looks them up
    vdf.columns = vh.label
    if getcells is True:
        vh['cell2'] = vh.label.apply(lambda x: predict_cells(vdf[x]))
        vh['label'] = vh.label + delim + vh.cell2
    vdf.columns = vh.label
    return vdf, vh
Exemple #11
0
def get_zscore(fpath, save=True, my_mad=None):
    """Compute per-batch robust z-scores against the vehicle wells.

    For every batch (skipping 'na'), the per-feature median and median
    absolute deviation (MAD) of the vehicle wells are computed, and each
    'test' and 'poscon' well is scored as
    (value - median) / (MAD * 1.486), formatted to 3 decimals as a string.

    fpath:  a gct file path, or an already loaded [d, h] pair
    save:   when True the result is written to '<prefix>_ZS.gct', where the
            prefix is fpath up to its first underscore (requires a file
            path); when False (zsdf, hs) is returned instead
    my_mad: optional MAD floor; features whose MAD is below it are scored
            with my_mad in place of their own MAD

    A non-zero MAD below 0.1 is raised to 0.1; a MAD of exactly 0 gives a
    z-score of 0 (unless my_mad applies).

    Raises ValueError when save is True but fpath is not a file path, and
    TypeError when a non-string fpath cannot be unpacked as [d, h].
    """
    # basic setup
    if isinstance(fpath, str):
        # NOTE(review): g is not used below; the calls are kept in case they
        # have side effects -- confirm and drop if not
        g = gct.Gct(fpath)
        g.get_headers()
        df, h = gct.extractgct(fpath)
    else:
        if save is True:
            # fail before the expensive loops instead of at fpath.split below
            raise ValueError(
                'save=True needs a file path to derive the output name; '
                'pass save=False when supplying [d, h]')
        try:
            df, h = fpath[0], fpath[1]
        except (TypeError, IndexError, KeyError) as err:
            raise TypeError(
                'error with path: expected a file path or a [d, h] pair'
            ) from err

    zsd = cll.defaultdict(dict)
    pname = gt.get_shn(fpath)

    for b in h['batch'].dropna().unique():
        if b == 'na':
            continue
        print('running zscore for {} batch {}'.format(pname, b))
        vw = gt.hsub(h, {'batch': b, 'type': 'vehicle'}).index.values
        if len(vw) == 0:
            # NOTE(review): this stops processing ALL remaining batches, not
            # just this one -- confirm `continue` is not what is wanted
            break
        veh = df[vw]
        # get median value across vehicle populations
        med = veh.median(axis=1)

        # populate the absolute deviation values per gene
        ad = cll.defaultdict(list)
        for v in veh.columns:
            for f in veh.index:
                ad[f].append(abs(med[f] - veh[v][f]))
        # assemble the median absolute value per gene
        mad = {}
        for k, v in ad.items():
            r = statistics.median(v)
            # floor tiny (but non-zero) deviations
            if 0 < r < 0.1:
                r = 0.1
            mad[k] = r
        # using the above, progress through test and poscon wells
        # to calculate sample zscores
        tw = list(h[(h['batch'] == b) & (h['type'] == 'test')].index.values)
        pw = list(h[(h['batch'] == b) & (h['type'] == 'poscon')].index.values)
        wells = tw + pw
        for w in df[wells].columns:
            for feat in df.index:
                if my_mad is not None and mad[feat] < my_mad:
                    zs = (df[w][feat] - med[feat]) / (my_mad * 1.486)
                elif mad[feat] == 0:
                    zs = 0
                else:
                    zs = (df[w][feat] - med[feat]) / (mad[feat] * 1.486)
                zsd[w][feat] = '{0:.3f}'.format(zs)

    # transform into dataframe, set index, null nonsense
    zsdf = pd.DataFrame(zsd)
    hs = h.loc[zsdf.columns]
    # values are strings at this point, so infinities appear as 'inf'/'-inf'
    zsdf = zsdf.replace(['inf', '-inf'], np.nan).fillna('nan')
    if save is True:
        outpath = '{}_ZS.gct'.format(fpath.split('_', 1)[0])
        gct.save_headergct(zsdf, hs, outpath)
    else:
        return zsdf, hs
Exemple #12
0
def assemble_consensus(df,
                       h,
                       cats,
                       sc=True,
                       legend='brief',
                       plot=False,
                       skyl=False,
                       n=None,
                       save=False,
                       ret=True,
                       test=False):
    """Assemble replicate zscore consensus signatures.

    The data is split into replicate groups with breakdown(df, h, cats); each
    group is collapsed with consensus() and annotated with the header of its
    first replicate plus these extra fields:
      corr      - replicate correlation (75th percentile of the pairwise
                  pearson correlations, or the single value for two reps,
                  NaN for one rep)
      all ccs   - the list of pairwise correlations
      prom      - 75th percentile of the per-replicate sums of |zscore|
      all proms - the per-replicate sums
      porder    - the wells in the order matching 'all proms'

    df:     data frame, or a gct file path (then h is read from the file)
    h:      header frame matching df
    cats:   breakdown categories, 'nd' for instance
    sc:     when True save a corr-vs-prom scatter plot and an html scatter
    legend: legend mode forwarded to seaborn relplot
    plot:   when True save a seaborn pairplot of the replicates per group
    skyl:   when True save a skyline plot per group
    n:      minimum replicate count; groups with fewer columns are skipped
            and left out of the consensus
    save:   when True save the consensus as '<name>_consensus.gct'
    ret:    when True return (con_data, con_header)
    test:   when True stop after the first group

    Returns (con_data, con_header) when ret is True, otherwise None.
    """

    if isinstance(df, str):
        df, h = gct.extractgct(df)

    outpath = gt.dflt_outpath(fldr_name='output figs')
    # plate name: prefer the frame's name attribute, otherwise the plate part
    # of the first address
    try:
        pname = df.name
    except AttributeError:
        pname = h.addr[0].split(':')[0]
    outpath = os.path.join(outpath, pname)
    os.makedirs(outpath, exist_ok=True)

    subs = breakdown(df, h, cats, dic=False)

    con_data = pd.DataFrame(index=df.index)
    addnl = ['corr', 'all ccs', 'prom', 'all proms', 'porder']
    # header is built with one column per consensus and transposed at the end
    con_header = pd.DataFrame(index=list(h.columns.values) + addnl)

    for ds, hs in subs:
        if n is not None:
            if len(ds.columns) < n:
                print('not enough reps', hs.iloc[0])
                continue

        c = consensus(ds, name='first')
        con_data = pd.concat([con_data, c], axis=1)

        new_annot = hs.iloc[0, :].copy().T
        new_annot.well = hs['well'].values
        new_annot.addr = hs['addr'].values

        # pairwise pearson correlation between all replicate columns
        corrs = []
        for i in range(len(ds.columns)):
            for j in range(1 + i, len(ds.columns)):
                corrs.append(
                    round(ds.iloc[:, i].corr(ds.iloc[:, j], method='pearson'),
                          2))
        if len(corrs) == 0:
            new_annot['corr'] = np.nan
            new_annot['all ccs'] = np.nan
        elif len(corrs) == 1:
            new_annot['corr'] = round(corrs[0], 2)
            new_annot['all ccs'] = corrs
        else:
            new_annot['corr'] = round(np.percentile(corrs, 75), 2)
            new_annot['all ccs'] = corrs
        new_annot['corr'] = pd.to_numeric(new_annot['corr'])

        proms = abs(ds).sum(axis=0).round().values
        porder = hs['well'].values
        new_annot['prom'] = round(np.percentile(proms, 75))
        new_annot['all proms'] = proms
        new_annot['porder'] = porder

        if plot is True:
            # note: this relabels the columns of ds in place
            ds.columns = [x + ' - ' + hs.loc[x]['batch'] for x in ds.columns]
            ax = sns.pairplot(ds)
            myoutpath = os.path.join(outpath, 'rep zs scatter')
            os.makedirs(myoutpath, exist_ok=True)
            plt.savefig(
                os.path.join(myoutpath, h.plate[0] + '-' + ds.name + '.png'))
            plt.close()

        con_header = pd.concat([con_header, new_annot], axis=1)

        if skyl is True:
            myoutpath = os.path.join(outpath, 'skyline')
            os.makedirs(myoutpath, exist_ok=True)
            # include the dose in the name when the header has one
            try:
                name = hs.iloc[0]['name'] + '-' + str(
                    hs.iloc[0]['dose']) + '-' + hs.iloc[0]['batch']
            except (KeyError, TypeError):
                name = hs.iloc[0]['name'] + '-' + hs.iloc[0]['batch']
            name = name.replace('.', ',')
            title = pname + '-' + name
            myoutpath = os.path.join(myoutpath, title)
            skyline.new_skyline(ds, title=title, outpath=myoutpath)

        if test is True:
            break

    con_header = con_header.T

    if sc is True:
        title = pname + ' sc plot'
        sc_outpath = gt.dflt_outpath(fn=pname + '_scplot.png')
        kwargs = {'x': 'corr', 'y': 'prom', 'data': con_header}

        kwargs.update({'alpha': .75, 'style': 'type', 'legend': legend})

        if 'd' in cats:
            kwargs['hue'] = 'name'
            kwargs['size'] = 'dose'
            kwargs['sizes'] = (40, 400)

        # this is experimental
        else:
            kwargs['sizes'] = (50)
            kwargs['hue'] = 'name'

        g = sns.relplot(**kwargs)
        g.fig.suptitle(title)
        g.fig.set_size_inches(7, 5)
        sns_legend = getattr(g, '_legend', None)
        if legend is not None and sns_legend is not None:
            # matplotlib renamed legendHandles to legend_handles
            handles = getattr(sns_legend, 'legend_handles', None)
            if handles is None:
                handles = getattr(sns_legend, 'legendHandles', [])
            for lh in handles:
                lh.set_alpha(.75)
        g.savefig(sc_outpath, bbox_inches='tight')
        plt.close()

        con_header = gt.gen_label(con_header, 'nb')
        newfig, newax = dim_reduct.seaborn_scatter(con_header,
                                                   title,
                                                   sc_outpath,
                                                   x='corr',
                                                   y='prom',
                                                   ptype='ax',
                                                   save=False)
        dim_reduct.html_scatter(newfig, newax, con_header, 'corr', 'prom',
                                title)
        plt.close()

    # use the resolved plate name so a frame without .name does not fail here
    con_data.name = pname

    if save is True:
        gct.save_headergct(con_data, con_header,
                           gt.dflt_outpath(fn=pname + '_consensus.gct'))
    if ret is True:
        return con_data, con_header