Ejemplo n.º 1
0
Archivo: qc.py Proyecto: wrbutton/foo
def run_ref_pca(path='dflt', data='dflt', save=True, label=True):
    """Plot a two-component PCA of new reference samples over bulk reference data.

    The bulk data is drawn in grey and the newly assembled reference samples
    in red, so the new samples can be judged against the existing spread.

    Args:
        path: folder passed to ``assemble_ref_dat``; 'dflt' resolves to the
            default 'newQC' output folder.
        data: background data set. 'dflt' loads the hard-coded reference file,
            any other string is read as the path of a tab-delimited file, and
            any other object is used as-is (it is concatenated column-wise
            with the reference samples).
        save: when True, save the figure as 'pca.png' in the default output
            location.
        label: when True, annotate every reference point with its column name.
    """
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='newQC')
    refdf = assemble_ref_dat(path)

    if isinstance(data, str):
        if data == 'dflt':
            data = '/Users/WRB/Dropbox/Areas of Focus/_Genometry/Analysis Projects/reference_ref_pca.csv'
        # any string is treated as a file path; previously a non-default
        # string left bulk_data undefined
        bulk_data = pd.read_csv(data, delimiter='\t', index_col=0)
    else:
        bulk_data = data

    # drop the B01 columns; non-string column labels cannot be tested with
    # 'in', in which case the frame is left untouched
    try:
        refdf = refdf[[x for x in refdf.columns if 'B01' not in x]]
    except TypeError:
        pass
    # count AFTER filtering so the trailing-rows slice below lines up with
    # the reference columns that were actually concatenated
    ns = len(refdf.columns)

    alldata = pd.concat([bulk_data, refdf], axis=1)
    pca = decomposition.PCA(n_components=2).fit_transform(alldata.T)
    fig, ax = plt.subplots()
    ax.scatter(pca[:, 0], pca[:, 1], c='grey')

    # pca[-0:] would select every row, so skip highlighting when no
    # reference samples remain
    if ns:
        ref_pts = pca[-ns:]
        ax.scatter(ref_pts[:, 0], ref_pts[:, 1], c='red')
        if label is True:
            for l, x, y in zip(refdf.columns.values, ref_pts[:, 0], ref_pts[:, 1]):
                plt.annotate(l, xy=(x, y), xytext=(-2, 2),
                             textcoords='offset points', ha='right', va='bottom')

    ax.set_title('REF RNA PCA')
    if save is True:
        outpath = gt.dflt_outpath(fldr_name=None, fn='pca.png')
        plt.savefig(outpath)
Ejemplo n.º 2
0
def make_plots_from_list(d,
                         h,
                         welllist,
                         cats='nd',
                         outpath='dflt',
                         test=False):
    """Plot one skyline image per well id from the replicates matching each well.

    Args:
        d: data frame whose columns are well ids; must carry a ``name`` attribute,
            which prefixes every plot title.
        h: header frame passed to ``gt.get_well_reps`` to look up replicates.
        welllist: iterable of well ids (can be 3 char).
        cats: category string forwarded to ``gt.get_well_reps``.
        outpath: output folder; 'dflt' resolves through ``gt.dflt_outpath()``.
        test: when True, stop after the first image.
    """
    pname = d.name
    if outpath == 'dflt':
        outpath = gt.dflt_outpath()
    for w in welllist:
        wells = gt.get_well_reps(h, w, cats, df=True)
        # the lookup answers with the string 'empty' when nothing matches;
        # test the type first instead of relying on the ValueError a
        # DataFrame comparison raises when used as a condition
        if isinstance(wells, str) and wells == 'empty':
            continue
        first = wells.iloc[0]
        name = (first['name'] + '-' + first['dose']).replace('.', ',')
        title = pname + '-' + name
        # keep only replicate wells that are present in the data
        wids = [x for x in wells.index.values if x in d.columns]
        myoutpath = os.path.join(outpath, title)
        new_skyline(d[wids], title=title, outpath=myoutpath)
        if test is True:
            print('test mode, exiting after one image')
            break
Ejemplo n.º 3
0
def analyze_plate(d, h, cats):
    """Generate the standard output figures for a dataset (assumed to carry doses).

    Builds the consensus data, then produces a tSNE pass, a dendrogram and a
    sorted correlation matrix from it. When 'd' is among ``cats`` the
    dose-response gene plots are produced from the original data as well.

    Args:
        d: data frame for the plate; its ``name`` is copied onto the consensus.
        h: header frame matching ``d``.
        cats: category string forwarded to the consensus assembly.
    """
    # consensus data/header; the call is also asked to save and make the SC plot
    cons_d, cons_h = pa.assemble_consensus(d, h, cats, save=True, sc=True)
    cons_h = gt.gen_label(cons_h, 'nd')
    cons_d.name = d.name

    # simple first-pass tSNE with two parameter settings
    dim_reduct.tsne2(cons_d, cons_h, px=10, lr=[10, 150], inter=True)

    # dendrogram only, no heatmap
    dim_reduct.make_dendrogram(cons_d, labels=cons_h.label, outpath=True)

    # correlation matrix of the sorted consensus, labelled by name
    corr_options = dict(title='dflt',
                        sort=True,
                        outpath=True,
                        sparselabel=True,
                        grid=True,
                        labels=cons_h.name)
    plottools.plot_correlation_matrix(cons_d, cons_h, **corr_options)

    if 'd' not in cats:
        return

    # neither result is used below, but both calls are kept so the function
    # behaves exactly as before (dflt_outpath may have side effects -- TODO confirm)
    cats.replace('d', '')
    gt.dflt_outpath(fldr_name='landmark concs')

    # find genes that move and plot their dose-response plots
    plottools.plot_ex_genes(d, h, n=10, mode='med')
Ejemplo n.º 4
0
Archivo: sigs.py Proyecto: wrbutton/foo
def make_barview_range(edf,
                       argdict,
                       across='dose',
                       label=False,
                       outpath=False):
    """Plot one barview per condition across a range of conditions (default dose).

    Args:
        edf: enrichment score results.
        argdict: subsetting criteria passed to ``gt.hsub``; its values are
            joined with spaces to form the figure title. The dict is not
            modified.
        across: column whose sorted unique values define the panels.
        label: when True, pass each condition to ``make_barview`` as its label.
        outpath: when True, save '<title>_enrich.png' into the default 'foo'
            output folder and close the figure.
    """
    cond_range = sorted(gt.hsub(edf, argdict)[across].unique())

    print(argdict.values())
    mytitle = ' '.join(argdict.values())

    # squeeze=False always returns a 2-D array of axes, so indexing works
    # even when there is only a single condition
    fig, axarr = plt.subplots(1, len(cond_range), sharey='row', squeeze=False)

    for i, cond in enumerate(cond_range):
        my_ax = axarr[0][i]
        # copy so the caller's dictionary is not overwritten with the
        # last condition plotted
        new_argdict = dict(argdict)
        if across is not None:
            new_argdict[across] = cond
        if label is True:
            make_barview(edf, new_argdict, ax=my_ax, label=cond)
        else:
            make_barview(edf, new_argdict, ax=my_ax)

    plt.suptitle(mytitle)
    plt.tight_layout()
    # leave room above the panels for the suptitle
    plt.subplots_adjust(top=0.9)

    if outpath is True:
        outpath = gt.dflt_outpath(fldr_name='foo')
        myoutpath = os.path.join(outpath, mytitle + '_enrich.png')
        plt.savefig(myoutpath)
        plt.close()
Ejemplo n.º 5
0
Archivo: sigs.py Proyecto: wrbutton/foo
def bulk_test_enrich(sig, df, h, outpath=False):
    """Score every column of a dataframe against an up/down signature.

    Args:
        sig: signature as a tuple of (up, down).
        df: data frame; each column is passed to ``test_enrichment``.
        h: optional header frame; when not None the scores are inner-merged
            onto it by index.
        outpath: False to skip saving, 'dflt' to write
            '<df.name>_enrichment.xlsx' to the default output location, or an
            explicit path for the excel file.

    Returns:
        Frame of enrichment results sorted by the 'scaled' column, descending.
    """
    up, dn = sig[0], sig[1]
    per_column = {col: test_enrichment(df[col], up, dn) for col in df.columns}
    edf = pd.DataFrame(per_column).T

    # local scaling: positives relative to the largest score, negatives
    # relative to the magnitude of the smallest one
    top = edf['absolute'].max()
    bottom = edf['absolute'].min()

    def _scale(val):
        if val > 0:
            return val / top
        if val < 0:
            return val / (-1 * bottom)
        return 0

    def _three_places(val):
        return float('{:.3f}'.format(val))

    edf['scaled'] = edf['absolute'].apply(_scale)
    edf['scaled'] = edf['scaled'].apply(_three_places)
    edf = edf[['scaled', 'absolute', 'up', 'dn']]

    # optionally merge results with the sample header
    if h is not None:
        edf = pd.merge(h, edf, left_index=True, right_index=True, how='inner')
    edf = edf.sort_values('scaled', ascending=False)

    if outpath is not False:
        if outpath == 'dflt':
            outpath = gt.dflt_outpath(fn=df.name + '_enrichment.xlsx')
        edf.to_excel(outpath)
    return edf
Ejemplo n.º 6
0
def plot_gene(sample_set,
              h=None,
              name='dflt',
              outpath='dflt',
              close=True,
              width=8):
    """Basic single-gene scatter plot, optionally colour coded by sample type.

    All values are drawn in grey; when a header is provided, vehicle samples
    are overdrawn in blue and poscon samples in red.

    Args:
        sample_set: series of values for one gene.
        h: optional header frame used to pick out vehicle/poscon samples.
            It is copied, so the caller's frame is not modified.
        name: plot title and file stem; 'dflt' uses ``sample_set.name``.
        outpath: output folder; 'dflt' resolves through ``gt.dflt_outpath``.
        close: when True, save '<name>.png' and close the figure; otherwise
            the axes are returned.
        width: figure width forwarded to ``format_concentration_plot``.

    Returns:
        The axes when ``close`` is not True, otherwise None.
    """
    if name == 'dflt':
        name = sample_set.name
    if outpath == 'dflt':
        outpath = gt.dflt_outpath(fldr_name='dflt')
    xrange = len(list(sample_set))
    dtype = check_plottype(sample_set)
    ax = format_concentration_plot(xrange, ptype=dtype, width=width)
    ax.scatter(range(xrange), sample_set.values, color='grey')
    ax.set_title(name)
    if h is not None:
        # work on a copy so the caller's header does not silently gain an
        # 'order' column
        h = h.copy()
        # NOTE(review): 'order' is 1-based while the grey points above are
        # drawn at 0-based positions, so the coloured overlay may sit one
        # position to the right -- confirm the intended alignment
        h['order'] = np.arange(1, len(h) + 1)
        dv, hv = gt.dsub(sample_set, h, {'type': 'vehicle'})
        ax.scatter(hv.order, dv.values, color='blue')
        dp, hp = gt.dsub(sample_set, h, {'type': 'poscon'})
        ax.scatter(hp.order, dp.values, color='red')
    if close is True:
        plt.savefig(os.path.join(outpath, name + '.png'))
        plt.close()
    else:
        return ax
Ejemplo n.º 7
0
Archivo: qc.py Proyecto: wrbutton/foo
def check_data(path='dflt'):
    """Check each final .gct data file against its map file and report mismatches.

    For every .gct file in ``path`` a map file (.txt or .xlsx) whose path
    contains the data file's short name is looked up. The wells present in
    the data are then compared with the vehicle/poscon/test wells of the map.

    Args:
        path: folder to scan; 'dflt' resolves to the default 'finaldata' folder.
    """
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='finaldata')

    flist = gt.get_flist(path, ext='.gct')
    maplist = gt.get_flist(path, ext='.txt')
    maplist.extend((gt.get_flist(path, ext='.xlsx')))

    for f in flist:
        shn = gt.get_shn(f).split('.')[0]
        matches = [x for x in maplist if shn in x]
        if not matches:
            # without a map there is nothing to compare against; skip the
            # file rather than reusing the previous file's map
            print(f'error with map file {shn}')
            continue
        mapfile = matches[0]

        g = gct.Gct(f)
        g.get_headers()
        g.get_wells()
        datwells = g.wells

        mymap = gct.extractmap(mapfile)

        mapwells = gt.hsub(mymap, {'type':['vehicle', 'poscon', 'test']})['well'].values

        # wells that have data but are not described by the map
        res = set(datwells) - set(mapwells)

        if len(res) == 0:
            print(f'{shn} ok, {380-len(datwells)} failed wells')
        else:
            print(f'eror with map/data {shn}, {len(datwells)}/{len(mapwells)}')
Ejemplo n.º 8
0
def gen_euclideans(df, labels='dflt', rot=None, tick_denom=1, test=False):
    """Save euclidean matrix plots over a grid of upper trim limits and font sizes.

    One image is written to the default 'matrices' folder for every
    combination of font size (8, 5) and upper limit, where the limit is a
    fraction (1, .75, .5, .3, .15) of the rounded maximum distance.

    Args:
        df: data frame to compute distances from.
        labels: labels forwarded to ``plot_euclidean``.
        rot: label rotation forwarded to ``plot_euclidean``.
        tick_denom: tick denominator forwarded to ``plot_euclidean``.
        test: when True, exit the interpreter (``sys.exit``) after the first image.
    """
    outdir = gt.dflt_outpath(fldr_name='matrices')

    # frames without a name attribute fall back to the prefix of the first
    # column label (the text before ':')
    try:
        name = df.name
    except AttributeError:
        name = df.columns[0].split(':')[0]

    Y = get_euclidean(df)

    maxv = round(Y.max())

    lims = [1, .75, .5, .3, .15]

    for fs in [8, 5]:
        for ul in lims:
            cap = int(round(maxv * ul))
            plot_euclidean(df,
                           labels=labels,
                           upper=cap,
                           fontsize=fs,
                           dat=Y,
                           tick_denom=tick_denom,
                           rot=rot)
            # decimal points in the limit become commas in the file name
            outpath = os.path.join(
                outdir,
                name + f'_euclidean_ul{str(ul).replace(".",",")}-fs{fs}.png')
            plt.savefig(outpath)
            plt.close()
            if test is True:
                sys.exit('test mode, quitting after one')
Ejemplo n.º 9
0
Archivo: qc.py Proyecto: wrbutton/foo
def check_final(path='dflt'):
    """Check rows/columns, failed wells and decimal places of final data files.

    Prints one summary line per .gct file found in ``path``.

    Args:
        path: folder to scan; 'dflt' resolves to the default 'finaldata' folder.
    """
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='finaldata')

    f_list = gt.get_flist(path, ext='.gct')

    for file in f_list:
        g = gct.Gct(file)
        g.get_headers()
        # map file is expected next to the data with a .txt extension
        # NOTE(review): the original nested try/except that was meant to fall
        # back to '.xlsx' could never trigger (string concatenation does not
        # raise), so only the .txt map has ever been used -- confirm whether
        # an xlsx fallback is wanted
        txt = g.file.split('.')[0] + '.txt'

        # defined up front so the summary line can always be printed, even
        # when the map file is missing
        dplaces = 'n/a'
        try:
            fail_info = sub_check_failed(g, txt)
            print(fail_info)
            fails, fail_result = fail_info
            result = sub_check_lines(g) and sub_check_columns(g) and fail_result
            dplaces = sub_check_decimal(g)
        except FileNotFoundError:
            result = False
            fails = 'no map!!'

        print('{} - {} - {} failed wells - {} dplaces'.format(g.shortname, result, fails, dplaces))
Ejemplo n.º 10
0
def compare_plate_genes(flist,
                        genelist,
                        numrows=3,
                        type=True,
                        plate=False,
                        title='dflt',
                        outpath='dflt',
                        remove=True):
    """Plot the listed genes for every file and stack the images together.

    One image per (file, gene) is written into ``outpath``; the images are
    then combined with the imgs bulk stack helpers into a 'combined_imgs'
    folder (prefixed by ``title`` when given).

    Args:
        flist: list of .gct paths, or a string split with ``gt.splitpaths``.
        genelist: gene ids to plot; each is looked up with ``d.loc``.
        numrows: rows of the combined grid; False selects the plain stack.
        type: not used by this function.
        plate: False draws typed gene plots, True draws plate plots; any
            other value draws nothing.
        title: optional prefix for the combined output folder.
        outpath: folder for the individual images; 'dflt' is the default
            'tmp_imgs' folder.
        remove: when True, the whole ``outpath`` folder is deleted afterwards.
    """
    if isinstance(flist, str):
        flist = gt.splitpaths(flist, '.gct')
    if outpath == 'dflt':
        outpath = gt.dflt_outpath(fldr_name='tmp_imgs')

    for f in flist:
        d, h = gct.extractgct(f)
        for g in genelist:
            if plate is False:
                plotter, extra = plot_gene_wtypes, (h,)
            elif plate is True:
                plotter, extra = make_plateplot, ()
            else:
                continue
            plotter(d.loc[g], *extra, name=d.name + '-' + g, outpath=outpath)

    if title == 'dflt':
        combined_outpath = gt.dflt_outpath(fldr_name='combined_imgs')
    else:
        combined_outpath = gt.dflt_outpath(fldr_name=title + ' combined_imgs')

    # images are grouped on the second '-' delimited field of their name
    stack_options = dict(outpath=combined_outpath, delim='-', idx=1, pad=.05)
    if numrows is False:
        imgs.bulk_stack(outpath, **stack_options)
    else:
        stack_options['numrows'] = numrows
        imgs.bulk_stack_grid(outpath, **stack_options)

    if remove is True:
        shutil.rmtree(outpath)
Ejemplo n.º 11
0
def html_scatter(fig, ax, h, x_col, y_col, title, labels='dflt', outpath='dflt'):
    """Save the figure as an html page with hover labels on the points of ``h``.

    A nearly transparent scatter of ``h[x_col]`` against ``h[y_col]`` is
    drawn on ``ax`` to carry the tooltips, then the figure is written to
    '<title>.html' inside ``outpath``.

    Args:
        fig: figure to save.
        ax: axes receiving the tooltip scatter.
        h: header frame holding the coordinate columns.
        x_col: name of the x coordinate column.
        y_col: name of the y coordinate column.
        title: file stem of the html output.
        labels: tooltip labels; the default resolves to ``h.label``.
        outpath: output folder; the default resolves to ``gt.dflt_outpath('foo')``.
    """
    outpath = gt.check_dfltarg(outpath, gt.dflt_outpath('foo'))
    labels = gt.check_dfltarg(labels, h.label)

    xs = h[x_col].tolist()
    ys = h[y_col].tolist()
    points = ax.scatter(xs, ys, alpha=0.001)

    mpld3.plugins.connect(
        fig, mpld3.plugins.PointLabelTooltip(points, labels=labels))
    mpld3.save_html(fig, os.path.join(outpath, title + '.html'))
Ejemplo n.º 12
0
Archivo: qc.py Proyecto: wrbutton/foo
def distribute_qc(path='dflt'):
    """Copy QC images into per-category folders for further processing.

    PNG files under the input folder are matched against a fixed list of
    glob patterns and copied into a matching sub-folder of the default
    'QCprocessing' output folder.

    Args:
        path: folder searched for QC images; 'dflt' resolves to the default
            'newQC' folder. (Any other value used to fail with a NameError;
            it is now used as the input folder.)
    """
    # compare by value: identity comparison against a string literal is unreliable
    inpath = gt.dflt_outpath(fldr_name='newQC') if path == 'dflt' else path
    outpath = gt.dflt_outpath(fldr_name='QCprocessing')

    folders = ['calibs', 'flogps', 'escore', 'cellid-nolabel', 'cellid-label', 'euclidean']
    folders = [os.path.join(outpath, x) for x in folders]
    srch_terms = ['finalqc/*calibplot', 'finalqc/*FLOGP', 'escore_summary*/', '*cell_line/*cellid_nolabel/*-*cellid_circle',
                  '*cell_line/*-*cellid_circle', '-*euclidean']

    for term, fold in zip(srch_terms, folders):
        os.makedirs(fold, exist_ok=True)
        # wrap the term in wildcards: '*<term>*.png'
        srch = '*'.join(['', term, '']) + '.png'
        for file in gt.globit(inpath, srch):
            shutil.copy(file, fold)
Ejemplo n.º 13
0
def plot_kmeans_clusters(df, cat_dict):
    """Plot one sorted correlation matrix per entry of a label dictionary.

    Designed for the dictionary output of 'kmeans_clusters'. Each image is
    written next to the default output path for ``df.name`` with the suffix
    '_<category>_clusters.png'.

    Args:
        df: dataset to plot; must carry a ``name`` attribute.
        cat_dict: mapping of category -> labels for that clustering.
    """
    base_path = gt.dflt_outpath(fn=df.name)

    for cat in cat_dict:
        plottools.plot_correlation_matrix(
            df,
            labels=cat_dict[cat],
            sort=True,
            outpath=base_path + '_{}_clusters.png'.format(cat),
            sparselabel=True)
Ejemplo n.º 14
0
def run_methods(df, h, outpath='dflt', hdr='dflt', hue='name', mets='dflt', shape=None, labels='dflt', scaling=None):
    """Run a selection of alternate dimension reduction techniques and plot each.

    Every requested method is reduced to two components; the coordinates are
    written into ``h`` as '<method>_x' / '<method>_y' columns and a scatter
    plot titled '<hdr> <method>' is produced with ``seaborn_scatter``.

    Args:
        df: data frame; transposed first when its first column label
            contains ':'.
        h: header frame; receives the coordinate columns (modified in place).
        outpath: output folder; the default is the 'dim_reduct' folder.
        hdr: title prefix; the default is ``df.name``, or the prefix of the
            first column label when the frame has no name.
        hue: column used for colouring.
        mets: methods to run, any of 'PCA', 'ISO', 'MDS', 'LLE'; the default
            runs all four.
        shape: forwarded to ``seaborn_scatter``.
        labels: point labels; the default is ``h.label``.
        scaling: None, 'maxabs', 'robust' or 'std'; any other value prints
            an error and leaves the data unscaled.
    """
    method_results = dict()

    n_neighbors = 15

    # work out the fallback title on its own, so that a missing df.name can
    # no longer overwrite a header the caller passed explicitly
    try:
        default_hdr = df.name
    except AttributeError:
        default_hdr = df.columns[0].split(':')[0]
    hdr = gt.check_dfltarg(hdr, default_hdr)
    outpath = gt.check_dfltarg(outpath, gt.dflt_outpath(fldr_name='dim_reduct'))
    labels = gt.check_dfltarg(labels, h.label)
    mets = gt.check_dfltarg(mets, ['PCA','ISO','MDS','LLE'])

    if ':' in df.columns.values[0]:
        df = df.T

    if scaling is not None:
        if scaling == 'maxabs':
            df = MaxAbsScaler().fit_transform(df)
        elif scaling == 'robust':
            df = RobustScaler().fit_transform(df)
        elif scaling == 'std':
            df = StandardScaler(with_mean=False).fit_transform(df)
        else:
            print('error with scaling')

    if 'PCA' in mets:
        # projection on to the first 2 principal components
        method_results['PCA'] = decomposition.PCA(n_components=2).fit_transform(df)

    if 'ISO' in mets:
        # Isomap projection; n_neighbors is passed by keyword because the
        # estimator parameters are keyword-only in current scikit-learn
        method_results['ISO'] = manifold.Isomap(n_neighbors=n_neighbors,
                                                n_components=2).fit_transform(df)

    if 'MDS' in mets:
        # MDS embedding
        method_results['MDS'] = manifold.MDS(n_components=2, n_init=1, max_iter=100).fit_transform(df)

    if 'LLE' in mets:
        # modified locally linear embedding
        method_results['LLE'] = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                                                n_components=2,
                                                                method='modified').fit_transform(df)

    for met, dat in method_results.items():
        xcol, ycol = f'{met}_x', f'{met}_y'
        h[xcol] = dat[:, 0]
        h[ycol] = dat[:, 1]

        title = hdr + ' ' + met

        seaborn_scatter(h, title, outpath, hue=hue, x=xcol, y=ycol, labels=labels, shape=shape, legend='brief')
Ejemplo n.º 15
0
def plot_lmconcs(flist, stack=False, test=False):
    """Plot landmark concs with full output for all files, with optional joins.

    Args:
        flist: list of .gct paths, or a string split with ``gt.splitpaths``.
        stack: when True, stack the resulting images vertically afterwards.
        test: forwarded to ``plottools.plot_landmark_concs``.
    """
    if isinstance(flist, str):
        flist = gt.splitpaths(flist, '.gct')
    # resolved once up front: the folder is the same for every file, and
    # the stacking step below needs it even when flist is empty
    outpath = gt.dflt_outpath(fldr_name='landmark concs')
    for f in flist:
        d, h = gct.extractgct(f)
        plottools.plot_landmark_concs(d,
                                      h,
                                      genes='all',
                                      labels='wells',
                                      outpath=outpath,
                                      test=test)
    if stack is True:
        imgs.bulk_stack(outpath, orient='vert', delim='_', idx=2)
Ejemplo n.º 16
0
Archivo: qc.py Proyecto: wrbutton/foo
def resize_qc(path='dflt'):
    """Resize every png under a folder (recursively) according to its type.

    The resizing routine is chosen from a substring of the file path; each
    routine is called with the same path as source and destination, and the
    first matching substring wins. Files matching none are left alone.

    Args:
        path: folder to walk; 'dflt' resolves to the default 'QCprocessing'
            folder.
    """
    # compare by value: identity comparison against a string literal is unreliable
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='QCprocessing')
    fl = glob.glob(path + '/**/*.png', recursive=True)
    for f in fl:
        if 'calibplot' in f:
            calplot(f, f)
        elif '_es1' in f:
            escoresum(f, f)
        elif 'FLOGP' in f:
            flogp(f, f)
        elif 'euclidean' in f:
            euclidean(f, f)
        elif 'cellid' in f:
            cellid(f, f)
Ejemplo n.º 17
0
def make_dendrogram(df, labels='dflt', orient='top', outpath=True, trunc=False, res=False):
    """Draw a dendrogram of the columns of ``df`` using scipy ward clustering.

    Args:
        df: data frame; its columns are the leaves.
        labels: leaf labels; 'dflt' uses the column names.
        orient: dendrogram orientation.
        outpath: True saves '<name>_dend.png' to the default output location,
            False saves nothing, any other value is used as the save path.
        trunc: anything other than False folds up the last detailed splits
            ('lastp' truncation with contracted branches shown).
        res: only used when nothing is saved; False returns the axes,
            otherwise the dendrogram result is returned.

    Returns:
        None when the figure was saved (it is closed as well); otherwise the
        axes or the dendrogram result, depending on ``res``.
    """
    linked = linkage(df.T, 'ward')
    fig, ax = plt.subplots()

    if isinstance(labels, str) and labels == 'dflt':
        labels = df.columns

    # options shared by the full and the truncated drawing
    draw_options = dict(orientation=orient,
                        labels=labels,
                        distance_sort='descending',
                        show_leaf_counts=True,
                        leaf_rotation=90)
    if trunc is not False:
        draw_options.update(truncate_mode='lastp', show_contracted=True)
    dend = dendrogram(linked, **draw_options)

    name = gt.dflt_name(df)
    fig.suptitle(f'{name} n={len(df.columns)} dendrogram', fontweight='bold')
    plt.tight_layout()

    # nothing to save: hand back the requested object
    if outpath is not True and not (outpath != False):
        return ax if res is False else dend

    if outpath is True:
        outpath = gt.dflt_outpath(fn=name + '_dend.png')
    plt.savefig(outpath)
    plt.close()
Ejemplo n.º 18
0
def predict_cells(input, save=False):
    """Predict the cell line of vehicle samples with v1.0 of the SVM classifier.

    Vehicle wells are consolidated per batch into a median profile, which is
    then classified.

    Args:
        input: one of
            - a directory: its '*_Qctrl_n*' files are used, or its '*QNORM*'
              files when there are none;
            - a single file path;
            - a list of file paths;
            - a pandas Series, classified directly;
            - a [data, header] pair of DataFrames (previously this path
              always failed; it is now handled explicitly).
        save: when True, the result table is written tab-delimited to
            'cell_predictions.csv' in the default output location.

    Returns:
        For a Series, the predicted value (None when prediction fails).
        Otherwise a DataFrame indexed by '<name>-<batch>' with a 'cell'
        column, or None when a dataset has no vehicle data.
    """
    # NOTE: pickle.load can execute arbitrary code; this is only acceptable
    # because the classifier is a trusted, locally maintained file
    with open('/Users/WRB/Dropbox/bin/python/celllineclassifier.p',
              'rb') as file:
        clf = pickle.load(file)

    if isinstance(input, pd.Series):
        try:
            return clf.predict([input])[0]
        except Exception:
            print('error with series prediction')
            return None

    if isinstance(input, str):
        if os.path.isdir(input):
            vlist = gt.globit(input, '*_Qctrl_n*')
            if len(vlist) == 0:
                vlist = gt.globit(input, '*QNORM*')
        else:
            vlist = [input]
    else:
        vlist = input

    is_frame_pair = (isinstance(vlist, (list, tuple)) and len(vlist) == 2
                     and all(isinstance(x, pd.DataFrame) for x in vlist))

    def _plates():
        # yield (short name, data, header) for everything to classify
        if is_frame_pair:
            yield str(getattr(vlist[0], 'name', 'data')), vlist[0], vlist[1]
            return
        for f in vlist:
            # a file that cannot be read now raises instead of silently
            # reusing the data of the previous file
            d, h = gct.extractgct(f)
            yield gt.get_shn(f), d, h

    res_table = pd.DataFrame()
    for shn_base, d, h in _plates():
        ds, hs = gt.dsub(d, h, {'type': 'vehicle'})
        if len(ds) == 0:
            print('error, maybe using ZS file? use QNORM instead')
            return None
        for b in hs.batch.unique():
            dsb, hsb = gt.dsub(ds, hs, {'batch': b})
            # consensus of the batch's vehicle wells
            med = dsb.median(axis=1).values
            shn = shn_base + '-' + b
            res = clf.predict([med])[0]
            res_table.loc[shn, 'cell'] = res
            print(f'{shn} - {res}')
    if save is True:
        res_table.to_csv(gt.dflt_outpath(fn='cell_predictions.csv'), sep='\t')
    return res_table
Ejemplo n.º 19
0
Archivo: sigs.py Proyecto: wrbutton/foo
def save_sig(sig, gs=False, name='dflt', path='dflt'):
    """Save the up and down lists of a signature as two .grp files.

    Writes '<name>_up.grp' and '<name>_dn.grp' into ``path``.

    Args:
        sig: signature as a tuple of (up, down).
        gs: when True, convert both lists with ``convert_to_symbols`` first.
        name: file stem; 'dflt' uses 'mysig'.
        path: destination folder; 'dflt' resolves to the default 'foo' folder.
    """
    # compare by value: identity comparison against a string literal is unreliable
    if name == 'dflt':
        name = 'mysig'
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='foo')
    up, dn = sig[0], sig[1]
    if gs is True:
        up = convert_to_symbols(up)
        dn = convert_to_symbols(dn)
    gt.savelist(up, os.path.join(path, name + '_up.grp'))
    gt.savelist(dn, os.path.join(path, name + '_dn.grp'))
Ejemplo n.º 20
0
def plate_comparison(flist, scat=True, corr=True, dat=False):
    """Compare per-gene summary statistics across any number of gct files.

    For each file the per-gene median, standard deviation, coefficient of
    variation and mean are collected into one table per statistic (one
    column per plate). Outputs go to the default 'comparisons' folder.

    Args:
        flist: list of .gct paths, or a string split with ``gt.splitpaths``.
        scat: when True, save a pairwise scatterplot grid per statistic.
        corr: when True, save a euclidean matrix plot per statistic.
        dat: when True, save each table as an excel file.
    """
    if isinstance(flist, str):
        flist = gt.splitpaths(flist, '.gct')
    outpath = gt.dflt_outpath(fldr_name='comparisons')

    ddict = cll.OrderedDict()
    for f in flist:
        name = gt.get_shn(f)
        df, _ = gct.extractgct(f)
        ddict[name] = df
    if not ddict:
        # nothing to compare; previously this fell through to a NameError
        print('no gct files provided, nothing to compare')
        return
    # all tables share the gene index of the first plate
    baseindex = next(iter(ddict.values())).index

    summaries = [
        ('median gene values', lambda d: d.median(axis=1)),
        ('gene standard deviations', lambda d: d.std(axis=1)),
        ('gene coefficient of variation', lambda d: d.std(axis=1) / d.mean(axis=1)),
        ('gene average', lambda d: d.mean(axis=1)),
    ]
    for title, stat in summaries:
        dset = pd.DataFrame(index=baseindex)
        dset.name = title
        for n, d in ddict.items():
            dset[n] = stat(d)

        if scat is True:
            sns.pairplot(dset)
            plt.tight_layout()
            plt.suptitle(dset.name)
            plt.savefig(os.path.join(outpath, dset.name + 'scatter.png'))
            plt.close()
        if dat is True:
            dset.to_excel(os.path.join(outpath, dset.name + '.xlsx'))
        if corr is True:
            ax = plottools.plot_euclidean(dset, dset.columns)
            ax.set_title(dset.name)
            plt.tight_layout()
            plt.savefig(os.path.join(outpath, dset.name + 'matrix.png'))
            plt.close()
Ejemplo n.º 21
0
Archivo: sigs.py Proyecto: wrbutton/foo
def get_and_save_sig(inst, t, gs=False, name='dflt', path='dflt'):
    """Generate a signature with ``get_sig`` and save it as two .grp files.

    Writes '<name>_up.grp' and '<name>_dn.grp' into ``path``.

    Args:
        inst: instance handed to ``get_sig``; its ``name`` is the default
            file stem.
        t: second argument handed to ``get_sig``.
        gs: when True, save the lists in terms of gene symbols.
        name: file stem; 'dflt' uses ``inst.name`` with ':' replaced by '-'.
        path: destination folder; 'dflt' resolves to the default 'foo' folder.

    Returns:
        Tuple (up, dn) of the saved lists.
    """
    # compare by value: identity comparison against a string literal is unreliable
    if name == 'dflt':
        name = inst.name
        # ':' is swapped for '-' in the file stem; a non-string name is left as is
        try:
            name = name.replace(':', '-')
        except AttributeError:
            pass
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='foo')
    up, dn = get_sig(inst, t)
    if gs is True:
        up = convert_to_symbols(up)
        dn = convert_to_symbols(dn)
    gt.savelist(up, os.path.join(path, name + '_up.grp'))
    gt.savelist(dn, os.path.join(path, name + '_dn.grp'))
    return (up, dn)
Ejemplo n.º 22
0
Archivo: gcsv.py Proyecto: wrbutton/foo
def summarize_csvs(path):
    """Summarize levels 1 and 10 for every csv file in a folder.

    For each file the plate mean of 'Analyte 10' and 'Analyte 1' is
    recorded, together with the 'Analyte 10' value of B1 (recorded as
    'Pos-L10') and the mean of A2/B2 (recorded as 'Ref-L10'). The table is
    written tab-delimited to 'csv_summary.txt' inside ``path``.

    Args:
        path: folder containing the csv files; None resolves to the default
            'csv' folder.
    """
    if path is None:
        path = gt.dflt_outpath(fldr_name='csv')
    results = cll.defaultdict(dict)
    f_list = gt.get_flist(path, '.csv')
    for file in f_list:
        try:
            c = Gcsv(file)
            d = c.build_dframe()
            summary = results[c.shortname]
            summary['plate-L10'] = d['Analyte 10'].mean(axis=0)
            # label-based lookups via .loc; the .ix indexer used before has
            # been removed from pandas
            summary['Pos-L10'] = d.loc['B1', 'Analyte 10']
            summary['Ref-L10'] = d.loc[['A2', 'B2'], 'Analyte 10'].mean()
            summary['plate-L1'] = d['Analyte 1'].mean(axis=0)
        except Exception as err:
            # keep going with the other files, but say why this one failed
            print('error with ' + file)
            print(repr(err))
    res = pd.DataFrame(results)
    res = res.T
    outpath = os.path.join(path, 'csv_summary.txt')
    res.to_csv(outpath, sep='\t', float_format='%.0f')
Ejemplo n.º 23
0
Archivo: qc.py Proyecto: wrbutton/foo
def combine_fails(path='dflt', ret=False, summ=False, sep=False, thresh=1):
    """Combine all '*QC_fail*' files of a folder into one summary table.

    Each file is read tab-delimited (skipping its first row), stripped of
    'Unnamed' columns and incomplete rows, and appended to the others. The
    result is written to 'QCfail_summary.txt' inside ``path``.

    Args:
        path: folder to search; 'dflt' resolves to the default 'newQC' folder.
        ret: when True, return the combined table.
        summ: when True, print the counts per 'PERT_DESC' and per 'Batch'
            that exceed ``thresh``.
        sep: when falsy, rows whose 'Batch' value is the text 'Batch'
            (repeated header rows) are removed.
        thresh: minimum count (exclusive) for an entry to be printed.

    Returns:
        The combined DataFrame when ``ret`` is True (empty when no files were
        found), otherwise None.
    """
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='newQC')
    fl = gt.globit(path, '*QC_fail*')
    datlist = []
    for f in fl:
        dat = pd.read_csv(f, sep='\t', skiprows=1)
        dropcols = [x for x in dat.columns if 'Unnamed' in x]
        dat = dat.drop(dropcols, axis=1)
        dat = dat.dropna()
        # files without a Batch column are kept as they are
        if 'Batch' in dat.columns:
            dat = dat[dat['Batch'] != ' ']
            if not sep:
                dat = dat[dat['Batch'] != 'Batch']
        datlist.append(dat)

    if not datlist:
        # pd.concat refuses an empty list; report and stop instead
        print('no QC_fail files found in ' + str(path))
        return pd.DataFrame() if ret is True else None

    data = pd.concat(datlist, axis=0)
    data.to_csv(os.path.join(path, 'QCfail_summary.txt'), sep='\t')

    if summ is True:
        gbname = data.groupby('PERT_DESC').size()
        print(gbname[gbname > thresh])
        gbbatch = data.groupby('Batch').size()
        print(gbbatch[gbbatch > thresh])

    if ret is True:
        return data
Ejemplo n.º 24
0
def plate_map_vis(myseries, cmap='dflt', path='dflt'):
    """Draw a plate plot of a categorical series, one colour per category.

    Each distinct value is mapped to an integer (in order of first
    appearance) and the integer series is handed to
    ``plottools.plot_plateplot`` with the mapping as the colour bar legend.

    Args:
        myseries: categorical series; its ``name`` is used for the output name.
        cmap: colour map name; 'dflt' picks 'tab10' for fewer than 10
            categories and 'tab20' otherwise.
        path: output folder; 'dflt' resolves through ``gt.dflt_outpath``.
    """
    categories = myseries.unique()
    num_cats = len(categories)
    cat2num = dict(zip(categories, range(num_cats)))
    data = myseries.apply(lambda x: cat2num[x])

    # the category capacity is needed whichever colour map is used; it used
    # to be undefined when a custom cmap was passed
    maxcats = 10 if num_cats < 10 else 20
    if cmap == 'dflt':
        cmap = 'tab10' if num_cats < 10 else 'tab20'

    if path == 'dflt':
        outpath = gt.dflt_outpath(fn=myseries.name)
    else:
        outpath = os.path.join(path, myseries.name)
    plottools.plot_plateplot(data,
                             outpath=outpath,
                             label=data.name,
                             ncats=maxcats,
                             cmap=cmap,
                             clrbar=cat2num)
Ejemplo n.º 25
0
Archivo: gt.py Proyecto: wrbutton/foo
def separate_subset_folders(path, mylist, down=False, dest='dflt'):
    """Copy the folders matching each search term from ``path`` to ``dest``.

    For every term the first matching directory is copied as-is (the whole
    tree). A destination that already exists is left alone; terms without a
    matching directory are printed.

    Args:
        path: source location. It is used as a plain string prefix of the
            glob pattern, so it normally needs a trailing separator.
        mylist: search terms.
        down: False matches '<path><term>*'; anything else matches
            '<path>*/<term>', i.e. one level further down.
        dest: destination folder; 'dflt' resolves through ``gt.dflt_outpath``.
    """
    # compare by value: identity comparison against a string literal is unreliable
    if dest == 'dflt':
        dest = gt.dflt_outpath()

    for st in mylist:
        if down is False:
            pattern = path + st + '*'
        else:
            pattern = path + '*/' + st

        cplist = [x for x in glob.glob(pattern) if os.path.isdir(x)]
        if not cplist:
            print(st, ' not found')
            continue

        dirpath = cplist[0]
        bn = os.path.basename(dirpath)
        try:
            shutil.copytree(dirpath, os.path.join(dest, bn))
        except FileExistsError:
            # already transferred earlier; keep the existing copy
            pass
Ejemplo n.º 26
0
def _cohort_max_abs(vdict):
    """Return the rounded largest absolute value found across all cohorts."""
    # the value containers vary, so progressively more general ways of
    # measuring one cohort are tried until one works for all of them
    measures = (
        lambda v: max(abs(max(v)), abs(min(v))),
        lambda v: max(abs(v)),
        lambda v: abs(v).max(),
    )
    for measure in measures:
        try:
            return round(max([measure(v) for v in vdict.values()]))
        except Exception:
            continue
    # last resort; a failure here is allowed to propagate
    return round(max([abs(v).max().max() for v in vdict.values()]))


def plot_cohorts(vdict,
                 outpath='dflt',
                 mode='sep',
                 dtype='auto',
                 maxx='dflt',
                 title='foo',
                 label=True,
                 incr=1,
                 size=20):
    """Plot several named cohorts of values on one plot, one colour per cohort.

    The figure is saved as '<title>.png' in ``outpath`` and closed.

    Args:
        vdict: mapping of cohort name -> values (can be promiscuity values
            or a concentration range).
        outpath: output folder; 'dflt' resolves through ``gt.dflt_outpath``.
        mode: 'sep' gives every value its own x position, 'tog' stacks each
            cohort on a single x position.
        dtype: plot type handed to ``format_concentration_plot``; 'auto'
            determines it from the first cohort with ``check_plottype``.
        maxx: x extent; recomputed from the data for modes 'sep' and 'tog'.
        title: plot title and file stem; 'foo' falls back to the name (or
            first column) of the first cohort.
        label: when True, write each cohort's name below its points.
        incr: x spacing between positions.
        size: marker size.
    """
    if outpath == 'dflt':
        outpath = gt.dflt_outpath()
    # colour palette, cycled every 10 cohorts
    cmap = plt.get_cmap('tab10')
    # parameters controlling the optional labels below each cohort
    txt_args = {'fontsize': 8, 'rotation': 90, 'fontweight': 'bold'}
    x_pos = 1

    # largest magnitude plus 10% headroom
    maxv = _cohort_max_abs(vdict)
    maxv += maxv * 0.1

    # x range: number of positions times the spacing, plus one spacing
    if mode == 'sep':
        try:
            maxx = sum([len(x.columns) for x in vdict.values()]) * incr + incr
        except AttributeError:
            maxx = sum([len(x) for x in vdict.values()]) * incr + incr
    elif mode == 'tog':
        maxx = len(vdict.keys()) * incr + incr

    # the first value set decides the plot formatting
    myvals = next(iter(vdict.values()))
    if dtype == 'auto':
        dtype = check_plottype(myvals)
    # widen the y range for z-score plots with large values
    if dtype == 'zs' and maxv > 10:
        maxy = round(maxv + 1)
        ax = format_concentration_plot(maxx, ptype=dtype, maxy=maxy)
    else:
        ax = format_concentration_plot(maxx, ptype=dtype)
    ax.set_xlim([0, maxx + 1])
    ax.set_xlabel('')

    if title == 'foo':
        try:
            title = myvals.name
        except AttributeError:
            try:
                title = myvals.columns[0]
            except (AttributeError, IndexError):
                title = 'foo'
    ax.set_title(title)

    # vertical position of the cohort labels, relative to the axis bottom
    if dtype == 'gct':
        y_label = min(ax.get_ylim()) * 0.75
    elif dtype == 'zs':
        y_label = min(ax.get_ylim()) * 1.2
    else:
        # other plot types previously left this undefined
        y_label = min(ax.get_ylim())

    for idx, (n, vals) in enumerate(vdict.items()):
        color = cmap(idx % 10)
        # an empty cohort still occupies one x position
        if len(vals) == 0:
            xlength = 1
            vals = []
        else:
            xlength = len(vals)
        # set x coordinates for values
        if mode == 'sep':
            x_vals = [x_pos + (x * incr) for x in range(xlength)]
            x_pos = max(x_vals) + incr
        elif mode == 'tog':
            x_vals = [x_pos] * xlength
            x_pos += incr
        # cohorts that cannot be drawn get an invisible placeholder so the
        # spacing and label are kept
        try:
            ax.scatter(x_vals, vals, color=color, s=size)
        except Exception:
            ax.scatter(x_vals, [0], color='white', s=size)
        # then add the label for each cohort below
        if label is True:
            if len(x_vals) > 1:
                x_label = (x_vals[0] + x_vals[-1]) / 2
            else:
                x_label = x_vals[0] - 1
            ax.text(x_label, y_label, n, color=color, **txt_args)
    plt.savefig(os.path.join(outpath, title + '.png'), bbox_inches='tight')
    plt.close()
Ejemplo n.º 27
0
def make_dotplot(vctr,
                 wdict=None,
                 title='dflt',
                 outpath='dflt',
                 legend=False,
                 width=5):
    """Plot a Series as a dot plot and save it as a png, highlighting cohorts.

    Every value of ``vctr`` is drawn as a silver dot at its positional index;
    each cohort in ``wdict`` is then re-drawn on top in its own 'tab10' color.

    Args:
        vctr: Series of values to plot; its index holds the well ids and its
            ``name`` is used as the y-axis label.
        wdict: optional dictionary of highlighted cohorts as name: [wells].
            If none of the wells are found in ``vctr.index`` the wells are
            re-tried with the prefix of the first index entry (text before
            ':') prepended. ``None`` plots no cohorts.
        title: plot title and png file stem; 'dflt' builds it from the prefix
            of the first index entry and ``vctr.name``.
        outpath: output folder; 'dflt' uses ``gt.dflt_outpath``.
        legend: anything other than ``False`` adds a cohort legend.
        width: forwarded to ``format_concentration_plot``.
    """
    if outpath == 'dflt':
        outpath = gt.dflt_outpath(fldr_name='output figs')
    cmap = plt.get_cmap('tab10')
    xrange = len(list(vctr))
    # NOTE(review): the plot type is inferred from the third value only, so
    # this needs at least three entries in vctr - confirm with check_plottype
    dtype = check_plottype(vctr.iloc[2])
    ax = format_concentration_plot(xrange, ptype=dtype, width=width)
    # set additional title and axis
    ax.set_ylabel(vctr.name, fontsize=12)
    if title == 'dflt':
        title = vctr.index.values[0].split(':')[0] + ' - ' + vctr.name
    ax.set_title(title)
    awells = list(vctr.index)
    # plot primary data
    plt.plot(vctr.values,
             color='silver',
             marker='o',
             ls='',
             markersize=5,
             mew=0)
    # defined up front so the legend code below works even without cohorts
    mycolors, mynames = [], []
    if wdict is not None:
        allwells = [w for wells in wdict.values() for w in wells]
        if not any(w in vctr.index for w in allwells):
            print('well dictionary not aligned, attempting patch')
            pname = vctr.index.values[0].split(':')[0]
            wdict = {
                name: [pname + ':' + w for w in wells]
                for name, wells in wdict.items()
            }
        for ci, (name, wells) in enumerate(wdict.items()):
            # cycle through the ten colors of the colormap
            color = cmap(ci % 10)
            mycolors.append(color)
            mynames.append(name)
            mywells = [x for x in wells if x in vctr.index]
            if len(mywells) == 0:
                print('no wells left')
            sety = vctr[mywells]
            setx = [awells.index(w) for w in mywells]
            plt.plot(setx,
                     sety,
                     color=color,
                     marker='o',
                     ls='',
                     markersize=5,
                     mew=0)
    if legend is not False:
        # create a patch (proxy artist) for every cohort color
        patches = [
            mpatches.Patch(color=color, label=name)
            for name, color in zip(mynames, mycolors)
        ]
        # put those patches as legend-handles into the legend
        plt.legend(handles=patches,
                   bbox_to_anchor=(1.05, 1),
                   loc=2,
                   borderaxespad=0.)
    filename = title + '.png'
    plt.savefig(os.path.join(outpath, filename))
    plt.close()
Ejemplo n.º 28
0
def plot_landmark_concs(df,
                        h,
                        maxy=12,
                        cats='n',
                        labels='dflt',
                        genes='test100',
                        outpath='dflt',
                        title='dflt',
                        dosenum='dflt',
                        test=False):
    """Plot per-gene lines across concentrations, one png per breakdown group.

    Should be passed a subset dataframe and header, which should be the
    consensus ZS file. Can contain many different names + doses; the data is
    restricted to type 'test' and split with ``pa.breakdown`` on ``cats``.
    A single line per gene is plotted for the ZS across all concentrations.

    Args:
        df: data frame with genes as rows and samples as columns.
        h: header frame indexed by sample, with 'name' and 'dose' columns
            (and 'plate' when the title has to be derived from it).
        maxy: forwarded to ``format_concentration_plot``.
        cats: category codes; iterated one character at a time through
            ``gt.cats_lookup`` to build the title suffix, and passed whole to
            ``pa.breakdown``.
        labels: 'dflt' for incrementing numbers, 'wells' for the sample
            addresses, 'dose' for the unique doses, or an explicit sequence
            of tick labels.
        genes: gene selector forwarded to ``gt.get_genes``.
        outpath: output folder; 'dflt' uses ``gt.dflt_outpath``.
        title: base title / file stem; 'dflt' uses ``df.name`` or, failing
            that, the first 'plate' value of the group.
        dosenum: number of x ticks; 'dflt' uses the number of unique doses.
        test: if True, stop after the first group.
    """
    if outpath == 'dflt':
        outpath = gt.dflt_outpath()
    df, h = gt.dsub(df, h, {'type': 'test'})
    names = h.name.dropna().unique()
    doses = gt.hsub(h, {'name': names[0]})['dose'].dropna().unique()
    # more than one sample for the same name + dose means replicates were
    # not collapsed; show the offending entries
    first_group = gt.hsub(h, {'name': names[0], 'dose': doses[0]})
    if len(first_group) > 1:
        print('dataframe not collapsed to consensus, bogus lm concs')
        print(first_group.head())
    for ds, hs in pa.breakdown(df, h, cats, dic=False):
        # sort into a new frame rather than in place on a possible slice
        hs = hs.sort_values('dose', ascending=True)
        ds = ds[hs.index]
        n_doses = len(hs.dose.unique())
        xrange = n_doses - 2
        ax = format_concentration_plot(xrange, maxy=maxy, width=4)
        # booleans, not 'on'/'off' strings: 'off' is truthy in matplotlib 3
        ax.tick_params(axis='x', bottom=True, top=False, labelbottom=True)
        if dosenum == 'dflt':
            dose_range = range(n_doses)
        else:
            dose_range = range(dosenum)
        ax.set_xticks(dose_range)
        if labels == 'dflt':
            ax.set_xticklabels([str(x + 1) for x in dose_range])
        elif labels == 'wells':
            # temporary labels
            ax.set_xticklabels(hs.index, rotation=45)
        elif labels == 'dose':
            ax.set_xticklabels(hs['dose'].unique(), rotation=45)
        else:
            # best effort: a bad custom label list should not stop plotting
            try:
                ax.set_xticklabels(labels)
            except Exception:
                print('problem with x range labels')

        # set title and name
        if title == 'dflt':
            try:
                mytitle = df.name
            except AttributeError:
                mytitle = hs['plate'].values[0]
        else:
            mytitle = title
        # drop a trailing '_sub' marker; str.strip('_sub') would instead eat
        # any of the characters '_', 's', 'u', 'b' from both ends
        if mytitle.endswith('_sub'):
            mytitle = mytitle[:-len('_sub')]
        suffix = ''
        for c in cats:
            cat = gt.cats_lookup(c)
            attr = hs[cat].values[0][0]
            suffix += f' - {attr}'
        mytitle += suffix

        ax.set_title(mytitle, fontsize=14)
        for g in gt.get_genes(genes, df=df):
            data = ds.loc[g, :]
            ax.plot(data.values, linewidth=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(outpath, mytitle + '.png'))
        plt.close()
        if test is True:
            print('stopping after one iteration')
            break
Ejemplo n.º 29
0
def plot_correlation_matrix(df,
                            h,
                            ptype='corr',
                            title='dflt',
                            labels='dflt',
                            sort=False,
                            lower=0.25,
                            upper=1.0,
                            outpath=False,
                            cmap='dflt',
                            sparselabel=False,
                            grid=False):
    """Plot a pairwise matrix between the columns of the passed in dataframe.

    Args:
        df: data frame whose columns are compared.
        h: header frame; its ``label`` attribute supplies the default labels
            (falls back to the column names when unavailable).
        ptype: 'corr' for ``df.corr()``, 'euclid' for ``get_euclidean``
            distances (which forces the reversed colormap and rescales the
            clip range).
        title: plot title; 'dflt' uses ``df.name`` or the prefix (text before
            ':') of the first column.
        labels: 'dflt', ``None`` for blank labels, or one label per column.
            The labels can be used to sort the samples.
        sort: if True, order the columns by their label.
        lower: values below this are clipped, so that there's less noise at
            the bottom end; ``None`` disables.
        upper: values above this are clipped; ``None`` disables.
        outpath: ``False`` to not save, ``True`` for the default location
            from ``gt.dflt_outpath``, or a path ('.png' appended if absent).
        cmap: 'dflt', a string containing 'rev' for the reversed blue/red
            map, or anything else ``imshow`` accepts as ``cmap``.
        sparselabel: only print one category label per section/cluster;
            requires sorting, so it switches ``sort`` on.
        grid: with sparselabel, draw black grid lines at section borders.

    Raises:
        ValueError: if ``ptype`` is not 'corr' or 'euclid'.
    """
    fig, ax = plt.subplots()
    fig.set_size_inches(8.5, 8.5)

    # resolve the labels in a single chain so the default lookup is not
    # overwritten by the literal 'dflt' string afterwards
    if labels is None:
        mylabels = len(df.columns) * ['']
    elif isinstance(labels, str) and labels == 'dflt':
        try:
            mylabels = h.label.values
        except AttributeError:
            mylabels = df.columns
    else:
        mylabels = labels

    if sparselabel is True:
        sort = True

    if sort is True:
        print(f'{len(df.columns)} columns, {len(mylabels)} labels')
        keyd = dict(zip(df.columns, mylabels))
        neword = sorted(df.columns, key=lambda x: keyd[x])
        mylabels = [keyd[x] for x in neword]
        df = df[neword]

    if ptype == 'corr':
        corr = df.corr()
    elif ptype == 'euclid':
        cmap = 'rev'
        corr = get_euclidean(df, df=True)
        max_dist = corr.max().max()
        # for distances, 'lower' trims the top of the range instead
        upper = (1 - lower) * max_dist
        lower = 0
    else:
        raise ValueError(f"unknown ptype {ptype!r}, use 'corr' or 'euclid'")

    if lower is not None:
        corr = corr.clip(lower=lower)
    if upper is not None:
        corr = corr.clip(upper=upper)

    # only string values are keywords; colormap objects pass straight through
    if isinstance(cmap, str):
        if cmap == 'dflt':
            cmap = blue_red_cmap()
        elif 'rev' in cmap:
            cmap = blue_red_cmap(['red', 'white', 'cornflowerblue'])

    ax.imshow(corr, interpolation='nearest', cmap=cmap)

    n_cols = len(df.columns)
    if sparselabel is False:
        half_steps = np.arange(0.5, n_cols, 1)
        # x axis
        ax.set_xticks(half_steps, minor=False)
        ax.xaxis.set_tick_params(size=0)
        ax.set_xticks(half_steps, minor=True)
        # y axis
        ax.set_yticks(np.arange(0, n_cols, 1), minor=False)
        ax.yaxis.set_tick_params(size=0)
        ax.set_yticks(half_steps, minor=True)

        ax.set_xticklabels(mylabels, rotation=45, ha='right')
        ax.set_yticklabels(mylabels)

    elif sparselabel is True:
        unq_labels = sorted(set(mylabels))
        cntr = cll.Counter(mylabels)
        # major ticks sit mid-section (carry the label), minor ticks mark
        # the end of each section (used for the grid)
        sparse_labels, major, minor = [], [], []
        i = -.5
        for cat in unq_labels:
            chunk_size = cntr[cat]
            major.append(i + chunk_size / 2)
            minor.append(i + chunk_size)
            sparse_labels.append(cat)
            i += chunk_size

        ax.set_xticks(major, minor=False)
        ax.xaxis.set_tick_params(size=0)
        ax.set_xticks(minor, minor=True)
        ax.set_yticks(major, minor=False)
        ax.yaxis.set_tick_params(size=0)
        ax.set_yticks(minor, minor=True)

        ax.set_xticklabels(sparse_labels, rotation=45, ha='right')
        ax.set_yticklabels(sparse_labels)

        if grid is True:
            ax.grid(which='minor', axis='both', color='black')

    if title == 'dflt':
        try:
            title = df.name
        except AttributeError:
            title = df.columns[0].split(':')[0]

    ax.set_title(f'{title} - n={len(df.columns)} corr matrix', style='oblique')

    plt.tight_layout()

    if outpath is not False:
        if outpath is True:
            plt.savefig(gt.dflt_outpath(fn=title + '_corr.png'))
        else:
            if '.png' in outpath:
                plt.savefig(outpath)
            else:
                plt.savefig(outpath + '.png')
        plt.close()
Ejemplo n.º 30
0
def plot_plateplot(vctr,
                   name='dflt',
                   outpath='dflt',
                   label='dflt',
                   cmap='inferno',
                   ncats=None,
                   clrbar=True):
    """Plot the values of a Series as a 16 x 24 (384 well) plate heatmap.

    The values are reshaped row-wise to 16 rows (A-P) by 24 columns (1-24)
    and drawn with ``imshow``; the figure is saved as ``<name>.png``.

    Args:
        vctr: Series with 384 values, in plate order.
        name: file stem; 'dflt' joins the prefix of the first index entry
            (text before ':') and ``vctr.name`` with '-'.
        outpath: output folder; 'dflt' uses ``gt.dflt_outpath``.
        label: plot title; 'dflt' uses ``name``.
        cmap: colormap passed to ``imshow``.
        ncats: if given, the color scale is fixed to 0..ncats instead of
            being auto-adjusted to the data.
        clrbar: ``True`` draws a colorbar with five ticks from min to max.
            Otherwise a dictionary can be passed, with keys as the legend
            names and values as the integers used to plot the map, to draw
            a separate legend. ``False``/``None`` draws neither.

    Returns:
        None. Prints a message and returns early if the values cannot be
        reshaped to 16 x 24.
    """
    if name == 'dflt':
        name = vctr.index.values[0].split(':')[0] + '-' + vctr.name
    if outpath == 'dflt':
        outpath = gt.dflt_outpath()
    if label == 'dflt':
        label = name
    # reshape before creating the figure so that a bad vector does not
    # leave an open figure behind
    try:
        d = vctr.values.reshape(16, 24)
    except (AttributeError, ValueError):
        print('error in reshape')
        return
    fig, ax = plt.subplots()
    # set additional title and axis
    ax.set_title(label, y=1.1, fontsize=16)
    row_labels = list(string.ascii_uppercase[0:16])
    ax.set_yticks(list(np.arange(16)))
    ax.set_yticklabels(row_labels, fontsize=8)
    col_labels = list(np.arange(1, 25))
    ax.set_xticks(list(np.arange(0, 24)))
    ax.set_xticklabels(col_labels, fontsize=9)
    ax.tick_params(labelright=True, labeltop=True)
    # this sets the tick length to zero, but leaves labels
    plt.tick_params(axis=u'both', which=u'both', length=0)
    imshow_args = {'interpolation': 'nearest', 'cmap': cmap}
    if ncats is not None:
        imshow_args.update(vmin=0, vmax=ncats)
    im = plt.imshow(d, **imshow_args)
    lgd = None
    if clrbar is True:
        # use matplotlib axes1 to keep colorbars in line with figs
        divider = make_axes_locatable(ax)
        cax = divider.append_axes('right', size='5%', pad=0.3)
        cbar = plt.colorbar(im, cax=cax)
        cbar.ax.tick_params(labelsize=9)
        # simplify colorbar to 5 evenly spaced points including max/min
        mx, mn = vctr.max(), vctr.min()
        mid = (mx + mn) / 2
        quarter = (mx - mid) / 2
        things = [mn, mid - quarter, mid, mid + quarter, mx]
        cbar.set_ticks(things)
        cbar.set_ticklabels(['{:.1f}'.format(x) for x in things])
    elif clrbar is not None and clrbar is not False:
        # get the colors of the values, according to the
        # colormap used by imshow
        leg_dict = clrbar
        # create a patch (proxy artist) for every legend entry
        patches = [
            mpatches.Patch(color=im.cmap(im.norm(value)), label=text)
            for text, value in leg_dict.items()
        ]
        # put those patches as legend-handles into the legend
        lgd = plt.legend(handles=patches,
                         bbox_to_anchor=(1.05, 1),
                         loc=2,
                         borderaxespad=0.)
    outpath = os.path.join(outpath, name + '.png')
    if lgd is not None:
        # the legend sits outside the axes, so include it in the tight bbox
        fig.savefig(outpath, bbox_extra_artists=(lgd, ), bbox_inches='tight')
    else:
        fig.savefig(outpath)
    plt.close()