Exemple #1
0
def get_s2n_genes(g, c1, c2):
    """Run sig_to_noise on two groups of wells taken from a gct source.

    g is passed to gct.extractgct to obtain the data and header frames.
    c1 and c2 are the 'well' criteria handed to gt.hsub; the index of each
    header subset selects that group's columns from the data.
    Returns whatever sig_to_noise produces for the two column subsets.
    """
    data, header = gct.extractgct(g)
    # resolve both well groups first, then slice the data for each
    group_ids = [gt.hsub(header, {'well': crit}).index for crit in (c1, c2)]
    first, second = (data[ids] for ids in group_ids)
    return sig_to_noise(first, second)
Exemple #2
0
def ctup(df, arg_dict, col=':', u=True, lst=False):
    """Count, or list, the non-null entries of one column among matching rows.

    df       -- header/map dataframe, filtered with gt.hsub
    arg_dict -- filter criteria forwarded to gt.hsub
    col      -- column label to inspect
    u        -- when truthy, only unique values are considered
    lst      -- when truthy return the values themselves, otherwise their count

    The flags are evaluated by truthiness, so non-bool values such as 0/1
    select a branch instead of leaving the result undefined.
    """
    # NOTE(review): the default col=':' is used as a *label* by .loc, not as a
    # slice -- it only works if a column literally named ':' exists; confirm.
    values = gt.hsub(df, arg_dict=arg_dict).loc[:, col].dropna()
    if u:
        values = values.unique()
    return values if lst else len(values)
Exemple #3
0
def summarize_doses(h):
    """Return a series mapping each name to its unique doses for 'test' rows.

    The rows of h whose type is 'test' are selected with gt.hsub, sorted by
    'dose', and their 'dose' values are grouped by h['name']. If the dose
    column is missing (KeyError) or cannot be sorted (TypeError), the
    'dilution' column is summarized instead, unsorted.
    """
    tests = gt.hsub(h, {'type': 'test'})
    try:
        # sorting first makes each name's dose array come out in ascending order
        res = tests.sort_values('dose')['dose'].groupby(h['name']).unique()
    except (KeyError, TypeError):
        # headers without a usable 'dose' column fall back to 'dilution'
        res = tests['dilution'].groupby(h['name']).unique()
    return res
Exemple #4
0
def ctup(df, arg_dict, col=':', u=True, lst=False):
    """Count up: check how many entries in df satisfy the arg_dict.

    The rows of df matching arg_dict (via gt.hsub) are reduced to column
    `col` with nulls dropped.

    u   -- truthy: collapse to unique values first
    lst -- truthy: return the values; falsy: return how many there are

    Both flags are tested by truthiness so that any flag value yields a
    result (the strict `is True` / `is False` chain left the return value
    unassigned for non-bool flags).
    """
    # NOTE(review): col=':' is looked up as a column label by .loc, not as a
    # "all columns" slice -- confirm the default is ever usable as-is.
    selected = gt.hsub(df, arg_dict=arg_dict).loc[:, col].dropna()
    if u:
        selected = selected.unique()
    if lst:
        return selected
    return len(selected)
Exemple #5
0
def get_well_reps(h, well, cats, df=False):
    """Find the wells in header h that share selected fields with `well`.

    cats chooses the fields to match on: n=name, d=dose, b=batch, c=cell.
    A 3-character well is expanded with the prefix of the first index entry
    (everything except its last three characters).

    Returns the list of matching index values, or the matching header
    dataframe when df is True. If the well is not in the index, a message
    is printed and the string 'empty' is returned.
    """
    if len(well) == 3:
        well = h.index[0][:-3] + well

    # fixed field order: name, dose, batch, cell
    code_to_field = (('n', 'name'), ('d', 'dose'), ('b', 'batch'),
                     ('c', 'cell'))
    fields = [field for code, field in code_to_field if code in cats]

    try:
        mywell = h.loc[well]
    except KeyError:
        print(f'{well} well not found in index')
        return 'empty'
    print(mywell)

    # criteria are the reference well's own values for the chosen fields
    argdict = {field: mywell[field] for field in fields}
    matches = gt.hsub(h, argdict)
    if df is True:
        return matches
    return list(matches.index.values)
Exemple #6
0
def check_data(path='dflt'):
    """A better final map checker.

    For every .gct file in `path` (default: gt.dflt_outpath 'finaldata'
    folder) the first .txt/.xlsx map whose path contains the gct short name
    is loaded, and the wells present in the data are compared with the
    vehicle/poscon/test wells of the map. One status line is printed per
    file; nothing is returned.

    Files without a matching map are reported and skipped, so they are never
    checked against the map of a previously processed file.
    """
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='finaldata')

    flist = gt.get_flist(path, ext='.gct')
    maplist = gt.get_flist(path, ext='.txt')
    maplist.extend(gt.get_flist(path, ext='.xlsx'))

    for f in flist:
        shn = gt.get_shn(f).split('.')[0]
        mapfile = next((x for x in maplist if shn in x), None)
        if mapfile is None:
            print(f'error with map file {shn}')
            continue

        g = gct.Gct(f)
        g.get_headers()
        g.get_wells()
        datwells = g.wells

        mymap = gct.extractmap(mapfile)
        mapwells = gt.hsub(
            mymap, {'type': ['vehicle', 'poscon', 'test']})['well'].values

        # wells that have data but are not described by the map
        res = set(datwells) - set(mapwells)

        if len(res) == 0:
            # NOTE(review): 380 is presumably the expected number of wells
            # per plate -- confirm against the plate layout
            print(f'{shn} ok, {380-len(datwells)} failed wells')
        else:
            print(f'eror with map/data {shn}, {len(datwells)}/{len(mapwells)}')
Exemple #7
0
def make_barview_range(edf,
                       argdict,
                       across='dose',
                       label=False,
                       outpath=False):
    """With enrichment score results, plot barviews across a range of conditions.

    edf     -- enrichment dataframe handed to make_barview
    argdict -- criteria selecting the highlighted instances; it is not
               modified, each panel works on its own copy
    across  -- column whose sorted unique values (among rows matching
               argdict) define the panels, default 'dose'
    label   -- when True each panel is labelled with its condition value
    outpath -- when True the figure is saved as '<title>_enrich.png' in the
               gt.dflt_outpath 'foo' folder and closed; any other value
               leaves the figure open and unsaved
    """
    cond_range = sorted(gt.hsub(edf, argdict)[across].unique())

    print(argdict.values())
    # str() so that non-string criteria (e.g. numeric doses) can be joined
    mytitle = ' '.join(str(v) for v in argdict.values())

    # squeeze=False always yields a 2-D axes array, so a single condition
    # can be indexed exactly like several
    fig, axarr = plt.subplots(1,
                              len(cond_range),
                              sharey='row',
                              squeeze=False)

    for my_ax, cond in zip(axarr[0], cond_range):
        # copy: the caller's dict must not pick up the per-panel condition
        new_argdict = dict(argdict)
        if across is not None:
            new_argdict[across] = cond
        if label is True:
            make_barview(edf, new_argdict, ax=my_ax, label=cond)
        else:
            make_barview(edf, new_argdict, ax=my_ax)

    plt.suptitle(mytitle)
    plt.tight_layout()
    # leave room for the suptitle above the panels
    plt.subplots_adjust(top=0.9)

    if outpath is True:
        outpath = gt.dflt_outpath(fldr_name='foo')
        myoutpath = os.path.join(outpath, mytitle + '_enrich.png')
        plt.savefig(myoutpath)
        plt.close()
Exemple #8
0
def batch_summary(file):
    """Summarize the identity of each batch in a plate map, one well per batch.

    The map is opened with gct.openmap. For every non-null batch the fourth
    matching row is taken as the representative well; batches with fewer
    than four wells contribute their last row instead of raising.

    Returns a dataframe with one row per batch (empty, with the map's
    columns, when the map has no batches).
    """
    m = gct.openmap(file)
    batches = m['batch'].dropna().unique()
    rows = []
    for b in batches:
        wells = gt.hsub(m, {'batch': b})
        # NOTE(review): position 3 (the 4th well) is kept as the preferred
        # representative -- presumably to skip edge wells; confirm.
        rows.append(wells.iloc[min(3, len(wells) - 1)])
    if not rows:
        return pd.DataFrame(columns=m.columns)
    return pd.concat(rows, axis=1).T
Exemple #9
0
def make_barview(edf, argdict, ax=None, label=False, height=2):
    """Draw a barview of an enrichment dataframe with highlighted instances.

    edf     -- header dataframe with enrichment info ('scaled' and 'up'
               columns); it is sorted in place and gains 'plot_pos' and
               'rank' columns
    argdict -- criteria (gt.hsub) selecting the instances drawn in black
    ax      -- axes forwarded to format_barview_plot; when None,
               plt.tight_layout() is applied at the end
    label   -- True uses the first argdict value as the left-hand label,
               any value other than False is used as the label itself
    height  -- bar height; highlighted instances can be swamped out and
               become invisible if this is too small
    """
    standalone = ax is None
    ax = format_barview_plot(edf, ax=ax)

    if label is True:
        first_value = list(argdict.values())[0]
        ax.set_ylabel(first_value, labelpad=0.0)
    elif label is not False:
        ax.set_ylabel(label, labelpad=0.0)

    # rank instances from highest to lowest score, top of the plot first
    edf.sort_values(['scaled', 'up'], ascending=False, inplace=True)
    n_rows = len(edf)
    edf['plot_pos'] = list(range(n_rows + 2, 2, -1))
    edf['rank'] = list(range(1, n_rows + 1))

    # drawn in this order, so the highlighted subset ends up on top
    layers = (
        ('lime', edf[edf['scaled'] > 0]),
        ('lightgrey', edf[edf['scaled'] == 0]),
        ('red', edf[edf['scaled'] < 0]),
        ('black', gt.hsub(edf, argdict)),
    )
    for shade, rows in layers:
        # every bar of a layer gets the same width: the layer's row count
        ax.barh(rows['plot_pos'], [len(rows)],
                color=shade,
                align='center',
                height=height)

    if standalone:
        plt.tight_layout()
Exemple #10
0
def plot_concentrations(df,
                        h,
                        genes='test2',
                        label=False,
                        mode='ind',
                        incr='dflt',
                        outpath='dflt',
                        fn='dflt',
                        maxx='dflt',
                        test=False):
    """Plot per-gene concentration plots from a dataframe/header pair.

    One figure is saved per gene as '<name> - <gene>.png'.

    df, h   -- data and header; only rows of type 'test' are plotted
    genes   -- gene selection resolved by gt.get_genes (a list, or one of
               its keywords)
    label   -- when True the perturbation name is written below each cohort
    mode    -- 'ind' plots individual reps at the same x value, 'med' / 'avg'
               combine the reps with the median / mean
    incr    -- x spacing between doses ('dflt' = 10)
    outpath -- output folder, created if needed; 'dflt' is 'output_figs'
               under gt.check_desktop()
    fn      -- figure name prefix; 'dflt' uses df.name, falling back to the
               part of the first header index before ':'
    maxx    -- x axis extent; 'dflt' derives it from the number of names and
               doses
    test    -- when True stop after the first image

    Assumes the data is broken down by name and dose, within one batch.
    """
    # parameters controlling the optional labels below each cohort
    txt_args = {'fontsize': 8, 'rotation': 90, 'fontweight': 'bold'}
    genes = gt.get_genes(genes, df=df)
    # define outpath directory, create if necessary
    if outpath == 'dflt':
        outpath = os.path.join(gt.check_desktop(), 'output_figs')
    os.makedirs(outpath, exist_ok=True)
    # define title
    if fn != 'dflt':
        name = fn
    else:
        try:
            name = df.name
        except AttributeError:
            name = h.index[0].split(':')[0]
    # set the color palette and spacing/sizing levels (figsize tuned to these)
    cmap = plt.get_cmap('tab10')
    if incr == 'dflt':
        incr = 10
    sincr = 20

    # restrict data and header to the test wells
    df, h = gt.dsub(df, h, {'type': 'test'})

    # create pert list for plot
    print(h['name'].unique())
    pert_list = ['{}'.format(n) for n in h['name'].unique()]

    # the largest number of doses of any name sets the default width
    ndoses = h.groupby('name')['dose'].nunique().max()
    nnames = h.name.nunique()
    print(name)
    print('num doses ', ndoses)
    print('name list ', len(pert_list))

    if isinstance(genes, str):
        genes = [genes]

    if maxx == 'dflt':
        # one increment per name/dose slot plus a double increment gap
        # after each name
        maxx = (incr * nnames * ndoses) + (incr * 2 * nnames)

    for g in genes:
        # set initial color counter and x starting position
        ci = 0
        x_pos = 15
        # select vector for current gene
        dat = df.loc[g]
        # y range: largest absolute value plus some headroom
        maxv = round(max(abs(dat))) + 3
        ax = format_concentration_plot(maxx, maxy=maxv)
        ax.set_ylabel(g)
        mytitle = name + ' - ' + g
        print(mytitle)
        ax.set_title(mytitle)
        x_init = 0
        names = h['name'].apply(lambda x: str(x)).unique()
        for n in names:
            # step through the colormap in tenths, wrapping around
            color = cmap(ci)
            ci += .1
            if ci > .9:
                ci = 0
            sub = h[h['name'] == n]
            doses = sorted(sub['dose'].unique(), key=lambda x: float(x))
            # marker size grows with the dose rank
            sizes = [(x + 1) * sincr for x in range(len(doses))]
            for dose, s in zip(doses, sizes):
                args = {'name': n, 'dose': dose}
                wids = gt.hsub(h, args).index.values
                y_vals = dat[wids].values
                if mode == 'avg':
                    y_vals = np.mean(y_vals)
                if mode == 'med':
                    y_vals = np.median(y_vals)
                try:
                    x_vals = [x_pos] * len(y_vals)
                except TypeError:
                    # a combined (scalar) value has no len()
                    x_vals = x_pos
                # plot the current vals with specified color and size
                ax.scatter(x_vals, y_vals, c=color, s=s)
                x_pos += incr
            if label is True:
                # center the name under the cohort it belongs to
                x_label = (x_init + x_pos) / 2
                ax.text(x_label, -(maxv + 1), n, color=color, **txt_args)
            # put spacing between perts
            x_pos += (incr * 2)
            x_init = x_pos
        plt.savefig(os.path.join(outpath, mytitle + '.png'),
                    bbox_inches='tight')
        plt.close()
        if test is True:
            print('test mode, exiting after one image')
            break
Exemple #11
0
def plot_landmark_concs(df,
                        h,
                        maxy=12,
                        cats='n',
                        labels='dflt',
                        genes='test100',
                        outpath='dflt',
                        title='dflt',
                        dosenum='dflt',
                        test=False):
    """Plot many or all landmarks across concentrations, one line per gene.

    df, h should be a subset dataframe and header of the consensus ZS file;
    only 'test' rows are used and the data is broken down by `cats` through
    pa.breakdown. One '<title> - <attrs>.png' is saved per breakdown group.

    maxy    -- y axis extent passed to format_concentration_plot
    cats    -- breakdown category codes, default 'n'
    labels  -- x tick labels: 'dflt' for incrementing numbers, 'wells' for
               the well addresses, 'dose' for the dose values, or an
               explicit list of labels
    genes   -- gene selection resolved by gt.get_genes
    outpath -- output folder, 'dflt' uses gt.dflt_outpath()
    title   -- figure title prefix; 'dflt' uses df.name, falling back to the
               group's first 'plate' value
    dosenum -- number of x ticks, 'dflt' uses the number of unique doses
    test    -- when True stop after the first group
    """
    if outpath == 'dflt':
        outpath = gt.dflt_outpath()
    df, h = gt.dsub(df, h, {'type': 'test'})
    names = h.name.dropna().unique()
    doses = gt.hsub(h, {'name': names[0]})['dose'].dropna().unique()
    # more than one well per name/dose means the data was not collapsed
    first_combo = gt.hsub(h, {'name': names[0], 'dose': doses[0]})
    if len(first_combo) > 1:
        print('dataframe not collapsed to consensus, bogus lm concs')
        print(first_combo.head())
    for ds, hs in pa.breakdown(df, h, cats, dic=False):
        hs = hs.sort_values('dose', ascending=True)
        ds = ds[hs.index]
        n_doses = len(hs.dose.unique())
        x_extent = n_doses - 2
        ax = format_concentration_plot(x_extent, maxy=maxy, width=4)
        # booleans: string values such as 'off' are truthy
        ax.tick_params(axis='x', bottom=True, top=False, labelbottom=True)
        if dosenum == 'dflt':
            dose_range = range(n_doses)
        else:
            dose_range = range(dosenum)
        ax.set_xticks(dose_range)
        if labels == 'dflt':
            ax.set_xticklabels([str(x + 1) for x in dose_range])
        elif labels == 'wells':
            # temporary labels
            ax.set_xticklabels(hs.index, rotation=45)
        elif labels == 'dose':
            ax.set_xticklabels(hs['dose'].unique(), rotation=45)
        else:
            try:
                ax.set_xticklabels(labels)
            except (ValueError, TypeError):
                print('problem with x range labels')

        # set title and name
        if title == 'dflt':
            try:
                mytitle = df.name
            except AttributeError:
                mytitle = hs['plate'].values[0]
        else:
            mytitle = title
        # drop a trailing '_sub' marker (str.strip would remove any of the
        # characters '_', 's', 'u', 'b' from both ends instead)
        if mytitle.endswith('_sub'):
            mytitle = mytitle[:-len('_sub')]
        suffix = ''
        for c in cats:
            cat = gt.cats_lookup(c)
            # NOTE(review): the trailing [0] takes the first element/character
            # of the first value -- confirm this abbreviation is intended
            attr = hs[cat].values[0][0]
            suffix += f' - {attr}'
        mytitle += suffix

        ax.set_title(mytitle, fontsize=14)
        for g in gt.get_genes(genes, df=df):
            data = ds.loc[g, :]
            ax.plot(data.values, linewidth=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(outpath, mytitle + '.png'))
        plt.close()
        if test is True:
            print('stopping after one iteration')
            break
Exemple #12
0
def summarize_doses(h):
    """Return a series mapping each name to its unique 'dilution' values.

    Only the rows of h selected by gt.hsub with type 'test' are considered;
    their 'dilution' column is grouped by h['name'].
    """
    test_rows = gt.hsub(h, {'type': 'test'})
    per_name = test_rows['dilution'].groupby(h['name'])
    return per_name.unique()
Exemple #13
0
def check_maps(path, compare=True, img=True, v=True, filt=True):
    """Summarize the plate maps found in a directory and how they relate.

    Looks through the .xlsx maps in `path` (falling back to .txt maps when no
    .xlsx file is left), builds a per-batch composition table and writes it to
    'batch_composition.xlsx' in `path`.

    path    -- directory holding the map files; output files are written here
    compare -- when True, also write 'name_overlaps.xlsx' and
               'well-name_overlaps.xlsx' built with gt.overlap_matrix
    img     -- when True, call plate_map_vis for each plotted field ('type')
               that has more than one distinct value
    v       -- verbose: print the file list and the test names / doses of
               each plate
    filt    -- when True only .xlsx files whose file name is 11 characters
               long (a 6 character name plus '.xlsx') are considered; .txt
               fallbacks are always filtered to 10 character file names
    """

    # plot_fields = ['type', 'batch']
    plot_fields = ['type']
    # add checks to add batch and dose if present

    # pert_dict: entry -> test names of the batch; map_list is not used below
    pert_dict, map_list = {}, {}
    # wellpert_dict: entry -> 'well-name' strings of the batch
    wellpert_dict = {}

    flist = gt.get_flist(path, ext='.xlsx')
    if filt is True:
        flist = [x for x in flist if len(os.path.split(x)[-1]) == 11]
    if len(flist) == 0:
        flist = gt.get_flist(path, ext='.txt')
        flist = [x for x in flist if len(os.path.split(x)[-1]) == 10]

    if v is True:
        print('flist = ', flist)

    awells = gt.get_awells()
    composition = pd.DataFrame(columns=[
        'wells #', 'test #', 'doses', 'dose/trt', '# names', 'vehicle #',
        'poscon #', 'poscons'
    ])

    for f in flist:
        pname = gt.get_shn(f).split('.')[0]
        print(pname)
        if f.endswith('.xlsx'):
            m = pd.read_excel(f)
        elif f.endswith('.txt'):
            m = pd.read_table(f, index_col=False)
        m.sort_index(inplace=True)
        batches = m['batch'].dropna().unique()

        # the first column whose name contains 'dose' or 'dilution', if any
        if any([('dose' in x) or ('dilution' in x) for x in m.columns]):
            dose_field = [
                x for x in m.columns if (('dose' in x) or ('dilution' in x))
            ][0]
        else:
            dose_field = None

        # column name -> function computing that column from a batch subset
        headers = {
            'wells #': lambda x: len(x.index),
            'test #': lambda x: ctup(x, {'type': 'test'}, 'well')
        }
        if dose_field is not None:
            # NOTE(review): the 'dose/trt' lambda ignores its argument and
            # reads the whole map m, so every batch of a plate gets the same
            # value -- confirm this is intended
            headers.update({
                'doses':
                lambda x: ctup(x, {'type': 'test'}, dose_field),
                'dose/trt':
                lambda x: gt.hsub(m, {'type': 'test'})[dose_field].groupby(m[
                    'name']).unique().apply(lambda x: len(x)).mean()
            })
        elif dose_field is None:
            # plain strings are not callable: calling them below raises
            # TypeError, which is caught and recorded as 'na'
            headers.update({'doses': 'na', 'dose/trt': 'na'})
        headers.update({
            '# names':
            lambda x: ctup(x, {'type': 'test'}, 'name'),
            'vehicle #':
            lambda x: ctup(x, {'type': 'vehicle'}, 'well'),
            'poscon #':
            lambda x: ctup(x, {'type': 'poscon'}, 'well'),
            'poscons':
            lambda x: ctup(x, {'type': 'poscon'}, 'name', lst=True)
        })

        # per-plate table, one column per headers key
        summary = pd.DataFrame(columns=headers)

        # check wells for full plate
        well_result = set(awells) - set(m['well'].values)

        if len(well_result) != 0:
            print('{} wells error, {} entries'.format(pname, len(m.index)))

        if v is True:
            print(gt.hsub(m, {'type': 'test'})['name'].dropna().unique())
            try:
                doselist = gt.hsub(
                    m, {'type': 'test'})[dose_field].dropna().unique()
                print(doselist)
            except:
                print('error with dose col, ', dose_field)
                pass

        # summarize the header info per batch, and assemble pert-lists
        # for the overlap comparisons
        for b in batches:
            # NOTE(review): string concatenation assumes batch labels are str
            entry = pname + '-' + b
            ms = gt.hsub(m, {'batch': b})
            # gather pert names for overlap comparison
            pert_dict[entry] = ctup(m, {
                'batch': b,
                'type': 'test'
            },
                                    'name',
                                    lst=True)
            # get the well-pert identities for same plate comparison
            ms.loc[:, 'addr'] = ms['well'] + '-' + ms['name'].apply(
                lambda x: str(x))
            wellpert_dict[entry] = ms['addr'].values
            for k in headers.keys():
                try:
                    summary.loc[entry, k] = headers[k](ms)
                except (KeyError, TypeError):
                    summary.loc[entry, k] = 'na'

        composition = pd.concat([composition, summary])

        if img is True:
            for pf in plot_fields:
                plot_series = m[pf]
                # only plot fields that actually vary across the plate
                if len(plot_series.dropna().unique()) > 1:
                    plot_series.name = pname + ' ' + pf
                    plate_map_vis(plot_series, path=path)

    composition.to_excel(os.path.join(path, 'batch_composition.xlsx'))

    if compare is True:
        same_plates = gt.overlap_matrix(wellpert_dict.values(),
                                        wellpert_dict.keys())
        name_overlap = gt.overlap_matrix(pert_dict.values(), pert_dict.keys())
        name_overlap.to_excel(os.path.join(path, 'name_overlaps.xlsx'))
        same_plates.to_excel(os.path.join(path, 'well-name_overlaps.xlsx'))
Exemple #14
0
def get_zscore(fpath, save=True, my_mad=None):
    """Compute robust z-scores of test and poscon wells against vehicle wells.

    fpath  -- path of a gct file, or an already loaded (data, header) pair
    save   -- when True the result is written next to the input as
              '<prefix>_ZS.gct' (prefix = path up to the first '_') and
              nothing is returned; otherwise (zsdf, hs) is returned
    my_mad -- optional floor: genes whose median absolute deviation is below
              it are scaled with my_mad instead

    For each batch (except 'na') the per-gene median and median absolute
    deviation (MAD) of the vehicle wells are computed; MADs strictly between
    0 and 0.1 are raised to 0.1. Each test/poscon value is scored as
    (value - median) / (MAD * 1.486), or 0 when the MAD is 0, and stored as
    a string with three decimals. Batches without vehicle wells are skipped.

    Raises ValueError when save is True but fpath is not a path, because the
    output file name is derived from the path.
    """
    if isinstance(fpath, str):
        df, h = gct.extractgct(fpath)
        pname = gt.get_shn(fpath)
    else:
        if save is True:
            raise ValueError(
                'save=True needs a file path to derive the output name; '
                'pass save=False when providing [d, h] directly')
        df, h = fpath[0], fpath[1]
        # only used for progress messages
        pname = getattr(df, 'name', 'dataframe')

    zsd = cll.defaultdict(dict)

    for b in h['batch'].dropna().unique():
        if b == 'na':
            continue
        print('running zscore for {} batch {}'.format(pname, b))
        vw = gt.hsub(h, {'batch': b, 'type': 'vehicle'}).index.values
        if len(vw) == 0:
            # no reference population for this batch; other batches may
            # still be scored
            print('no vehicle wells for {} batch {}, skipping'.format(
                pname, b))
            continue
        veh = df[vw]
        # get median value across vehicle populations
        med = veh.median(axis=1)

        # median absolute deviation per gene, with a floor for tiny values
        mad = {}
        for feat in veh.index:
            r = statistics.median(
                [abs(med[feat] - veh[v][feat]) for v in veh.columns])
            if 0 < r < 0.1:
                r = 0.1
            mad[feat] = r

        # score the test and poscon wells of this batch
        tw = list(h[(h['batch'] == b) & (h['type'] == 'test')].index.values)
        pw = list(h[(h['batch'] == b) & (h['type'] == 'poscon')].index.values)
        wells = tw + pw
        for w in df[wells].columns:
            for feat in df.index:
                if my_mad is not None and mad[feat] < my_mad:
                    zs = (df[w][feat] - med[feat]) / (my_mad * 1.486)
                elif mad[feat] == 0:
                    zs = 0
                else:
                    zs = (df[w][feat] - med[feat]) / (mad[feat] * 1.486)
                zsd[w][feat] = '{0:.3f}'.format(zs)

    # transform into dataframe, align the header, null nonsense
    zsdf = pd.DataFrame(zsd)
    hs = h.loc[zsdf.columns]
    zsdf = zsdf.replace(['inf', '-inf'], np.nan).fillna('nan')
    if save is True:
        outpath = '{}_ZS.gct'.format(fpath.split('_', 1)[0])
        gct.save_headergct(zsdf, hs, outpath)
    else:
        return zsdf, hs
Exemple #15
0
def breakdown(df, h, cats, dic=True, genes=None):
    """Break a dataframe/header pair down by up to three categories.

    cats is a sequence of category codes: 'b' batch, 'c' cell, 'n' name,
    'd' dose (first header column containing 'dose' or 'dilution'),
    'w' well -- useful for many plates with identical layout -- and
    'p' plate. Groups are nested in the order given and visited in sorted
    order of their values; each group is titled with its values joined
    by '-'.

    dic   -- when True an ordered dictionary {title: gt.dosub(...)} is
             returned, otherwise a list of gt.dsub(...) results
    genes -- optional gene selection (resolved by gt.get_genes) applied to
             df before the breakdown

    Only the first three categories are used; a message is printed when
    more are given. Raises ValueError when 'd' is requested but the header
    has no dose/dilution column, KeyError for an unknown category code and
    IndexError when no category is given.
    """
    if genes is not None:
        genes = gt.get_genes(genes)
        df = df.loc[genes]

    dose_col = None
    if 'd' in cats:
        dose_cols = [x for x in h.columns if 'dose' in x or 'dilution' in x]
        if not dose_cols:
            raise ValueError(
                "dose column error: 'd' requested but the header has no "
                "dose/dilution column")
        dose_col = dose_cols[0]

    cd = {
        'c': 'cell',
        'b': 'batch',
        'd': dose_col,
        'n': 'name',
        'w': 'well',
        'p': 'plate'
    }

    clist = [cd[c] for c in cats]
    if not clist:
        raise IndexError('no breakdown categories given')
    if len(clist) > 3:
        print('error, more than 3 categories')
        clist = clist[:3]

    def _combos(prefix=()):
        # yield every tuple of nested category values, in sorted order
        if len(prefix) == len(clist):
            yield prefix
            return
        # values of the next category are taken from the rows that match
        # the values already chosen for the outer categories
        pool = gt.hsub(h, dict(zip(clist, prefix))) if prefix else h
        for value in sorted(pool[clist[len(prefix)]].dropna().unique()):
            yield from _combos(prefix + (value,))

    vd = cll.OrderedDict()
    subs = []
    for combo in _combos():
        # a fresh criteria dict per group, so groups never share state
        argdict = dict(zip(clist, combo))
        hdr = '-'.join(f'{e}' for e in combo)
        if dic is True:
            vd[hdr] = gt.dosub(df, h, argdict, name=hdr)
        else:
            subs.append(gt.dsub(df, h, argdict, name=hdr))

    if dic is True:
        return vd
    return subs