Beispiel #1
0
def plot_ase(gene, ase, expr=None, domain_lines=None):
    """Scatter-plot one gene's ASE values against the sample positions.

    Points are drawn on the current pylab axes. Samples whose label starts
    with ``'m'`` are drawn in red, all others in blue. The y tick labels
    are rewritten through ``ase_val`` and a horizontal line is drawn at 0.

    Parameters
    ----------
    gene
        Row label in ``ase``. When it is not a row label, it is translated
        through the module-level ``gns`` and then ``fbgns`` lookups.
    ase
        Table with one row per gene and one column per sample.
    expr
        Optional table indexed like ``ase``. When given, the marker area is
        ``1.5 * log(expression) ** 2``; otherwise every marker has area 20.
    domain_lines
        Optional x positions at which to draw vertical lines spanning the
        current y range. Nothing is drawn when this is falsy.

    Raises
    ------
    KeyError
        If ``gene`` cannot be resolved to a row of ``ase``.
    """
    # Resolve the identifier first so a bad gene fails before anything is
    # computed or drawn.
    if gene not in ase.index:
        if gene in gns.index and gns[gene] in ase.index:
            gene = gns[gene]
        elif gene in fbgns.index and fbgns[gene] in ase.index:
            gene = fbgns[gene]
        else:
            raise KeyError("Gene {} not found!".format(gene))

    xs = get_xs(ase)
    colors = ['r' if label[0] == 'm' else 'b' for label in xs.index]

    if expr is None:
        sizes = 20
    else:
        sizes = array(1.5 * log(expr.loc[gene])**2)

    scatter(array(xs), array(ase.loc[gene]), c=colors, s=sizes)

    # Relabel the existing tick positions rather than moving them.
    tick_positions, _ = yticks()
    yticks(tick_positions, [ase_val(pos) for pos in tick_positions])

    # Capture the limits before adding reference lines, then restore them so
    # the lines do not change the visible range.
    ylims = ylim()
    xlims = xlim()
    hlines(0, *xlims)
    xlim(*xlims)
    if domain_lines:
        vlines(domain_lines, *ylims)
        ylim(*ylims)
def fit_all_splines(expr, pool=None, progress=False):
    """Fit a smoothing spline to every row of ``expr``.

    Each row is smoothed with a centered 3-point rolling mean
    (``min_periods=1``). Positions where the smoothed value is still missing
    are dropped, and a ``scipy.interpolate.UnivariateSpline`` is fitted to
    the remaining ``(x, smoothed value)`` pairs.

    Parameters
    ----------
    expr
        Table with one row per gene and one column per sample.
    pool
        ``None`` fits serially in this process. ``True`` creates a new
        ``Pool`` that is closed before returning. Any other value is used as
        an existing pool (via ``apply_async``) and is left open.
    progress
        When true, iteration is wrapped in the progress bar from ``pbar()``.

    Returns
    -------
    dict
        Maps each row label of ``expr`` to its fitted spline.
    """
    xs = get_xs(expr)
    pb = pbar() if progress else (lambda x: x)

    def smoothed_points(gene):
        """Return the (x, y) pairs for one gene with missing values removed."""
        row = expr.loc[gene]
        if hasattr(row, 'rolling'):
            smooth = row.rolling(3, center=True, min_periods=1).mean()
        else:
            # Fallback for pandas versions that predate Series.rolling.
            smooth = pd.rolling_mean(row, 3, center=True, min_periods=1)
        is_good = ~smooth.isnull()
        # Both arrays must be masked: the spline needs equal-length x and y.
        return xs[is_good], smooth[is_good]

    if pool is None:
        return {gene: interpolate.UnivariateSpline(*smoothed_points(gene))
                for gene in pb(expr.index)}

    close = pool is True
    if close:
        pool = Pool()

    try:
        asyncs = {}
        for gene in expr.index:
            asyncs[gene] = pool.apply_async(interpolate.UnivariateSpline,
                                            smoothed_points(gene))

        out = {}
        for gene in pb(asyncs):
            out[gene] = asyncs[gene].get()
    finally:
        # Only close a pool this function created, even if a fit failed.
        if close:
            pool.close()
    return out
def _spline_collection_keys(splines):
    """Return the keys of a per-gene collection of splines, else ``None``.

    A non-callable object counts as a collection when it is a dict, or when
    it exposes ``keys()`` and every value is callable. This keeps a Series
    or DataFrame of already-evaluated numbers out of the per-gene path.
    """
    if callable(splines):
        # Callable containers keep the dispatch the function always had.
        if hasattr(splines, 'index'):
            return splines.index
        if hasattr(splines, 'keys'):
            return list(splines.keys())
        return None
    if not hasattr(splines, 'keys'):
        return None
    keys = list(splines.keys())
    if isinstance(splines, dict):
        return keys
    if keys and all(callable(splines[key]) for key in keys):
        return keys
    return None


def calculate_spline_variance_explained(ase, splines, weights=None):
    """Return the fraction of variance in ``ase`` explained by ``splines``.

    The result is ``1 - SS_residual / SS_total``. ``splines`` may be:

    * a collection of callables keyed by row label of ``ase`` (for example
      the dict returned by ``fit_all_splines``): one value per key is
      returned as a ``pandas.Series``;
    * a single callable: it is evaluated at the sample positions of ``ase``
      and one value is returned;
    * anything else: it is treated as already-evaluated predictions and
      subtracted from ``ase`` directly.

    ``weights`` multiplies the residuals and the deviations from the mean
    before squaring. It is applied only in the already-evaluated case and
    defaults to all ones.
    """
    xs = get_xs(ase)
    if weights is None:
        weights = np.ones_like(xs)

    # Collections must be recognised before the callable test: a dict or
    # Series of splines is not itself callable.
    genes = _spline_collection_keys(splines)
    if genes is not None:
        r2 = pd.Series(index=genes, data=np.inf)
        for gene in r2.index:
            observed = ase.loc[gene]
            residual = ((observed - splines[gene](xs))**2).sum()
            total = ((observed - observed.mean())**2).sum()
            r2.loc[gene] = 1 - (residual / total)
        return r2

    if callable(splines):
        return 1 - (((ase - splines(xs))**2).sum() /
                    ((ase - ase.mean())**2).sum())

    var_obs = (((ase - splines) * weights)**2).sum()
    var_tot = (((ase - ase.mean()) * weights)**2).sum()
    return 1 - (var_obs / var_tot)
Beispiel #4
0
    # Column-name prefixes of the samples whose X-linked genes are masked
    # below (presumably the male embryos, going by the variable names).
    males = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2')
    # One flag per gene: True when chrom_of lists it on 'X'. Genes missing
    # from chrom_of are treated as not on X.
    on_x = [chrom_of[gene] == 'X' if gene in chrom_of else False for gene in ase.index]
    # One flag per column: True when the column name starts with a prefix
    # in `males`.
    is_male = [col.startswith(males) for col in ase.columns]
    # Blank out the X-linked genes in those columns.
    ase.ix[on_x, is_male] = np.nan
    # Keep only genes with non-missing values in more than half the columns.
    ase = ase.loc[ase.T.count() > len(ase.columns) / 2.0]

    # Time estimate: 1.5 hours per 10,000 remaining genes plus a flat 2 hours,
    # formatted as H:M:00 (the minutes field is not zero-padded).
    hours = len(ase) / 1e4 * 1.5 + 2
    cluster_args['time'] = '{}:{}:00'.format(int(hours), int((hours % 1)*60))
    print("Estimate {} per iteration".format(cluster_args['time']))
    #cluster_args['queue'] = fyrd.Queue(user='******',
                                       #qtype=fyrd.queue.get_cluster_environment())
    print(cluster_args)
    sys.stdout.flush()


    xs = get_xs(ase)
    # NOTE(review): presumably the names of the fitted parameters; not used
    # in the visible part of this fragment -- confirm against the callers.
    colnames = ['Amp', 'width', 'center', 'y_offset']
    # One result list per fit function; paired with the functions in the
    # loop below.
    peak_r2s = []
    logist_r2s = []

    # Number of jobs announced per fit function in the message below.
    n_perms = 1000
    waiting_jobs = Queue()
    active_jobs = Queue()
    for func, r2s in [(logistic, logist_r2s), (peak, peak_r2s)]:
        # The banner goes to both stdout and stderr, then both are flushed.
        print('-'*30)
        print(func.__name__)
        print('-'*30, file=sys.stderr)
        print(func.__name__, file=sys.stderr)
        print('Building {} Jobs'.format(n_perms))
        sys.stdout.flush()
        sys.stderr.flush()
        #.select(zyg_genes.__contains__)
        #.select(similar_ase.__contains__)
    )
    # Turn the mask into the labels it selects.
    # NOTE(review): assumes both_expr is a boolean Series at this point; the
    # expression that builds it is not visible here -- confirm.
    both_expr = both_expr.index[both_expr]
    # Restrict both expression tables to those labels.
    mel = mel.ix[both_expr]
    sim = sim.ix[both_expr]

    # Refit when no `mel_splines` is present in the local namespace, or when
    # `recalc` is unset or truthy.
    if 'mel_splines' not in locals() or locals().get('recalc', True):
        print("Fitting splines...")
        # Both tables are fitted with the same worker pool.
        with Pool() as p:
            mel_splines = fit_all_splines(mel, p)
            sim_splines = fit_all_splines(sim, p)
        # Mark the splines as up to date and request a redraw.
        recalc = False
        redraw = True

    ase_xs = get_xs(ase)
    # +1 for columns whose name starts with 'simXmel', -1 for all others.
    ase_maternals = pd.Series(
        index=ase_xs.index,
        data=[1 if col.startswith('simXmel') else -1 for col in ase_xs.index])
    # Summary table with one row per label in `mel`; every column starts as
    # a placeholder (NaN, or '?' for exprclass).
    ase_avgs = pd.DataFrame(
        data=dict(emd=np.nan,
                  exprclass='?',
                  actual=np.nan,
                  predicted=np.nan,
                  bias=np.nan,
                  n_good_slices=np.nan,
                  r2=np.nan,
                  rmsdiff=np.nan),
        index=mel.index,
    )
                       **pd_kwargs
                       )
           .dropna(how='all', axis=1)
           .dropna(how='all', axis=0)
           .select(**sel_startswith(('melXsim', 'simXmel')))
          )
    # Subset of `ase` chosen by the 'melXsim' prefix selector.
    # NOTE(review): which axis is filtered depends on what sel_startswith
    # returns, which is not visible here -- confirm.
    ase_limited = ase.select(**sel_startswith('melXsim'))
    chrom_of = get_chroms()

    # Column-name prefixes of the samples whose X-linked genes are masked
    # below (presumably the male embryos, going by the variable names).
    males = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2')
    # One flag per gene: True when chrom_of lists it on 'X'.
    on_x = chrom_of[ase.index] == 'X'
    is_male = [col.startswith(males) for col in ase.columns]
    # Blank out the X-linked genes in those columns of `ase`.
    ase.ix[on_x, is_male] = np.nan


    # Sample positions for the full table and for the subset.
    xs = get_xs(ase)
    xs_ltd = get_xs(ase_limited)
    colnames = ['Amp', 'width', 'center', 'y_offset']
    # Recompute unless `recalc_ase` is already in the local namespace and
    # falsy.
    recalc_ase = locals().get('recalc_ase', True)
    if recalc_ase:
        # One pool serves all four fits; rows containing missing values are
        # dropped from each result.
        with Pool() as p:
            res_logist = fit_all_ase(ase, logistic, xs, colnames, p,
                                     progress=True).dropna()
            res_logist_limited = fit_all_ase(ase_limited, logistic, xs_ltd, colnames, p,
                                             progress=True).dropna()


            res_peak = fit_all_ase(ase, peak, xs, colnames, p,
                                   progress=True).dropna()
            res_peak_limited = fit_all_ase(ase_limited, peak, xs_ltd, colnames, p,
                                           progress=True).dropna()