def plot_ase(gene, ase, expr=None, domain_lines=None):
    """Scatter-plot one gene's row of `ase` against the positions from `get_xs`.

    Parameters
    ----------
    gene : label
        Row label in `ase`.  When it is not in ``ase.index`` it is translated
        through the module-level `gns` lookup and then the `fbgns` lookup; the
        first translation that is a row of `ase` is used instead.
    ase : DataFrame
        Table indexed by gene with one column per sample.
    expr : DataFrame, optional
        When given, marker sizes are ``1.5 * log(expr.loc[gene])**2``;
        otherwise every marker has size 20.
    domain_lines : scalar or sequence, optional
        x positions at which vertical lines spanning the current y-range are
        drawn.  ``None``, empty sequences and falsy scalars draw nothing.

    Raises
    ------
    KeyError
        If `gene` cannot be resolved to a row of `ase`.
    """
    xs = get_xs(ase)
    # Red for samples whose name starts with 'm', blue for all others.
    cs = ['r' if c[0] == 'm' else 'b' for c in xs.index]

    if gene not in ase.index:
        if gene in gns.index and gns[gene] in ase.index:
            gene = gns[gene]
        elif gene in fbgns.index and fbgns[gene] in ase.index:
            gene = fbgns[gene]
        else:
            raise KeyError("Gene {} not found!".format(gene))

    if expr is None:
        sizes = 20
    else:
        sizes = array(1.5 * log(expr.loc[gene])**2)

    assert len(cs) == len(xs)
    scatter(array(xs), array(ase.loc[gene]), c=cs, s=sizes)

    # Relabel the existing y ticks with their `ase_val` rendering.
    yt, _ = yticks()
    yticks(yt, [ase_val(i) for i in yt])

    ylims = ylim()
    xlims = xlim()
    # Draw the zero line across the current x-range, then restore the limits
    # in case drawing changed them.
    hlines(0, *xlims)
    xlim(*xlims)

    # Plain truthiness raises ValueError for numpy arrays / pandas Series, so
    # sized inputs are tested by element count and scalars by truthiness.
    if hasattr(domain_lines, '__len__'):
        has_domain_lines = array(domain_lines).size > 0
    else:
        has_domain_lines = bool(domain_lines)
    if has_domain_lines:
        vlines(domain_lines, *ylims)
        ylim(*ylims)
def _spline_inputs(xs, values):
    """Smooth one row with a centered 3-point mean and drop the NaN points.

    Returns the ``(x, y)`` pair to hand to ``UnivariateSpline``; both members
    are filtered with the same mask so their lengths always match.
    """
    if hasattr(values, 'rolling'):
        smooth = values.rolling(3, center=True, min_periods=1).mean()
    else:
        # Older pandas releases only provide the module-level function.
        smooth = pd.rolling_mean(values, 3, center=True, min_periods=1)
    is_good = ~smooth.isnull()
    return xs[is_good], smooth[is_good]


def fit_all_splines(expr, pool=None, progress=False):
    """Fit a ``UnivariateSpline`` to every row of `expr`.

    Parameters
    ----------
    expr : DataFrame
        One row per gene; x positions for the columns come from `get_xs`.
    pool : None, True, or pool object
        ``None`` fits serially in this process.  ``True`` creates a ``Pool()``
        for the duration of the call and closes it afterwards.  Any other
        value is used as a pool through ``apply_async`` and is left open.
    progress : bool
        When true, iteration is wrapped in the callable returned by `pbar()`.

    Returns
    -------
    dict
        Maps each label of ``expr.index`` to its fitted spline.
    """
    xs = get_xs(expr)
    pb = pbar() if progress else (lambda x: x)

    if pool is None:
        out = {}
        for gene in pb(expr.index):
            out[gene] = interpolate.UnivariateSpline(
                *_spline_inputs(xs, expr.loc[gene])
            )
        return out

    owns_pool = pool is True
    if owns_pool:
        pool = Pool()
    try:
        # Submit everything first so the workers run concurrently, then
        # collect.  The arguments are the same filtered (x, y) pair the serial
        # path uses; the previous code sent the unfiltered y values here.
        asyncs = {}
        for gene in expr.index:
            asyncs[gene] = pool.apply_async(
                interpolate.UnivariateSpline,
                _spline_inputs(xs, expr.loc[gene]),
            )
        out = {}
        for gene in pb(asyncs):
            out[gene] = asyncs[gene].get()
        return out
    finally:
        # Only close a pool this function created, and do so even when a
        # worker raised.
        if owns_pool:
            pool.close()
def _maps_to_callables(obj):
    """Return True for a non-empty keyed container whose values are all callable."""
    if callable(obj) or not hasattr(obj, 'keys'):
        return False
    labels = list(obj.keys())
    return bool(labels) and all(callable(obj[label]) for label in labels)


def calculate_spline_variance_explained(ase, splines, weights=None):
    """Return ``1 - SS_residual / SS_total`` of `splines` as a fit to `ase`.

    `splines` may be one of three things:

    * a dict or Series mapping row labels of `ase` to callables: each callable
      is evaluated at ``get_xs(ase)`` and scored against its own row, and a
      Series of scores indexed by those labels is returned;
    * a single callable: it is evaluated at ``get_xs(ase)`` and scored against
      `ase` as a whole;
    * anything else: it is treated as already-evaluated predictions and
      subtracted from `ase` directly.

    Parameters
    ----------
    ase : Series or DataFrame
        Observed values.
    splines : see above
    weights : array-like, optional
        Multiplies both residuals and deviations from the mean before
        squaring.  It is only applied in the precomputed-predictions case and
        defaults to all ones there.

    Notes
    -----
    Keyed containers used to be caught by the ``not callable`` test and sent
    to the precomputed-predictions arithmetic, so the per-label scoring was
    never reached.  They are now recognised first.
    """
    xs = get_xs(ase)

    if _maps_to_callables(splines):
        if hasattr(splines, 'index'):
            labels = splines.index
        else:
            labels = list(splines.keys())
        # np.inf marks entries that have not been computed yet.
        r2 = pd.Series(index=labels, data=np.inf)
        for label in r2.index:
            observed = ase.loc[label]
            ss_res = ((observed - splines[label](xs))**2).sum()
            ss_tot = ((observed - observed.mean())**2).sum()
            r2.loc[label] = 1 - (ss_res / ss_tot)
        return r2

    if callable(splines):
        ss_res = ((ase - splines(xs))**2).sum()
        ss_tot = ((ase - ase.mean())**2).sum()
        return 1 - (ss_res / ss_tot)

    if weights is None:
        weights = np.ones_like(xs)
    var_obs = (((ase - splines) * weights)**2).sum()
    var_tot = (((ase - ase.mean()) * weights)**2).sum()
    return 1 - (var_obs / var_tot)
# Blank out X-chromosome genes in the columns whose names start with one of
# the `males` prefixes.  Genes missing from `chrom_of` count as not on X.
males = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2')
on_x = [chrom_of[gene] == 'X' if gene in chrom_of else False
        for gene in ase.index]
is_male = [col.startswith(males) for col in ase.columns]
ase.loc[on_x, is_male] = np.nan

# Keep only the genes that have a value in more than half of the columns.
ase = ase.loc[ase.T.count() > len(ase.columns) / 2.0]

# Time request: 1.5 hours per 10,000 remaining genes plus a flat 2 hours,
# written as H:MM:00 (minutes are zero-padded).
hours = len(ase) / 1e4 * 1.5 + 2
cluster_args['time'] = '{}:{:02d}:00'.format(int(hours), int((hours % 1) * 60))
print("Estimate {} per iteration".format(cluster_args['time']))
#cluster_args['queue'] = fyrd.Queue(user='******',
#qtype=fyrd.queue.get_cluster_environment())
print(cluster_args)
sys.stdout.flush()

xs = get_xs(ase)
colnames = ['Amp', 'width', 'center', 'y_offset']
peak_r2s = []
logist_r2s = []
n_perms = 1000
waiting_jobs = Queue()
active_jobs = Queue()

for func, r2s in [(logistic, logist_r2s), (peak, peak_r2s)]:
    # Announce the model on both stdout and stderr so the two streams can be
    # lined up afterwards.
    print('-'*30)
    print(func.__name__)
    print('-'*30, file=sys.stderr)
    print(func.__name__, file=sys.stderr)
    print('Building {} Jobs'.format(n_perms))
    sys.stdout.flush()
    sys.stderr.flush()
#.select(zyg_genes.__contains__) #.select(similar_ase.__contains__) ) both_expr = both_expr.index[both_expr] mel = mel.ix[both_expr] sim = sim.ix[both_expr] if 'mel_splines' not in locals() or locals().get('recalc', True): print("Fitting splines...") with Pool() as p: mel_splines = fit_all_splines(mel, p) sim_splines = fit_all_splines(sim, p) recalc = False redraw = True ase_xs = get_xs(ase) ase_maternals = pd.Series( index=ase_xs.index, data=[1 if col.startswith('simXmel') else -1 for col in ase_xs.index]) ase_avgs = pd.DataFrame( data=dict(emd=np.nan, exprclass='?', actual=np.nan, predicted=np.nan, bias=np.nan, n_good_slices=np.nan, r2=np.nan, rmsdiff=np.nan), index=mel.index, )
**pd_kwargs
)
# Drop columns, then rows, that are entirely NaN, and keep only what
# sel_startswith selects for the 'melXsim' / 'simXmel' prefixes.
.dropna(how='all', axis=1)
.dropna(how='all', axis=0)
.select(**sel_startswith(('melXsim', 'simXmel')))
)
# NOTE(review): the chained expression above starts outside this section;
# presumably its result is assigned to `ase` -- confirm.

# NOTE(review): ase_limited is taken before the masking below, so it
# presumably keeps the unmasked values -- confirm that is intended.
ase_limited = ase.select(**sel_startswith('melXsim'))
chrom_of = get_chroms()

# Blank out X-chromosome genes in the columns whose names start with one of
# the `males` prefixes.
males = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2')
on_x = chrom_of[ase.index] == 'X'
is_male = [col.startswith(males) for col in ase.columns]
ase.ix[on_x, is_male] = np.nan

xs = get_xs(ase)
xs_ltd = get_xs(ase_limited)
colnames = ['Amp', 'width', 'center', 'y_offset']

# Refit unless `recalc_ase` already exists and is falsy; a missing
# `recalc_ase` counts as True.
recalc_ase = locals().get('recalc_ase', True)
if recalc_ase:
    # All four fits share one worker pool; .dropna() is applied to each result.
    with Pool() as p:
        res_logist = fit_all_ase(ase, logistic, xs, colnames, p, progress=True).dropna()
        res_logist_limited = fit_all_ase(ase_limited, logistic, xs_ltd, colnames, p, progress=True).dropna()
        res_peak = fit_all_ase(ase, peak, xs, colnames, p, progress=True).dropna()
        res_peak_limited = fit_all_ase(ase_limited, peak, xs_ltd, colnames, p, progress=True).dropna()