def _add_scores(dataset,dataset_fits): for (g,r),fit in dataset_fits.iteritems(): if g is None: continue # it's a region fit series = dataset.get_one_series(g,r) try: if fit.fit_predictions is None: fit.fit_score = None else: fit.fit_score = cfg.score(series.single_expression, fit.fit_predictions) except: fit.fit_score = None try: fit.LOO_score = loo_score(series.single_expression, fit.LOO_predictions) except: fit.LOO_score = None # add score for correlation LOO fits correlation_levels = getattr(fit, 'with_correlations', None) if correlation_levels is not None: for level in correlation_levels: y_real = series.single_expression y_pred = level.LOO_predictions[series.original_inds] # match the predictions to the indices of the single series after NaN are removed from it level.LOO_score = loo_score(y_real, y_pred) return dataset_fits
def analyze_one_region(data, fitter, fits, region):
    """Collect LOO scores for every gene in ``region``.

    For each gene the score list starts with ``loo_score`` of the plain
    per-gene LOO predictions, followed by one ``loo_score`` per entry of
    ``fit.with_correlations``. Genes for which any score is below -1 are
    left out of the result.

    ``fitter`` is accepted for interface compatibility but is not used here.

    Returns a pair ``(R2_tuples, correlations)``: a dict mapping
    ``(gene, region)`` to the tuple of scores, and the ``correlations``
    attribute of the first region-level fit.
    """
    print('Analyzing region {}...'.format(region))
    series = data.get_several_series(data.gene_names, region)
    ds_fits = fits[data.get_dataset_for_region(region)]
    expression = series.expression

    R2_tuples = {}
    for column, gene in enumerate(series.gene_names):
        gene_fit = ds_fits[(gene, region)]
        observed = expression[:, column]

        scores = [loo_score(observed, gene_fit.LOO_predictions)]
        scores.extend(
            loo_score(observed, level.LOO_predictions[series.original_inds])
            for level in gene_fit.with_correlations
        )

        # keep the gene only when none of its scores is below -1
        if not (np.array(scores) < -1).any():
            R2_tuples[(gene, region)] = tuple(scores)

    # get correlations after one optimization iteration
    correlations = ds_fits[(None, region)][0].correlations
    return R2_tuples, correlations
def _add_scores(dataset, dataset_fits): for (g, r), fit in dataset_fits.iteritems(): if g is None: continue # it's a region fit series = dataset.get_one_series(g, r) try: if fit.fit_predictions is None: fit.fit_score = None else: fit.fit_score = cfg.score(series.single_expression, fit.fit_predictions) except: fit.fit_score = None try: fit.LOO_score = loo_score(series.single_expression, fit.LOO_predictions) except: fit.LOO_score = None # add score for correlation LOO fits correlation_levels = getattr(fit, 'with_correlations', None) if correlation_levels is not None: for level in correlation_levels: y_real = series.single_expression y_pred = level.LOO_predictions[ series. original_inds] # match the predictions to the indices of the single series after NaN are removed from it level.LOO_score = loo_score(y_real, y_pred) return dataset_fits
def plot_one_series(series, shape=None, theta=None, LOO_predictions=None, change_distribution=None, minimal_annotations=False, ax=None, show_legend=True):
    """Plot one expression series, optionally with its fit and LOO residuals.

    Parameters:
        series: provides ``ages``, ``single_expression``, ``gene_name``,
            ``region_name`` and ``age_scaler``.
        shape, theta: when both are given, the fitted curve
            (``shape.high_res_preds``) is drawn and the fit parameters are
            appended to the title. LOO residuals and the legend are only
            drawn in this case.
        LOO_predictions: per-point leave-one-out predictions aligned with
            ``series.ages``; None/NaN entries are skipped.
        change_distribution: optional object with ``centers`` and
            ``weights``; drawn as a bar chart rescaled to 90% of the
            current y range. The caller's ``weights`` are NOT modified.
        minimal_annotations: use the smaller font/marker and skip the
            legend and y tick-label resizing.
        ax: axes to draw on. When None a new figure is created and the
            axis labels and title are set as well.
        show_legend: draw the legend (ignored with minimal_annotations).

    Returns:
        The figure owning the axes that were drawn on.
    """
    x = series.ages
    y = series.single_expression
    b_subplot = ax is not None
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111)

    fontsize = cfg.minimal_annotation_fontsize if minimal_annotations else cfg.fontsize

    # plot the data points
    markersize = 8 if not minimal_annotations else 4
    ax.plot(x, y, 'ks', markersize=markersize)
    if not b_subplot:
        ax.set_ylabel('expression level', fontsize=fontsize)
        ax.set_xlabel('age', fontsize=fontsize)
    ttl = '{}@{}'.format(series.gene_name, series.region_name)
    add_age_ticks(ax, series.age_scaler, fontsize)

    # plot change distribution if provided
    if change_distribution:
        ymin, ymax = ax.get_ylim()
        centers = change_distribution.centers
        width = centers[1] - centers[0]
        weights = change_distribution.weights
        # Build a rescaled copy instead of using `*=`: an in-place multiply
        # would silently alter the caller's change_distribution.weights.
        weights = weights * (0.9 * (ymax - ymin) / weights.max())
        ax.bar(centers, weights, width=width, bottom=ymin, color='g', alpha=0.5)

    if shape is not None and theta is not None:
        # add fit parameters to title
        ttl = '{}, {} fit'.format(ttl, shape)
        more_ttl = shape.format_params(theta, series.age_scaler, latex=True)
        if more_ttl:
            ttl = '\n'.join([ttl, more_ttl])

        # draw the overall fit
        score = cfg.score(y, shape.f(theta, x))
        x_smooth, y_smooth = shape.high_res_preds(theta, x)
        label = 'fit ({}={:.3g})'.format(cfg.score_type, score)
        ax.plot(x_smooth, y_smooth, 'b-', linewidth=3, label=label)

        # draw LOO predictions and residuals
        if LOO_predictions is not None:
            score = loo_score(y, LOO_predictions)
            # The legend entry goes on the first residual that is actually
            # drawn; tying it to index 0 loses the entry whenever the first
            # prediction is missing.
            needs_label = True
            for xi, yi, y_loo in zip(x, y, LOO_predictions):
                if y_loo is None or np.isnan(y_loo):
                    continue
                label = None
                if needs_label:
                    label = 'LOO ({}={:.3g})'.format(cfg.score_type, score)
                    needs_label = False
                ax.plot([xi, xi], [yi, y_loo], '-', color='0.5', label=label)
                ax.plot(xi, y_loo, 'x', color='0.5', markeredgewidth=2)
        if show_legend and not minimal_annotations:
            ax.legend(fontsize=fontsize, frameon=False)

    if not minimal_annotations:
        ax.tick_params(axis='y', labelsize=fontsize)
    if not b_subplot:
        ax.set_title(ttl, fontsize=fontsize)
    return ax.figure
def plot_one_exon(series, shape=None, theta=None, LOO_predictions=None, ax=None, y_range=None):
    """Plot the expression of a single exon, optionally with fit and LOO residuals.

    Parameters:
        series: provides ``ages``, ``single_expression``, ``gene_name`` and
            ``age_scaler``. The title is the part of ``gene_name`` after
            the first ``cfg.exon_separator``, with remaining separators
            replaced by '-'.
        shape, theta: when both are given the fitted curve is drawn; LOO
            residuals and the legend are only drawn in this case.
        LOO_predictions: per-point leave-one-out predictions aligned with
            ``series.ages``; None/NaN entries are skipped.
        ax: axes to draw on. When None a new figure with a single subplot
            is created.
        y_range: optional y limits, given in unscaled expression units.

    Values are passed through the scaler built from ``cfg.plots_scaling``
    (if any) before plotting; scores are computed on the unscaled values.

    Returns:
        The figure owning the axes that were drawn on.
    """
    x = series.ages
    y = series.single_expression
    if ax is None:
        # ax defaults to None in the signature, so make that default usable
        ax = plt.figure().add_subplot(111)

    fontsize = cfg.minimal_annotation_fontsize
    markersize = 8

    y_scaler = scalers.build_scaler(cfg.plots_scaling, None)
    scaled = y_scaler is not None
    y_scaled = y_scaler.scale(y) if scaled else y
    if scaled and y_range is not None:
        y_range = y_scaler.scale(y_range)
    if y_range is not None:
        # Set the limits on the axes being drawn on; pyplot's current axes
        # is not necessarily the `ax` passed in.
        ax.set_ylim(y_range)

    ax.plot(x, y_scaled, 'ks', markersize=markersize)
    ax.set_xlabel('age', fontsize=fontsize)
    add_age_ticks(ax, series.age_scaler, fontsize)
    exon = series.gene_name[series.gene_name.index(cfg.exon_separator) + 1:]
    ax.set_title(exon.replace(cfg.exon_separator, '-'), fontsize=14)

    if shape is not None and theta is not None:
        score = cfg.score(y, shape.f(theta, x))
        x_smooth, y_smooth = shape.high_res_preds(theta, x)
        if scaled:
            y_smooth = y_scaler.scale(y_smooth)
        label = 'fit ({}={:.3g})'.format(cfg.score_type, score)
        ax.plot(x_smooth, y_smooth, 'b-', linewidth=3, label=label)

        # draw LOO predictions and residuals
        if LOO_predictions is not None:
            score = loo_score(y, LOO_predictions)
            if scaled:
                LOO_predictions = y_scaler.scale(LOO_predictions)
            # The legend entry goes on the first residual that is actually
            # drawn; tying it to index 0 loses the entry whenever the first
            # prediction is missing.
            needs_label = score is not None
            for xi, yi, y_loo in zip(x, y_scaled, LOO_predictions):
                if y_loo is None or np.isnan(y_loo):
                    continue
                label = None
                if needs_label:
                    label = 'LOO ({}={:.3g})'.format(cfg.score_type, score)
                    needs_label = False
                ax.plot([xi, xi], [yi, y_loo], '-', color='0.5', label=label)
                ax.plot(xi, y_loo, 'x', color='0.5', markeredgewidth=2)
        ax.legend(fontsize=fontsize, frameon=False)
    return ax.figure
def analyze_one_region(data, fitter, fits, region):
    """Collect LOO scores for every gene in ``region``.

    For each gene the score list starts with ``loo_score`` of the plain
    per-gene LOO predictions, followed by one ``loo_score`` per entry of
    ``fit.with_correlations``. Genes for which any score is below -1 are
    left out of the result.

    ``fitter`` is accepted for interface compatibility but is not used here.

    Returns a pair ``(R2_tuples, correlations)``: a dict mapping
    ``(gene, region)`` to the tuple of scores, and the ``correlations``
    attribute of the first region-level fit.
    """
    print('Analyzing region {}...'.format(region))
    series = data.get_several_series(data.gene_names, region)
    ds_fits = fits[data.get_dataset_for_region(region)]
    expression = series.expression

    R2_tuples = {}
    for column, gene in enumerate(series.gene_names):
        gene_fit = ds_fits[(gene, region)]
        observed = expression[:, column]

        scores = [loo_score(observed, gene_fit.LOO_predictions)]
        scores.extend(
            loo_score(observed, level.LOO_predictions[series.original_inds])
            for level in gene_fit.with_correlations
        )

        # keep the gene only when none of its scores is below -1
        if not (np.array(scores) < -1).any():
            R2_tuples[(gene, region)] = tuple(scores)

    # get correlations after one optimization iteration
    correlations = ds_fits[(None, region)][0].correlations
    return R2_tuples, correlations
# Fit each gene on its own and remember its parameters and LOO predictions.
# NOTE(review): `fits`, `x`, `y`, `series` and `fitter` are not defined in
# this fragment -- they are assumed to be set up by code preceding it.
for i, g in enumerate(series.gene_names):
    print('Fitting series {}...'.format(i + 1))
    theta, sigma, LOO_predictions, _ = fitter.fit(x, y[:, i], loo=True)
    fit = Bunch(theta=theta, LOO_predictions=LOO_predictions)
    fits.append(fit)

# Fit all genes together and keep the result of the last level.
print('Fitting with correlations...')
levels = fitter.fit_multi(x, y, loo=True, n_iterations=2)
res = levels[-1]

print('Theta:')
for ti in res.theta:
    print(' {}'.format(ti))
print('Sigma:')
print(res.sigma)
plot_series(series, fitter.shape, res.theta, res.LOO_predictions)

# Pair up the single-gene and multi-gene LOO scores for every gene.
R2_pairs = []
for i, g in enumerate(series.gene_names):
    y_real = y[:, i]
    y_basic = fits[i].LOO_predictions
    # no NANs in the generated data, so no need to handle the original_inds mess
    y_multi_gene = res.LOO_predictions[:, i]
    pair = (loo_score(y_real, y_basic), loo_score(y_real, y_multi_gene))
    R2_pairs.append(pair)

plot_comparison_scatter(R2_pairs, series.region_name)
print('R2_pairs = {}'.format(R2_pairs))
def plot_one_series(series, shape=None, theta=None, LOO_predictions=None, change_distribution=None, minimal_annotations=False, ax=None, show_legend=True):
    """Plot one expression series, optionally with its fit and LOO residuals.

    Parameters:
        series: provides ``ages``, ``single_expression``, ``gene_name``,
            ``region_name`` and ``age_scaler``.
        shape, theta: when both are given, the fitted curve
            (``shape.high_res_preds``) is drawn and the fit parameters are
            appended to the title. LOO residuals and the legend are only
            drawn in this case.
        LOO_predictions: per-point leave-one-out predictions aligned with
            ``series.ages``; None/NaN entries are skipped.
        change_distribution: optional object with ``centers`` and
            ``weights``; drawn as a bar chart rescaled to 90% of the
            current y range. The caller's ``weights`` are NOT modified.
        minimal_annotations: use the smaller font/marker and skip the
            legend and y tick-label resizing.
        ax: axes to draw on. When None a new figure is created and the
            axis labels and title are set as well.
        show_legend: draw the legend (ignored with minimal_annotations).

    Returns:
        The figure owning the axes that were drawn on.
    """
    x = series.ages
    y = series.single_expression
    b_subplot = ax is not None
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111)

    fontsize = cfg.minimal_annotation_fontsize if minimal_annotations else cfg.fontsize

    # plot the data points
    markersize = 8 if not minimal_annotations else 4
    ax.plot(x, y, 'ks', markersize=markersize)
    if not b_subplot:
        ax.set_ylabel('expression level', fontsize=fontsize)
        ax.set_xlabel('age', fontsize=fontsize)
    ttl = '{}@{}'.format(series.gene_name, series.region_name)
    add_age_ticks(ax, series.age_scaler, fontsize)

    # plot change distribution if provided
    if change_distribution:
        ymin, ymax = ax.get_ylim()
        centers = change_distribution.centers
        width = centers[1] - centers[0]
        weights = change_distribution.weights
        # Build a rescaled copy instead of using `*=`: an in-place multiply
        # would silently alter the caller's change_distribution.weights.
        weights = weights * (0.9 * (ymax - ymin) / weights.max())
        ax.bar(centers, weights, width=width, bottom=ymin, color='g', alpha=0.5)

    if shape is not None and theta is not None:
        # add fit parameters to title
        ttl = '{}, {} fit'.format(ttl, shape)
        more_ttl = shape.format_params(theta, series.age_scaler, latex=True)
        if more_ttl:
            ttl = '\n'.join([ttl, more_ttl])

        # draw the overall fit
        score = cfg.score(y, shape.f(theta, x))
        x_smooth, y_smooth = shape.high_res_preds(theta, x)
        label = 'fit ({}={:.3g})'.format(cfg.score_type, score)
        ax.plot(x_smooth, y_smooth, 'b-', linewidth=3, label=label)

        # draw LOO predictions and residuals
        if LOO_predictions is not None:
            score = loo_score(y, LOO_predictions)
            # The legend entry goes on the first residual that is actually
            # drawn; tying it to index 0 loses the entry whenever the first
            # prediction is missing.
            needs_label = True
            for xi, yi, y_loo in zip(x, y, LOO_predictions):
                if y_loo is None or np.isnan(y_loo):
                    continue
                label = None
                if needs_label:
                    label = 'LOO ({}={:.3g})'.format(cfg.score_type, score)
                    needs_label = False
                ax.plot([xi, xi], [yi, y_loo], '-', color='0.5', label=label)
                ax.plot(xi, y_loo, 'x', color='0.5', markeredgewidth=2)
        if show_legend and not minimal_annotations:
            ax.legend(fontsize=fontsize, frameon=False)

    if not minimal_annotations:
        ax.tick_params(axis='y', labelsize=fontsize)
    if not b_subplot:
        ax.set_title(ttl, fontsize=fontsize)
    return ax.figure
# Fit each gene on its own and remember its parameters and LOO predictions.
# NOTE(review): `x`, `y`, `series` and `fitter` are not defined in this
# fragment -- they are assumed to be set up by code preceding it.
fits = []
for i, g in enumerate(series.gene_names):
    print('Fitting series {}...'.format(i + 1))
    theta, sigma, LOO_predictions, _ = fitter.fit(x, y[:, i], loo=True)
    fit = Bunch(theta=theta, LOO_predictions=LOO_predictions)
    fits.append(fit)

# Fit all genes together and keep the result of the last level.
print('Fitting with correlations...')
levels = fitter.fit_multi(x, y, loo=True, n_iterations=2)
res = levels[-1]

print('Theta:')
for ti in res.theta:
    print(' {}'.format(ti))
print('Sigma:')
print(res.sigma)
plot_series(series, fitter.shape, res.theta, res.LOO_predictions)

# Pair up the single-gene and multi-gene LOO scores for every gene.
R2_pairs = []
for i, g in enumerate(series.gene_names):
    y_real = y[:, i]
    y_basic = fits[i].LOO_predictions
    # no NANs in the generated data, so no need to handle the original_inds mess
    y_multi_gene = res.LOO_predictions[:, i]
    pair = (loo_score(y_real, y_basic), loo_score(y_real, y_multi_gene))
    R2_pairs.append(pair)

plot_comparison_scatter(R2_pairs, series.region_name)
print('R2_pairs = {}'.format(R2_pairs))
def plot_one_series(series, shape, theta, yrange=None, b_annotate=False, train_mask=None, test_preds=None, show_title=False):
    """Draw a single series on a new figure, optionally as an annotated schematic.

    Parameters:
        series: provides ``ages``, ``single_expression``, ``age_scaler``,
            ``gene_name`` and ``region_name``.
        shape, theta: fitted shape and its parameters. When ``theta`` is
            not None the fitted curve is drawn, together with the points
            excluded by ``train_mask`` and their prediction error.
        yrange: explicit ``(ymin, ymax)``; defaults to the axes' y limits
            after the data points have been plotted.
        b_annotate: annotate the fit parameters (onset, baseline, slope,
            height) instead of drawing the data points and birth marker.
            ``theta`` is unpacked into four values in this mode.
        train_mask: boolean mask of points drawn as training data;
            defaults to the non-NaN ages.
        test_preds: per-point predictions; when given, residuals and the
            ``loo_score`` text are drawn.
        show_title: add a '<gene>@<region>, <shape> fit' title.

    Returns:
        The newly created figure.
    """
    ages = series.ages
    expression = series.single_expression
    xmin = max(min(ages), -2)
    xmax = max(ages)
    if train_mask is None:
        train_mask = ~np.isnan(ages)

    fig = plt.figure()
    ax = fig.add_axes([0.08, 0.15, 0.85, 0.8])

    # plot the data points
    if not b_annotate:
        ax.plot(ages[train_mask], expression[train_mask], 'ks', markersize=8)

    ymin, ymax = ax.get_ylim() if yrange is None else yrange

    if not b_annotate:
        # mark birth time with a vertical line
        birth_age = series.age_scaler.scale(0)
        ax.plot([birth_age, birth_age], [ymin, ymax], '--', color='0.85')

    if theta is not None:
        # draw the overall fit
        smooth_x, smooth_y = shape.high_res_preds(theta, ages)
        ax.plot(smooth_x, smooth_y, 'b-', linewidth=3)

        # plot left out points and prediction error
        for age, level in zip(ages[~train_mask], expression[~train_mask]):
            predicted = shape.f(theta, age)
            ax.plot(age, level, 'rs', markersize=8)
            ax.plot([age, age], [level, predicted], '-', color='0.5')
            ax.plot(age, predicted, 'rx', markeredgewidth=2)

    if test_preds is not None:
        for age, level, predicted in zip(ages, expression, test_preds):
            ax.plot([age, age], [level, predicted], '-', color='0.5')
            ax.plot(age, predicted, 'x', color='0.5', markeredgewidth=2)
        score_text = "$R^2 = {:.2g}$".format(loo_score(expression, test_preds))
        ax.text(0.02, 0.8, score_text, fontsize=equation_fontsize, transform=ax.transAxes)

    if b_annotate:
        # annotate sigmoid parameters
        arrow_color = 'green'
        arrow_style = dict(length_includes_head=True, width=0.005, facecolor=arrow_color)
        a, h, mu, w = theta

        # onset
        y_onset = shape.f(theta, mu)
        ax.plot([mu, mu], [ymin, y_onset], 'g--', linewidth=2)
        ax.text(mu + 0.05, y_onset - 0.5, 'onset', fontsize=fontsize, horizontalalignment='left')

        # baseline
        ax.plot([xmin, xmax], [a, a], 'g--', linewidth=2)
        ax.text(mu + 1.5, a + 0.05, 'baseline', fontsize=fontsize, verticalalignment='bottom')

        # slope
        dx = 0.5
        dy = dx * h / (4 * w)  # that's df/dx at x=mu
        ax.plot([mu - dx, mu + dx], [y_onset - dy + 0.05, y_onset + dy + 0.05], 'g--', linewidth=2)
        ax.text(mu - 0.5, y_onset + 1, 'slope', fontsize=fontsize, horizontalalignment='right')
        ax.arrow(mu - 0.45, y_onset + 0.95, 0.65, -0.65, **arrow_style)

        # height
        xpos = mu + 4 * w
        ax.text(xpos + 0.05, y_onset, 'height', fontsize=fontsize, verticalalignment='center')
        ax.arrow(xpos, y_onset, 0, h * 0.45, **arrow_style)
        ax.arrow(xpos, y_onset, 0, -h * 0.45, **arrow_style)

    ax.set_xlim(xmin, xmax)
    ax.set_ylim(ymin, ymax)

    # title
    if show_title:
        title = '{}@{}, {} fit'.format(series.gene_name, series.region_name, shape)
        ax.set_title(title, fontsize=fontsize)

    # set the development stages as x labels
    ax.set_xlabel('age', fontsize=fontsize)
    scaled_stages = [stage.scaled(series.age_scaler) for stage in dev_stages]
    ax.set_xticks([stage.central_age for stage in scaled_stages])
    ax.set_xticklabels(
        [stage.short_name for stage in scaled_stages],
        fontsize=xtick_fontsize,
        fontstretch='condensed',
        rotation=90,
    )

    # set y ticks (first and last only)
    ax.set_ylabel('expression level', fontsize=fontsize)
    all_ticks = ax.get_yticks()
    end_ticks = np.array([all_ticks[0], all_ticks[-1]])
    ax.set_yticks(end_ticks)
    ax.set_yticklabels(['{:g}'.format(t) for t in end_ticks], fontsize=fontsize)

    return fig
def plot_one_series(series, shape, theta, yrange=None, b_annotate=False, train_mask=None, test_preds=None, show_title=False):
    """Draw a single series on a new figure, optionally as an annotated schematic.

    Parameters:
        series: provides ``ages``, ``single_expression``, ``age_scaler``,
            ``gene_name`` and ``region_name``.
        shape, theta: fitted shape and its parameters. When ``theta`` is
            not None the fitted curve is drawn, together with the points
            excluded by ``train_mask`` and their prediction error.
        yrange: explicit ``(ymin, ymax)``; defaults to the axes' y limits
            after the data points have been plotted.
        b_annotate: annotate the fit parameters (onset, baseline, slope,
            height) instead of drawing the data points and birth marker.
            ``theta`` is unpacked into four values in this mode.
        train_mask: boolean mask of points drawn as training data;
            defaults to the non-NaN ages.
        test_preds: per-point predictions; when given, residuals and the
            ``loo_score`` text are drawn.
        show_title: add a '<gene>@<region>, <shape> fit' title.

    Returns:
        The newly created figure.
    """
    ages = series.ages
    expression = series.single_expression
    xmin = max(min(ages), -2)
    xmax = max(ages)
    if train_mask is None:
        train_mask = ~np.isnan(ages)

    fig = plt.figure()
    ax = fig.add_axes([0.08, 0.15, 0.85, 0.8])

    # plot the data points
    if not b_annotate:
        ax.plot(ages[train_mask], expression[train_mask], 'ks', markersize=8)

    ymin, ymax = ax.get_ylim() if yrange is None else yrange

    if not b_annotate:
        # mark birth time with a vertical line
        birth_age = series.age_scaler.scale(0)
        ax.plot([birth_age, birth_age], [ymin, ymax], '--', color='0.85')

    if theta is not None:
        # draw the overall fit
        smooth_x, smooth_y = shape.high_res_preds(theta, ages)
        ax.plot(smooth_x, smooth_y, 'b-', linewidth=3)

        # plot left out points and prediction error
        for age, level in zip(ages[~train_mask], expression[~train_mask]):
            predicted = shape.f(theta, age)
            ax.plot(age, level, 'rs', markersize=8)
            ax.plot([age, age], [level, predicted], '-', color='0.5')
            ax.plot(age, predicted, 'rx', markeredgewidth=2)

    if test_preds is not None:
        for age, level, predicted in zip(ages, expression, test_preds):
            ax.plot([age, age], [level, predicted], '-', color='0.5')
            ax.plot(age, predicted, 'x', color='0.5', markeredgewidth=2)
        score_text = "$R^2 = {:.2g}$".format(loo_score(expression, test_preds))
        ax.text(0.02, 0.8, score_text, fontsize=equation_fontsize, transform=ax.transAxes)

    if b_annotate:
        # annotate sigmoid parameters
        arrow_color = 'green'
        arrow_style = dict(length_includes_head=True, width=0.005, facecolor=arrow_color)
        a, h, mu, w = theta

        # onset
        y_onset = shape.f(theta, mu)
        ax.plot([mu, mu], [ymin, y_onset], 'g--', linewidth=2)
        ax.text(mu + 0.05, y_onset - 0.5, 'onset', fontsize=fontsize, horizontalalignment='left')

        # baseline
        ax.plot([xmin, xmax], [a, a], 'g--', linewidth=2)
        ax.text(mu + 1.5, a + 0.05, 'baseline', fontsize=fontsize, verticalalignment='bottom')

        # slope
        dx = 0.5
        dy = dx * h / (4 * w)  # that's df/dx at x=mu
        ax.plot([mu - dx, mu + dx], [y_onset - dy + 0.05, y_onset + dy + 0.05], 'g--', linewidth=2)
        ax.text(mu - 0.5, y_onset + 1, 'slope', fontsize=fontsize, horizontalalignment='right')
        ax.arrow(mu - 0.45, y_onset + 0.95, 0.65, -0.65, **arrow_style)

        # height
        xpos = mu + 4 * w
        ax.text(xpos + 0.05, y_onset, 'height', fontsize=fontsize, verticalalignment='center')
        ax.arrow(xpos, y_onset, 0, h * 0.45, **arrow_style)
        ax.arrow(xpos, y_onset, 0, -h * 0.45, **arrow_style)

    ax.set_xlim(xmin, xmax)
    ax.set_ylim(ymin, ymax)

    # title
    if show_title:
        title = '{}@{}, {} fit'.format(series.gene_name, series.region_name, shape)
        ax.set_title(title, fontsize=fontsize)

    # set the development stages as x labels
    ax.set_xlabel('age', fontsize=fontsize)
    scaled_stages = [stage.scaled(series.age_scaler) for stage in dev_stages]
    ax.set_xticks([stage.central_age for stage in scaled_stages])
    ax.set_xticklabels(
        [stage.short_name for stage in scaled_stages],
        fontsize=xtick_fontsize,
        fontstretch='condensed',
        rotation=90,
    )

    # set y ticks (first and last only)
    ax.set_ylabel('expression level', fontsize=fontsize)
    all_ticks = ax.get_yticks()
    end_ticks = np.array([all_ticks[0], all_ticks[-1]])
    ax.set_yticks(end_ticks)
    ax.set_yticklabels(['{:g}'.format(t) for t in end_ticks], fontsize=fontsize)

    return fig