Esempio n. 1
0
def quantile_accuracy(gene_targets, gene_preds, gene_stat, out_pdf, numq=4):
  ''' Plot accuracy (PearsonR) in quantile bins across targets.

  Rows of gene_targets/gene_preds are split into numq bins using
  stats.quantile_indexes(gene_stat, numq). Within each bin, PearsonR between
  targets and predictions is computed separately for every column (target).

  Args:
    gene_targets: 2D array of measured values, indexed [row, target].
    gene_preds: 2D array of predictions, indexed the same way.
    gene_stat: per-row statistic passed to stats.quantile_indexes.
    out_pdf: path of the violin plot. Its last 4 characters (e.g. '.pdf')
      are stripped to form the prefix of every other output file.
    numq: number of quantile bins.

  Outputs (PREFIX = out_pdf[:-4]):
    PREFIX.csv           one row per (quantile, target) with its PearsonR
    PREFIX.txt           mean and median PearsonR per quantile
    out_pdf              violin plot with one faint line per target
    PREFIX_qt.txt        targets chosen for the scatter plots
    PREFIX_pq%d_q%d.pdf  target-vs-prediction scatter per chosen target/bin

  Side effect: calls sns.set(font_scale=1.2, style='ticks').
  '''

  # every secondary output shares out_pdf minus its 4-character extension
  out_prefix = out_pdf[:-4]

  # plot PearsonR in variance statistic bins
  quant_indexes = stats.quantile_indexes(gene_stat, numq)

  quantiles_series = []
  targets_series = []
  pcor_series = []

  for qi in range(numq):
    # slice quantile
    gene_targets_quant = gene_targets[quant_indexes[qi]].astype('float32')
    gene_preds_quant = gene_preds[quant_indexes[qi]].astype('float32')

    # compute target PearsonR
    for ti in range(gene_targets_quant.shape[1]):
      pcor, _ = pearsonr(gene_targets_quant[:,ti],
                          gene_preds_quant[:,ti])

      quantiles_series.append(qi)
      targets_series.append(ti)
      pcor_series.append(pcor)

  # construct DataFrame
  df_quant = pd.DataFrame({'Quantile':quantiles_series,
                            'Target':targets_series,
                            'PearsonR':pcor_series})
  df_quant.to_csv('%s.csv' % out_prefix)

  # print summary table; `with` closes the file even if a later step raises
  with open('%s.txt' % out_prefix, 'w') as table_out:
    for qi in range(numq):
      quantile_cors = df_quant[df_quant.Quantile == qi].PearsonR
      print('%2d  %.4f  %.4f' % \
            (qi, np.mean(quantile_cors),np.median(quantile_cors)),
            file=table_out)

  # construct figure
  plt.figure()

  # plot individual targets as light lines
  for ti in range(gene_targets.shape[1]):
    df_quant_target = df_quant[df_quant.Target == ti]
    plt.plot(df_quant_target.Quantile, df_quant_target.PearsonR, alpha=0.1)

  # plot PearsonR distributions in quantiles
  sns.violinplot(x='Quantile', y='PearsonR', data=df_quant, color='tomato')

  plt.savefig(out_pdf)
  plt.close()

  # sort targets by their decrease: ratio of the last quantile's PearsonR to
  # the first quantile's PearsonR
  target_ratios = []
  for ti in range(gene_targets.shape[1]):
    df_quant_target = df_quant[df_quant.Target == ti]
    # rows were appended quantile by quantile, so each target's rows must
    # run from quantile 0 to quantile numq-1
    assert(df_quant_target.Quantile.iloc[0] == 0)
    assert(df_quant_target.Quantile.iloc[-1] == numq-1)
    cor_ratio = df_quant_target.PearsonR.iloc[-1] / df_quant_target.PearsonR.iloc[0]
    target_ratios.append((cor_ratio,ti))
  target_ratios = sorted(target_ratios)

  # take 11 evenly spaced ranks (both extremes included) across the sorted
  # ratios
  pct_indexes = np.linspace(0, len(target_ratios)-1, 10+1).astype('int')

  sns.set(font_scale=1.2, style='ticks')

  # write quantile targets
  with open('%s_qt.txt' % out_prefix, 'w') as table_out:
    # scatter plot each quantile
    for qi in range(numq):
      # slice quantile
      gene_targets_quant = gene_targets[quant_indexes[qi]].astype('float32')
      gene_preds_quant = gene_preds[quant_indexes[qi]].astype('float32')

      for pqi in range(len(pct_indexes)):
        pct_i = pct_indexes[pqi]
        ti = target_ratios[pct_i][1]

        # target_ratios is ordered by ratio, so the entry belonging to this
        # target is found by its rank pct_i, not by the target index ti
        print(qi, pqi, ti, target_ratios[pct_i], file=table_out)

        qout_pdf = '%s_pq%d_q%d.pdf' % (out_prefix, pqi, qi)
        plots.jointplot(gene_targets_quant[:,ti], gene_preds_quant[:,ti],
                                qout_pdf, alpha=0.8, point_size=8, kind='reg',
                                figsize=5, x_label='log2 Experiment',
                                y_label='log2 Prediction')
Esempio n. 2
0
def variance_accuracy(gene_targets, gene_preds, out_prefix, log_pseudo=None):
  """ Relate per-gene prediction error (MSE) to the gene's mean and spread.

    Assumes the targets and predictions have been normalized.

    Per-row mean, max, std dev and MSE are computed along axis 1. Rows with
    mean > 0.5 and max > 3 are kept. Scatter plots, a box plot of MSE per CV
    quantile, and the quantile_accuracy reports for CV and std dev are then
    written under out_prefix.

    Args:
      gene_targets: 2D array of measured values, one row per gene.
      gene_preds: 2D array of predictions; it is subtracted elementwise
        from gene_targets.
      out_prefix: prefix for every output file name.
      log_pseudo: accepted but not used by this function.
    """

  print('gene_targets', gene_targets.shape)

  # per-gene summary statistics, reduced across the target axis
  sq_err = np.power(gene_targets - gene_preds, 2)
  gene_mean = np.mean(gene_targets, axis=1, dtype='float64')
  gene_max = np.max(gene_targets, axis=1)
  gene_std = np.std(gene_targets, axis=1, dtype='float64')
  gene_mse = sq_err.mean(axis=1, dtype='float64')

  # keep only genes with sufficient expression
  expressed = (gene_mean > 0.5) & (gene_max > 3)
  gene_targets = gene_targets[expressed,:]
  gene_preds = gene_preds[expressed,:]
  gene_mse = gene_mse[expressed]
  gene_mean = gene_mean[expressed]
  gene_std = gene_std[expressed]
  print('%d "expressed genes" considered in variance plots' % expressed.sum())

  sns.set(style='ticks', font_scale=1.3)

  # scatter at most 2000 genes; the same subset is used for every plot
  num_genes = len(gene_mse)
  if num_genes >= 2000:
    ri = np.random.choice(np.arange(num_genes), 2000, replace=False)
  else:
    ri = np.arange(num_genes)

  # coefficient of variation
  gene_cv = np.divide(gene_std, gene_mean)

  # (file name pattern, x values, y values, x label, y label)
  scatter_specs = [
    ('%s_mean-std.pdf', gene_mean, gene_std,
     'Mean across experiments', 'Std Dev across experiments'),
    ('%s_mean.pdf', gene_mean, gene_mse,
     'Mean across experiments', 'Mean squared prediction error'),
    ('%s_std.pdf', gene_std, gene_mse,
     'Std Dev across experiments', 'Mean squared prediction error'),
    ('%s_cv.pdf', gene_cv, gene_mse,
     'Coef Var across experiments', 'Mean squared prediction error'),
  ]
  for pdf_pattern, x_vals, y_vals, x_text, y_text in scatter_specs:
    out_pdf = pdf_pattern % out_prefix
    plots.jointplot(x_vals[ri], y_vals[ri], out_pdf, point_size=10,
      cor='spearmanr', x_label=x_text, y_label=y_text)

  # MSE distributions in CV bins
  numq = 5
  quant_indexes = stats.quantile_indexes(gene_cv, numq)
  quant_rows = [[qi, gene_mse[gi]]
                for qi in range(numq)
                for gi in quant_indexes[qi]]
  quant_mse = pd.DataFrame(quant_rows, columns=['Quantile','MSE'])

  quant_mse.to_csv('%s_quant.txt' % out_prefix, sep='\t')

  plt.figure()
  sns.boxplot(x='Quantile', y='MSE', data=quant_mse,
              palette=sns.cubehelix_palette(numq), showfliers=False)
  ax = plt.gca()
  ax.grid(True, linestyle=':')
  ax.set_ylabel('Mean squared prediction error')
  plt.savefig('%s_quant.pdf' % out_prefix)
  plt.close()

  # per-quantile PearsonR reports: CV quantiles first, then stdev quantiles
  for gene_stat, pdf_pattern in ((gene_cv, '%s_qcv.pdf'),
                                 (gene_std, '%s_qstd.pdf')):
    quantile_accuracy(gene_targets, gene_preds, gene_stat,
                      pdf_pattern%out_prefix, 4)
Esempio n. 3
0
def variance_accuracy(gene_targets, gene_preds, out_prefix, log=False):
    """ Compare MSE accuracy to gene mean and variance.

    Assumes the targets and predictions have been normalized.

    For every row (gene) the mean, max and std dev of the targets and the
    mean squared error between targets and predictions are computed. Genes
    with mean > 0.1 and max > 3 are kept and the following files are written:

        <out_prefix>_mean.pdf   mean vs MSE scatter
        <out_prefix>_std.pdf    std dev vs MSE scatter
        <out_prefix>_cv.pdf     coefficient of variation vs MSE scatter
        <out_prefix>_quant.txt  tab-separated (Quantile, MSE) table
        <out_prefix>_quant.pdf  box plot of MSE per CV quantile

    Args:
        gene_targets: 2D array-like of measured values, indexed [gene, target].
        gene_preds: 2D array-like of predictions, indexed the same way.
        out_prefix: prefix for every output file name.
        log: if True, statistics are computed on log2(x + 1) of both inputs.

    Side effect: calls sns.set(style='ticks', font_scale=1.3).
    """

    # compute mean, var, and MSE across targets
    num_genes = gene_targets.shape[0]
    gene_mse = np.zeros(num_genes)
    gene_mean = np.zeros(num_genes)
    gene_max = np.zeros(num_genes)
    gene_std = np.zeros(num_genes)
    for gi in range(num_genes):
        if log:
            gti = np.log2(gene_targets[gi, :] + 1)
            gpi = np.log2(gene_preds[gi, :] + 1)
        else:
            gti = gene_targets[gi, :]
            gpi = gene_preds[gi, :]

        gene_mse[gi] = np.power(gti - gpi, 2).mean()
        gene_mean[gi] = gti.mean()
        gene_max[gi] = gti.max()
        # spread of the measured targets (not the predictions): this is what
        # the "Std Dev across experiments" axis and the CV = std / mean below
        # describe
        gene_std[gi] = gti.std()

    # filter for expression
    expr_indexes = (gene_mean > 0.1) & (gene_max > 3)
    gene_mse = gene_mse[expr_indexes]
    gene_mean = gene_mean[expr_indexes]
    gene_std = gene_std[expr_indexes]
    print('%d "expressed genes" considered in variance plots' %
          expr_indexes.sum())

    sns.set(style='ticks', font_scale=1.3)

    # scatter at most 2000 genes; sampling without replacement raises
    # ValueError when asked for more items than exist, so cap the sample size
    # at the number of expressed genes
    num_expr = len(gene_mse)
    num_sample = min(2000, num_expr)

    # plot mean vs MSE
    out_pdf = '%s_mean.pdf' % out_prefix
    ri = np.random.choice(np.arange(num_expr), num_sample, replace=False)
    basenji.plots.jointplot(gene_mean[ri],
                            gene_mse[ri],
                            out_pdf,
                            point_size=10,
                            cor='spearmanr',
                            x_label='Mean across experiments',
                            y_label='Mean squared prediction error')

    # plot std vs MSE
    out_pdf = '%s_std.pdf' % out_prefix
    ri = np.random.choice(np.arange(num_expr), num_sample, replace=False)
    basenji.plots.jointplot(gene_std[ri],
                            gene_mse[ri],
                            out_pdf,
                            point_size=10,
                            cor='spearmanr',
                            x_label='Std Dev across experiments',
                            y_label='Mean squared prediction error')

    # plot CV vs MSE
    gene_cv = np.divide(gene_std, gene_mean)
    out_pdf = '%s_cv.pdf' % out_prefix
    ri = np.random.choice(np.arange(num_expr), num_sample, replace=False)
    basenji.plots.jointplot(gene_cv[ri],
                            gene_mse[ri],
                            out_pdf,
                            point_size=10,
                            cor='spearmanr',
                            x_label='Coef Var across experiments',
                            y_label='Mean squared prediction error')

    # plot MSE distributions in CV bins
    numq = 4
    quant_indexes = stats.quantile_indexes(gene_cv, numq)
    quant_mse = [[qi, gene_mse[gi]]
                 for qi in range(numq)
                 for gi in quant_indexes[qi]]
    quant_mse = pd.DataFrame(quant_mse, columns=['Quantile', 'MSE'])

    quant_mse.to_csv('%s_quant.txt' % out_prefix, sep='\t')

    plt.figure()
    sns.boxplot(x='Quantile',
                y='MSE',
                data=quant_mse,
                palette=sns.cubehelix_palette(numq),
                showfliers=False)
    ax = plt.gca()
    ax.grid(True, linestyle=':')
    ax.set_ylabel('Mean squared prediction error')
    plt.savefig('%s_quant.pdf' % out_prefix)
    plt.close()