def compute_max_corr(strace):
    """Return, for each trace column, its largest absolute correlation
    with any column that precedes it.

    The trace is converted with ``trace_to_dataframe`` and the correlation
    matrix is computed across its columns.  Only the strictly lower
    triangle is inspected, so every pair of columns is counted once and
    the unit diagonal is ignored; the first column therefore always gets 0.

    Parameters
    ----------
    strace
        Trace object accepted by ``trace_to_dataframe``.

    Returns
    -------
    numpy.ndarray
        One value per data-frame column.  Callers can flag near-redundant
        columns with a threshold, e.g. ``max_corr > 0.99``.
    """
    df = trace_to_dataframe(strace)
    # rowvar=0: columns are the variables, rows are the samples.
    abs_corr = np.abs(np.corrcoef(df.values, rowvar=0))
    # With a single column np.corrcoef collapses to a 0-d value, which
    # np.tril cannot handle; promote it back to a 1x1 matrix.
    abs_corr = np.atleast_2d(abs_corr)
    max_corr = np.max(np.tril(abs_corr, k=-1), axis=1)
    return max_corr
def run(self) -> Tuple[int, pd.DataFrame]:
    """Run the Bayesian inference and return the sampled proportions.

    Returns a tuple ``(number of observations used, trace data frame)``.
    The data frame keeps only the raw ``p`` columns, renamed to
    ``RAW_I<isoform>_A<allele>``.
    """
    reads = self._extract_observed_data()
    self.mc.log_debug('{} reads (read pairs) are used for Bayesian inference.'.format(len(reads)))
    n_params = self.isoforms_count * self.ploidy

    with pm.Model():
        p = pm.Dirichlet(name='p', a=tt.stack([1] * n_params), shape=n_params)
        # Isoform lengths repeated once per allele copy.
        lengths = np.tile(self.isoform_lens, self.ploidy)
        p_rescaled = pm.Deterministic(name='p_rescaled', var=(p * lengths) / tt.dot(p, lengths))
        _ObservedDistribution(name='observed', _p=p_rescaled, observed=reads)
        # Inference.
        trace = pm.sample(self.mcmc_samples, tune=self.tune_samples, chains=1, progressbar=False)

    # Convert the trace to a data frame and drop the rescaled variables.
    samples = trace_to_dataframe(trace)
    raw_cols = [name for name in samples.columns if not name.startswith('p_rescaled')]
    assert len(raw_cols) == p_count if False else len(raw_cols) == n_params
    samples = samples[raw_cols]

    # Flat index n = isoform + allele * isoforms_count maps to RAW_I<i>_A<j>.
    rename_map = {
        'p__{}'.format(i + j * self.isoforms_count): 'RAW_I{}_A{}'.format(i, j)
        for i in range(self.isoforms_count)
        for j in range(self.ploidy)
    }
    samples = samples.rename(columns=rename_map)
    return len(reads), samples
def dump(name, trace, chains=None):
    """
    Store values from NDArray trace as CSV files.

    One file named ``chain-<chain>.csv`` is written per dumped chain.

    Parameters
    ----------
    name : str
        Name of directory to store CSV files in. It is created, together
        with any missing parent directories, if it does not exist yet.
    trace : :class:`pymc3.backend.base.MultiTrace` of NDArray traces
        Result of MCMC run with default NDArray backend
    chains : list
        Chains to dump. If None, all chains are dumped.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # os.path.exists + os.mkdir and also handles nested paths.
    os.makedirs(name, exist_ok=True)
    if chains is None:
        chains = trace.chains

    # The flat column names are derived once from the first dumped chain
    # and reused for every chain.
    var_shapes = trace._straces[chains[0]].var_shapes
    flat_names = {
        v: ttab.create_flat_names(v, shape)
        for v, shape in var_shapes.items()
    }

    for chain in chains:
        filename = os.path.join(name, 'chain-{}.csv'.format(chain))
        df = ttab.trace_to_dataframe(trace, chains=chain, flat_names=flat_names)
        df.to_csv(filename, index=False)
def format_trace(trace, to_df=True):
    """
    Convert the trace into the necessary format.

    Parameters
    ----------
    trace
        Trace object accepted by ``trace_to_dataframe``.
    to_df : bool
        If True (default), return the pandas DataFrame produced by
        ``trace_to_dataframe``; otherwise return its values as a numpy
        array.
    """
    df = trace_to_dataframe(trace)
    if to_df:
        return df
    # DataFrame.as_matrix was deprecated and later removed from pandas;
    # .values yields the same numpy array on old and new versions.
    return df.values
def plot_corner(trace, outpath):
    """Draw a corner plot of the posterior samples and save it to outpath.

    Variables whose name ends with an underscore are left out
    (presumably the sampler's auto-transformed variables -- confirm).
    """
    plt.close('all')

    corvars = []
    for varname in trace.varnames:
        if varname[-1] == '_':
            continue
        corvars.append(varname)

    samples = trace_to_dataframe(trace, varnames=corvars)
    corner_kwargs = dict(
        quantiles=[0.16, 0.5, 0.84],
        show_titles=True,
        title_kwargs={"fontsize": 12},
        title_fmt='.2g',
    )
    figure = corner.corner(samples, **corner_kwargs)
    savefig(figure, outpath, writepdf=0, dpi=100)
def test_trace_to_dataframe_chain_arg(self):
    """Data frame built from chain 0 only must match that chain's values."""
    mtrace = self.mtrace
    frame = ttab.trace_to_dataframe(mtrace, chains=0)
    self.assertEqual(len(mtrace), frame.shape[0])

    # (row, column, flat-name suffix) entries that are spot-checked.
    probes = ((0, 0, '__0_0'), (1, 0, '__1_0'), (1, 2, '__1_2'))
    checked = False
    for varname in self.test_point.keys():
        values = mtrace.get_values(varname, chains=0)
        # With `shape` above, only one variable has to have that
        # `shape`.
        if values.shape[1:] == self.shape:
            for row, col, suffix in probes:
                npt.assert_equal(values[:, row, col], frame[varname + suffix].values)
            checked = True
    self.assertTrue(checked)
def test_trace_to_dataframe_chain_arg(self):
    """Data frame built from chain 0 only must match that chain's values."""
    mtrace = self.mtrace
    frame = ttab.trace_to_dataframe(mtrace, chains=0)
    assert len(mtrace) == frame.shape[0]

    # (row, column, flat-name suffix) entries that are spot-checked.
    probes = ((0, 0, '__0_0'), (1, 0, '__1_0'), (1, 2, '__1_2'))
    checked = False
    for varname in self.test_point.keys():
        values = mtrace.get_values(varname, chains=0)
        # With `shape` above, only one variable has to have that
        # `shape`.
        if values.shape[1:] == self.shape:
            for row, col, suffix in probes:
                npt.assert_equal(values[:, row, col], frame[varname + suffix].values)
            checked = True
    assert checked
def test_trace_to_dataframe(self):
    """Data frame built from all chains must match the combined values."""
    mtrace = self.mtrace
    frame = ttab.trace_to_dataframe(mtrace)
    assert len(mtrace) * mtrace.nchains == frame.shape[0]

    # (row, column, flat-name suffix) entries that are spot-checked.
    probes = ((0, 0, "__0_0"), (1, 0, "__1_0"), (1, 2, "__1_2"))
    checked = False
    for varname in self.test_point.keys():
        values = mtrace.get_values(varname)
        # With `shape` above, only one variable has to have that
        # `shape`.
        if values.shape[1:] == self.shape:
            for row, col, suffix in probes:
                npt.assert_equal(values[:, row, col], frame[varname + suffix].values)
            checked = True
    assert checked
def compute_max_fisher_scale(strace, model, n_checks=20):
    """Return, per trace column, the largest |diag(R)| seen over a set of
    evenly spaced trace points.

    At each checked point the matrix returned by ``model.fastd2logp()``
    (presumably the Hessian of the log-probability -- confirm) is
    symmetrised and QR-factorised; the absolute diagonal of ``R`` is
    folded into a running element-wise maximum.

    Parameters
    ----------
    strace
        Trace supporting ``len()`` and integer indexing, and accepted by
        ``trace_to_dataframe``.
    model
        Object exposing ``fastd2logp()``.
    n_checks : int
        Number of evenly spaced trace indices to evaluate (default 20,
        the value that used to be hard-coded).

    Returns
    -------
    numpy.ndarray
        One value per data-frame column.  Callers can flag degenerate
        directions with a small threshold, e.g. ``max_scale < 1e-10``.
    """
    df = trace_to_dataframe(strace)
    idx_list = np.linspace(0, len(strace) - 1, n_checks, dtype=int)
    d2logp = model.fastd2logp()

    # max_scale can be what we log
    max_scale = np.zeros(df.shape[1])
    for idx in idx_list:
        hess = d2logp(strace[idx])
        sym = 0.5 * (hess + hess.T)  # sym to be safe
        # Only R is needed, so skip building Q.
        r = np.linalg.qr(sym, mode='r')
        max_scale = np.fmax(max_scale, np.abs(np.diag(r)))
    return max_scale
# Priors theta_list = [] for label in NN.labels: if label in labels_to_fit: theta_list.append(pm.Uniform(label, lower=-0.5, upper=0.5)) else: theta_list.append(0.0) theta = tt.stack(theta_list) # Model inside = sigmoid(tt.tensordot(a=w0, b=theta, axes=1) + b0) middle = sigmoid(tt.tensordot(a=w1, b=inside, axes=1) + b1) outside = tt.tensordot(a=w2, b=middle, axes=1) + b2 model_spec = pm.Deterministic('model_spec', outside) # Likelihood spec = pm.Normal('spec', mu=model_spec, sd=1 / SNR, observed=spec_true) # Sampling backend = HDF5('../NN_data/trace.h5') trace = pm.sample(nsamples, tune=ntune, chains=chains, cores=cores, trace=backend) samples = pd.DataFrame(columns=NN.labels) samples[labels_to_fit] = trace_to_dataframe(trace, varnames=labels_to_fit) samples = NN.rescale_labels(samples) samples.to_hdf(samples_file, f'SNR={SNR}') fig = corner(samples[labels_to_fit], labels=labels_to_fit, show_titles=True) plt.savefig(f'{nn_file[:-4]}_{SNR}.png')
with open(pklpath, 'wb') as buff: pickle.dump( { 'model': model, 'trace': trace, 'map_estimate': map_estimate }, buff) else: d = pickle.load(open(pklpath, 'rb')) model, trace, map_estimate = d['model'], d['trace'], d['map_estimate'] ################## # analyze output # ################## # trace plot from PyMC3 plt.figure(figsize=(7, 7)) pm.traceplot(trace[100:]) plt.tight_layout() plt.savefig('../results/test_results/test_{}_traceplot.png'.format(modelid)) plt.close('all') # corner trace_df = trace_to_dataframe(trace) truths = [true_d[k] for k in list(trace_df.columns)] fig = corner.corner(trace_df, quantiles=[0.16, 0.5, 0.84], show_titles=True, title_kwargs={"fontsize": 12}, truths=truths) fig.savefig('../results/test_results/test_{}_corner.png'.format(modelid))
def get_abs_m(overwrite=0):
    """Fit a straight line to the FEROS radial velocities and return an
    upper limit on the absolute slope.

    The model is ``y = m*x + c`` with uniform priors on ``m`` and ``c``.
    The sampled trace is cached in ``rvlim_method2.pkl`` under ``rdir``;
    a corner plot and a data + model plot are written to the same
    directory, and a summary of ``c``, ``m`` and ``abs_m`` is printed.

    Parameters
    ----------
    overwrite : int or bool
        Whether to overwrite the pickle (~20s sampling).

    Returns
    -------
    absm_threesig
        99.7th percentile of ``abs_m``, multiplied by ``u.m / u.s / u.day``.
    delta_time
        Time span covered by the FEROS data (max time minus min time).
    """
    datestr = '20200624'
    cleanrvpath = os.path.join(DATADIR, 'spectra', 'RVs_{}_clean.csv'.format(datestr))
    df = pd.read_csv(cleanrvpath)

    # NOTE: the FEROS data contain all the information about the long-term
    # trend.
    sel = (df.tel == "FEROS")
    df = df[sel]
    df = df.sort_values(by='time')

    # Centre the time axis on the midpoint of the observing baseline.
    delta_t = (df.time.max() - df.time.min())
    t0 = df.time.min() + delta_t / 2
    df['x'] = df['time'] - t0
    df['y'] = df['mnvel'] - np.nanmean(df['mnvel'])
    df['y_err'] = df['errvel']

    force_err = 100
    print('WRN: inflating error bars to account for rot jitter')
    print(f'{df.y_err.median()} to {force_err}')
    df.y_err = force_err

    pklpath = os.path.join(rdir, 'rvlim_method2.pkl')
    if os.path.exists(pklpath) and overwrite:
        os.remove(pklpath)

    # y = mx + c
    if not os.path.exists(pklpath):
        with pm.Model() as model:
            # Define priors
            c = pm.Uniform('c', lower=-100, upper=100)
            m = pm.Uniform('m', lower=-100, upper=100)
            abs_m = pm.Deterministic("abs_m", pm.math.abs_(m))

            # Define likelihood
            # Here Y ~ N(Xβ, σ^2), for β the coefficients of the model. Note
            # though the error bars are not _observed_ in this case; they
            # are part of the model!
            likelihood = pm.Normal('y', mu=m * nparr(df.x) + c,
                                   sigma=nparr(df.y_err),
                                   observed=nparr(df.y))

            # Inference! Draw n_samples posterior samples using NUTS sampling.
            n_samples = 6000
            trace = pm.sample(n_samples, cores=16)

        with open(pklpath, 'wb') as buff:
            pickle.dump({'model': model, 'trace': trace}, buff)
    else:
        # NOTE: pickle is only safe here because the file is a local cache
        # written by this function; never point pklpath at untrusted data.
        with open(pklpath, 'rb') as buff:
            d = pickle.load(buff)
        model, trace = d['model'], d['trace']

    # corner
    trace_df = trace_to_dataframe(trace)
    fig = corner.corner(trace_df, quantiles=[0.16, 0.5, 0.84], show_titles=True)
    fig.savefig(os.path.join(rdir, 'corner.png'))

    # data + model
    plt.close('all')
    plt.figure(figsize=(7, 7))
    plt.scatter(df.x, df.y, label='data', zorder=2, color='k')
    plt.errorbar(df.x, df.y, yerr=df.y_err, ecolor='k', elinewidth=1,
                 capsize=2, zorder=2, ls='none')

    N_samples = 100
    lm = lambda x, sample: sample['m'] * x + sample['c']

    # Overplot lines for randomly chosen posterior samples.
    for rand_loc in np.random.randint(0, len(trace), N_samples):
        rand_sample = trace[rand_loc]
        plt.plot(nparr(df.x), lm(nparr(df.x), rand_sample), zorder=1,
                 alpha=0.5, color='C0')

    plt.legend(loc=0)
    plt.xlabel('time [d]')
    plt.ylabel('rv [m/s]')
    plt.savefig(os.path.join(rdir, 'datamodel.png'))
    plt.close('all')

    printparams = ['c', 'm', 'abs_m']
    print(42 * '-')
    for p in printparams:
        med = np.percentile(trace[p], 50)
        up = np.percentile(trace[p], 84)
        # 16th percentile: the lower counterpart of the 84th, matching the
        # quantiles used in the corner plot.
        low = np.percentile(trace[p], 16)
        threesig = np.percentile(trace[p], 99.7)
        print(f'{p} : {med:.3f} +{up-med:.3f} -{med-low:.3f}')
        print(f'{p} 99.7: {threesig:.3f}')
    print(42 * '-')

    # Take the limit from abs_m explicitly rather than relying on the loop
    # variable left over from the last printed parameter.
    absm_threesig = np.percentile(trace['abs_m'], 99.7) * u.m / u.s / u.day
    delta_time = df.time.max() - df.time.min()

    return absm_threesig, delta_time
def format_trace(trace):
    """Return the values of the trace's data-frame representation.

    The trace is converted with ``trace_to_dataframe`` and the resulting
    frame's ``.values`` array is returned.
    """
    return trace_to_dataframe(trace).values
sampled = 1 else: d = pickle.load(open(pklpath, 'rb')) model, trace, map_estimate = d['model'], d['trace'], d['map_estimate'] print(pm.summary(trace, varnames=["period", "t0", "r", "b", "u", "mean"])) true_d = OrderedDict( {'mean':1, 't0__0':t0s[0], 't0__1':t0s[1], 'period__0':periods[0], 'period__1':periods[1], 'u__0':us[0], 'u__1':us[1], 'r__0':rs[0], 'r__1':rs[1], 'b__0':bs[0], 'b__1':bs[1]} ) trace_df = trace_to_dataframe(trace, varnames=["period", "t0", "r", "b", "u", "mean"]) truths = [true_d[k] for k in true_d.keys()] fig = corner.corner(trace_df, quantiles=[0.16, 0.5, 0.84], show_titles=True, title_kwargs={"fontsize": 12}, truths=truths) fig.savefig('../results/test_results/test_{}_corner.png'.format(modelid)) # phase plots if sampled: for n, letter in enumerate("bc"): plt.figure() # Get the posterior median orbital parameters p = np.median(trace["period"][:, n]) t0 = np.median(trace["t0"][:, n]) # Compute the median of posterior estimate of the contribution from
def write_output(trace, uri, filepath, client):
    """Save the trace as a Parquet file and upload it through ``client``.

    The trace is converted with ``trace_to_dataframe``, written to
    ``filepath`` (pyarrow engine, snappy compression), and that file is
    then handed to ``client.file(uri).putFile``.
    """
    trace_to_dataframe(trace).to_parquet(
        filepath,
        engine="pyarrow",
        compression="snappy",
    )
    remote_file = client.file(uri)
    remote_file.putFile(filepath)