def write_data(self, filename='sim.data.R', n_model=100):
    """Dump the simulated GRB population to an R-dump file for Stan.

    Parameters
    ----------
    filename : str
        Path of the R-dump file to write.
    n_model : int
        Number of grid points for the model Ep axis (spanning 0.5-3.5).
    """
    # Collect everything the Stan model's data block expects.
    stan_data = {
        'N': len(self._ep_obs),
        'N_grbs': self._N_grbs,
        'grb_length': self._grb_lengths,
        'gamma_mu': self._gamma_mu,
        'gamma_sigma': self._gamma_sigma,
        'norm_mu': self._norm_mu,
        'norm_sigma': self._norm_sigma,
        'gamma': self._gammas,
        'norm': self._norms,
        'ep_obs': self._ep_obs,
        'ep_err': self._ep_err,
        'lum_obs': self._luminosity_obs,
        'lum_err': self._luminosity_err,
        'N_model': int(n_model),
        # evaluation grid for posterior-predictive curves
        'ep_model': np.linspace(0.5, 3.5, n_model),
    }
    pystan.stan_rdump(stan_data, filename)
# Let's consider an example in the context of Bayesian inference! # We first simulate an observation and save it to a file N = 1 simu_data = dict(N=N) simu_model = stan_utility.compile_model('simulate_data.stan') simu = simu_model.sampling(data=simu_data, iter=1, chains=1, seed=4838282, algorithm="Fixed_param") data = dict(N=N, y=simu.extract()['y'].flatten()) pystan.stan_rdump(data, 'simulation.data.R') # Now we can read that data back in and use Hamiltonian # Monte Carlo to estimate posterior expectation values input_data = pystan.read_rdump('simulation.data.R') model = stan_utility.compile_model('fit_data.stan') fit = model.sampling(data=input_data, seed=4938483) # Check diagnostics stan_utility.check_all_diagnostics(fit) # That doesn't look good. Let's investigate the divergent # samples in the context of the non-divergent samples to # see what's going on. nondiv_params, div_params = stan_utility.partition_div(fit)
models_store['%s/formula_variables'%hdf_label] = pd.Series(formula_variables) for count_variable in count_variables: baseline = count_variable+'_Mean_for_Year_and_Class_of_New_Immigrants_to_Class' predictors = dmatrix(formula, entries.ix[ind]) stan_data = {'y': asarray(entries.ix[ind, count_variable].astype('int')), 'x': asarray(predictors), 'N': N, 'K': predictors.shape[1], 'baseline': asarray(entries.ix[ind,baseline]) } data_file = 'counts_data_{0}_{1}.stan'.format(count_variable, n_observations) stan_rdump(stan_data, model_directory+data_file) submit_cmdstan_jobs(model,data_file) # In[12]: model = 'joint_counts_sampling_model' hdf_label = 'joint_counts' formula_variables = ['Relatedness', 'np.power(Relatedness, 2)', 'Popularity', 'np.power(Popularity, 2)', 'log(Agent_Previous_Citations_to_Class+1)', 'log(Agent_Productivity_Patents)', 'log(CoAgent_Previous_Patent_Count_in_Class+1)', 'Guided',
# # One-dimensional # ############################################################ ############################################################ # Create data ############################################################ model = stan_utility.compile_model('generate_data.stan') fit = model.sampling(seed=194838, algorithm='Fixed_param', iter=1, chains=1) data = dict(N = fit.extract()['N'].astype(numpy.int64), x_obs = fit.extract()['x_obs'][0,:]) pystan.stan_rdump(data, 'selection.data.R') ############################################################ # Fit model ############################################################ data = pystan.read_rdump('selection.data.R') model = stan_utility.compile_model('selection.stan') fit = model.sampling(data=data, chains=4, seed=4938483, control=dict(adapt_delta=0.9, max_treedepth=12)) # Check diagnostics stan_utility.check_all_diagnostics(fit) # Default visual summaries
def sampling(self, data=None, chains=4, iter=2000, warmup=None, thin=1,
             save_warmup=False, sample_file=None, algorithm=None,
             wait_during_sampling=False, init=None, init_file=None,
             output_file=None, args=None):
    """Run CmdStan's MCMC sampler, mimicking PyStan's StanModel.sampling().

    Exactly one of `data` (dict or pandas.DataFrame) or `sample_file`
    (path to an existing R-dump file) must be given.  Initial parameter
    values may come from `init` (dict/DataFrame) or `init_file`, not both.
    Each chain is launched as a shell command; returns a StanFit4model
    built from the per-chain output CSV files.

    Raises
    ------
    Exception
        On invalid data/sample_file, init/init_file, or algorithm choices.
    """
    # --- write the data file (or use the one supplied) ---
    if ((data is not None) and (sample_file is not None)) or ((data is None) and (sample_file is None)):
        raise Exception('Exactly one of data or sample_file must be specified.')
    if data is not None:
        if isinstance(data, dict):
            data_dict = data
        elif isinstance(data, pandas.DataFrame):
            data_dict = data.to_dict()
        else:
            raise Exception('data must be a dict or a pandas.DataFrame.')
        self.sample_file = '.input.data.R'
        pystan.stan_rdump(data_dict, self.sample_file)
    else:
        self.sample_file = sample_file

    # --- optional initial-parameter file ---
    if (init is not None) and (init_file is not None):
        raise Exception('Initial parameters should be specified either by init (dict) or init_file (file name)')
    init_command = ''
    if init_file is not None:
        self.init_file = init_file
        init_command = ' init=' + self.init_file + ' '
    elif init is not None:
        if isinstance(init, dict):
            init_dict = init
        elif isinstance(init, pandas.DataFrame):
            init_dict = init.to_dict()
        else:
            raise Exception('init must be a dict or a pandas.DataFrame.')
        self.init_file = '.init.param.R'
        pystan.stan_rdump(init_dict, self.init_file)
        init_command = ' init=' + self.init_file + ' '

    # PyStan's `iter` includes warmup, whereas CmdStan's num_samples
    # counts only post-warmup draws -- convert here.
    if warmup is None:
        warmup = iter // 2
    num_samples = iter - warmup
    num_warmup = warmup

    # --- algorithm selection ---
    if algorithm == 'Fixed_param':
        algorithm_engine = 'algorithm=fixed_param'
    elif (algorithm == 'NUTS') or (algorithm is None):
        algorithm_engine = 'algorithm=hmc engine=nuts'
    elif algorithm == 'HMC':
        algorithm_engine = 'algorithm=hmc engine=static'
    else:
        raise Exception('algorithm must be one of Fixed_param, NUTS (default), and HMC.')

    if output_file is None:
        output_file = 'output'

    for i in range(chains):
        command = ''
        command += './' + self.model_name + ' id=' + str(i + 1) + ' sample '
        command += 'num_samples=' + str(num_samples) + ' num_warmup=' + str(num_warmup)
        # BUG FIX: `thin` was accepted but silently ignored; forward it to
        # CmdStan (its default thin=1 matches ours, so this is backward
        # compatible for existing callers).
        command += ' thin=' + str(thin)
        if save_warmup:
            command += ' save_warmup=1'
        command += ' ' + algorithm_engine
        if args is not None:
            command += ' ' + args
        command += ' data file=' + self.sample_file + ' output file=' + output_file + str(i + 1) + '.csv'
        command += init_command
        # Background the chain with '&'; only the final chain runs in the
        # foreground when wait_during_sampling is True.
        if (not wait_during_sampling) or (i < chains - 1):
            command += '&'
        command += '\n'
        os.system(command)
        # FIX: Python-2 `print command` -> function form (valid on 2 and 3).
        print(command)

    # NOTE(review): backgrounded chains may still be running when the CSVs
    # are read -- confirm callers wait for completion before using the fit.
    outputFiles = []
    for i in range(1, chains + 1):
        outputFiles.append(output_file + str(i) + '.csv')
    return StanFit4model(outputFiles)
def variational(self, data=None, sample_file=None,
                algorithm='meanfield', iter=10000,
                grad_samples=1, elbo_samples=100, eta=1.0,
                tol_rel_obj=0.01, output_samples=1000,
                init=None, init_file=None, output_file=None,
                args=None):
    """Interface to CmdStan's ADVI variational inference.

    Exactly one of `data` (dict or pandas.DataFrame) or `sample_file`
    (path to an existing R-dump file) must be supplied.  Returns a
    StanFit4model built from the single output CSV.
    """
    data_given = data is not None
    file_given = sample_file is not None
    if data_given == file_given:
        raise Exception('Exactly one of data or sample_file must be specified.')

    # Materialise the data as an R-dump file for CmdStan.
    if data_given:
        if isinstance(data, dict):
            data_dict = data
        elif isinstance(data, pandas.DataFrame):
            data_dict = data.to_dict()
        else:
            raise Exception('data must be a dict or a pandas.DataFrame.')
        sample_file_name = '.input.data.R'
        pystan.stan_rdump(data_dict, sample_file_name)
    else:
        sample_file_name = sample_file

    # Optional initial-parameter file (init dict/DataFrame or a file path).
    if (init is not None) and (init_file is not None):
        raise Exception('Initial parameters should be specified either by init (dict) or init_file (file name)')
    init_command = ''
    if init_file is not None:
        self.init_file = init_file
        init_command = ' init=' + self.init_file + ' '
    elif init is not None:
        if isinstance(init, dict):
            init_dict = init
        elif isinstance(init, pandas.DataFrame):
            init_dict = init.to_dict()
        else:
            raise Exception('init must be a dict or a pandas.DataFrame.')
        self.init_file = '.init.param.R'
        pystan.stan_rdump(init_dict, self.init_file)
        init_command = ' init=' + self.init_file + ' '

    if output_file is None:
        output_file = 'output'

    # Assemble the CmdStan command line piece by piece, then join once.
    pieces = ['./' + self.model_name + ' variational',
              ' algorithm=' + algorithm.lower(),
              ' iter=' + str(iter)]
    if grad_samples is not None:
        pieces.append(' grad_samples=' + str(grad_samples))
    if elbo_samples is not None:
        pieces.append(' elbo_samples=' + str(elbo_samples))
    if eta is not None:
        pieces.append(' eta=' + str(eta))
    pieces.append(' tol_rel_obj=' + str(tol_rel_obj))
    if args is not None:
        pieces.append(' ' + args)
    pieces.append(' output_samples=' + str(output_samples))
    pieces.append(' data file=' + sample_file_name)
    pieces.append(init_command)
    pieces.append(' output file=' + output_file + '.csv')

    os.system(''.join(pieces))  # this generates a output.csv as default

    return StanFit4model([output_file + '.csv'])
def optimizing(self, data=None, sample_file=None, algorithm=None, iter=2000,
               init=None, init_file=None, output_file=None, args=None):
    """Run CmdStan's optimizer to obtain a point estimate.

    Exactly one of `data` (dict or pandas.DataFrame) or `sample_file`
    (path to an existing R-dump file) must be given.  `algorithm`
    defaults to LBFGS when None.  Returns a collections.OrderedDict of
    the optimized values parsed from the output CSV, mirroring PyStan's
    StanModel.optimizing(); the 'lp__' column is removed first.

    Raises
    ------
    Exception
        On invalid data/sample_file, init/init_file, or algorithm inputs.
    """
    # --- write the data file (FIX: old comment wrongly said ".stan file";
    # this writes the .input.data.R R-dump input) ---
    if ((data is not None) and (sample_file is not None)) or ((data is None) and (sample_file is None)):
        raise Exception('Exactly one of data or sample_file must be specified.')
    if data is not None:
        if isinstance(data, dict):
            data_dict = data
        elif isinstance(data, pandas.DataFrame):
            data_dict = data.to_dict()
        else:
            raise Exception('data must be a dict or a pandas.DataFrame.')
        sampleFileName = '.input.data.R'
        pystan.stan_rdump(data_dict, sampleFileName)
    else:
        sampleFileName = sample_file

    # --- optional initial-parameter file ---
    if (init is not None) and (init_file is not None):
        raise Exception('Initial parameters should be specified either by init (dict) or init_file (file name)')
    init_command = ''
    if init_file is not None:
        self.init_file = init_file
        init_command = ' init=' + self.init_file + ' '
    elif init is not None:
        if isinstance(init, dict):
            init_dict = init
        elif isinstance(init, pandas.DataFrame):
            init_dict = init.to_dict()
        else:
            raise Exception('init must be a dict or a pandas.DataFrame.')
        self.init_file = '.init.param.R'
        pystan.stan_rdump(init_dict, self.init_file)
        init_command = ' init=' + self.init_file + ' '

    if output_file is None:
        output_file = 'output'

    # --- algorithm (default LBFGS) ---
    if (algorithm is not None) and (not isinstance(algorithm, str)):
        raise Exception('algorithm must be a string.')
    elif algorithm is None:
        algorithm = 'LBFGS'

    # --- build and run the CmdStan command ---
    command = ''
    command += './' + self.model_name + ' optimize '
    command += 'algorithm=' + algorithm.lower()
    if args is not None:
        command += ' ' + args
    command += ' iter=' + str(iter)
    command += ' data file=' + sampleFileName
    command += init_command
    command += ' output file=' + output_file + '.csv'
    os.system(command)  # this generates a output.csv as default

    # Parse the CSV (CmdStan comment lines start with '#'), drop the
    # log-density column, and return an OrderedDict to match PyStan.
    outputDataFrame = pandas.read_csv(output_file + '.csv', comment='#')
    retDict = outputDataFrame.to_dict()
    del retDict['lp__']
    return collections.OrderedDict(retDict)
# Fit each count variable separately with the per-count Stan model.
for count_variable in count_variables:
    baseline = count_variable + '_Mean_for_Year_and_Class_of_New_Immigrants_to_Class'
    # FIX: pandas DataFrame.ix was deprecated in 0.20 and removed in 1.0;
    # .loc is the label-based replacement (ind is assumed to be a label or
    # boolean indexer -- confirm against its definition upstream).
    predictors = dmatrix(formula, entries.loc[ind])
    stan_data = {
        'y': asarray(entries.loc[ind, count_variable].astype('int')),
        'x': asarray(predictors),
        'N': N,
        'K': predictors.shape[1],
        'baseline': asarray(entries.loc[ind, baseline])
    }
    data_file = 'counts_data_{0}_{1}.stan'.format(count_variable, n_observations)
    stan_rdump(stan_data, model_directory + data_file)
    submit_cmdstan_jobs(model, data_file)


# In[12]:

# Joint model: all count outcomes share one design matrix built from
# this formula.
model = 'joint_counts_sampling_model'
hdf_label = 'joint_counts'

formula_variables = [
    'Relatedness',
    'np.power(Relatedness, 2)',
    'Popularity',
    'np.power(Popularity, 2)',
    'log(Agent_Previous_Citations_to_Class+1)',
    'log(Agent_Productivity_Patents)',
    'log(CoAgent_Previous_Patent_Count_in_Class+1)',
    'Guided',
    'log(N_Agents)'
]
formula = " + ".join(formula_variables)
models_store['%s/formula_variables' % hdf_label] = pd.Series(formula_variables)
# Assemble the Stan data dictionary for the joint GRB fit and dump it
# as a single R-dump file readable by CmdStan/PyStan.
data_dict.update({
    # dataset dimensions
    'N_all': int(total_number_of_channels_used),
    'N_grbs': len(np.unique(grb_number)),
    'max_n_echan': max_n_echans,
    'max_n_chan': max_n_chans,
    'N_dets': n_dets,
    'N_chan': n_chan.astype(int),
    'N_echan': n_echan.astype(int),
    # observed counts and background model
    'observed_counts': observed_counts,
    'background_counts': background_counts,
    'idx_background_nonzero': idx_background_nonzero.astype(int),
    'idx_background_zero': idx_background_zero.astype(int),
    'N_bkg_zero': n_bkg_zero.astype(int),
    'N_bkg_nonzero': n_bkg_nonzero.astype(int),
    'object_idx': np.array(grb_number).astype(int),
    'background_errors': background_errors,
    # energy/channel bounds and instrument response
    'ebounds_lo': ebounds_lo,
    'ebounds_hi': ebounds_hi,
    'cbounds_lo': cbounds_lo,
    'cbounds_hi': cbounds_hi,
    'exposure': exposures,
    'response': responses,
    'mask': masks.astype(int),
    'N_channels_used': n_channels_used.astype(int),
    'grb_id': np.array(grb_number).astype(int),
    # per-burst luminosity distance and redshift
    'dl': dl,
    'z': z,
})
pystan.stan_rdump(data_dict, 'all_data.R')