class HiPMDP(object):
    """
    The HiP-MDP class can be used to:
    - Create a new batch of experience with an agent learning a policy model-free
      (run_type='modelfree', create_exp_batch=True)
    - Test one of the following methods on a single test instance:
        - Full HiP-MDP with embedded latent weights (run_type='full', load pretrained bnn_network_weights)
        - Full HiP-MDP with linear top latent weights (run_type='full_linear', load pretrained bnn_network_weights)
        - Average model (run_type='onesize', load pretrained bnn_network_weights)
        - Model from scratch (run_type='modelbased')
        - Model-free (run_type='modelfree')
    """

    def __init__(self, domain, preset_hidden_params, run_type='full', ddqn_learning_rate=0.0001,
                 episode_count=500, bnn_hidden_layer_size=25, bnn_num_hidden_layers=2,
                 bnn_network_weights=None, eps_min=0.15, test_inst=None, create_exp_batch=False,
                 num_batch_instances=False, save_results=False, grid_beta=0.23, print_output=False):
        """
        Initialize the framework.

        Arguments:
        domain -- the domain the framework will be used on ('grid', 'hiv', or 'acrobot')
        preset_hidden_params -- list of dictionaries, one per instance, where each dictionary
            contains the hidden parameter settings for that instance

        Keyword arguments:
        run_type -- 'full': constructs a HiP-MDP model through which transfer is facilitated
                and which accelerates policy learning,
            'full_linear': constructs a HiP-MDP model, but uses the latent weights w_b as a linear
                weighting of the model features rather than as inputs, as in the full HiP-MDP model,
            'modelfree': learns a policy based solely on observed transitions,
            'modelbased': builds a model for accelerating policy learning from only the current
                instance's data
        ddqn_learning_rate -- DQN ADAM learning rate (default=0.0001)
        episode_count -- number of episodes per instance (default=500)
        bnn_hidden_layer_size -- number of units in each BNN hidden layer (default=25)
        bnn_num_hidden_layers -- number of BNN hidden layers (default=2)
        bnn_network_weights -- 1-D numpy array of pretrained BNN network weights (default=None)
        eps_min -- minimum epsilon value for the e-greedy policy (default=0.15)
        test_inst -- index of the desired test instance; irrelevant when creating an experience
            batch (default=None)
        create_exp_batch -- whether this framework is used to create an experience batch (default=False)
        num_batch_instances -- number of instances to include when constructing a batch of training
            data; defaults to False so that the domain default is used when not specified
        save_results -- whether to pickle the results to disk (default=False)
        grid_beta -- beta hyperparameter for the grid domain governing the magnitude of the
            "drift" (default=0.23)
        print_output -- print verbose output (default=False)
        """
        self.__initialize_params()
        # Store arguments
        self.domain = domain
        self.ddqn_learning_rate = ddqn_learning_rate
        self.run_type = run_type
        if self.run_type in ['full', 'full_linear']:
            self.run_type_full = True
        else:
            self.run_type_full = False
        self.preset_hidden_params = preset_hidden_params
        self.bnn_hidden_layer_size = bnn_hidden_layer_size
        self.bnn_num_hidden_layers = bnn_num_hidden_layers
        self.bnn_network_weights = bnn_network_weights
        self.eps_min = eps_min
        self.test_inst = test_inst
        self.create_exp_batch = create_exp_batch
        self.num_batch_instances = num_batch_instances
        self.save_results = save_results
        self.grid_beta = grid_beta
        self.print_output = print_output
        # Set domain-specific hyperparameters
        self.__set_domain_hyperparams()
        self.episode_count = episode_count
        # Set the epsilon step size
        if self.run_type == 'modelfree':
            self.eps_step = (self.eps_max - self.eps_min) / self.episode_count
        else:
            self.eps_step = (self.eps_max - self.eps_min) / self.num_approx_episodes

    def __initialize_params(self):
        """Initialize standard framework settings."""
        self.instance_count = 1  # number of task instances
        self.episode_count = 500  # number of episodes
        self.weight_count = 5  # number of latent weights
        self.eps_max = 1.0  # initial epsilon value for the e-greedy policy
        self.bnn_and_latent_update_interval = 10  # number of episodes between BNN and latent weight updates
        self.num_strata_samples = 5  # number of samples taken from each stratum of the experience buffer
        self.ddqn_batch_size = 50  # number of data points pulled from the experience buffer for replay
        self.tau = 0.005  # transfer rate between the primary DQN and the target DQN
        self.discount_rate = 0.99  # RL discount rate on expected future rewards
        self.beta_zero = 0.5  # initial bias-correction parameter for importance sampling in prioritized experience replay
        self.bnn_num_samples = 50  # number of samples of network weights drawn for each BNN prediction
        self.bnn_batch_size = 32
        self.bnn_v_prior = 1.0  # prior variance on the BNN parameters
        self.bnn_training_epochs = 100  # number of epochs of SGD in each BNN update
        self.num_episodes_avg = 30  # number of episodes in the moving-average reward used to decide when to stop DQN training
        self.num_approx_episodes = 500  # number of approximate rollouts using the BNN to train the DQN
        self.state_diffs = True  # use the BNN to predict (s'-s) rather than s'
        self.num_bnn_updates = 3  # number of calls to update_BNN()
        self.wb_learning_rate = 0.0005  # latent weight learning rate
        self.num_batch_updates = 5  # number of minibatch updates to the DQN
        self.bnn_alpha = 0.5  # BNN alpha-divergence parameter
        self.policy_update_interval = 10  # main DQN update interval (in timesteps)
        self.target_update_interval = 10  # target DQN update interval (in timesteps)
        self.ddqn_hidden_layer_sizes = [256, 512]  # DDQN hidden layer sizes
        self.eps_decay = 0.995  # epsilon decay rate
        self.grad_clip = 2.5  # DDQN gradient clipping (by norm)
        self.ddqn_batch_size = 50  # DDQN batch size
        # Prioritized experience replay hyperparameters
        self.PER_alpha = 0.2
        self.PER_beta_zero = 0.1
        self.tau = 0.005  # DQN target network update proportion
        self.wb_num_epochs = 100  # number of epochs of SGD in each latent weight update

    def __set_domain_hyperparams(self):
        """Set domain-specific hyperparameters."""
        self.standardize_rewards = False
        self.standardize_states = False
        # Acrobot settings
        if self.domain == 'acrobot':
            from acrobot_simulator.acrobot import Acrobot as model
            if self.create_exp_batch:
                if self.num_batch_instances:
                    self.instance_count = self.num_batch_instances
                else:
                    self.instance_count = 8  # number of instances to include in the experience batch
            self.max_task_examples = 400  # maximum number of time steps per episode
            self.min_avg_rwd_per_ep = -12  # minimum average reward before stopping DQN training
            self.bnn_learning_rate = 0.00025
            self.num_initial_update_iters = 5  # number of initial updates to the BNN and latent weights
            self.bnn_start = 400  # number of time steps observed before starting BNN training
            self.dqn_start = 400  # number of time steps observed before starting DQN training
        # Grid settings
        elif self.domain == 'grid':
            from grid_simulator.grid import Grid as model
            if self.create_exp_batch:
                if self.num_batch_instances:
                    self.instance_count = self.num_batch_instances
                else:
                    self.instance_count = 2
            self.max_task_examples = 100
            self.min_avg_rwd_per_ep = 980
            self.bnn_learning_rate = 0.00005
            self.num_initial_update_iters = 10
            # self.num_initial_update_iters = 1
            self.num_approx_episodes = 1000  # use extra approximate episodes since finding the goal state takes a bit of luck
            # self.num_approx_episodes = 3
            self.bnn_start = 100
            self.dqn_start = 100
            self.wb_num_epochs = 300
            if self.run_type_full:
                self.eps_decay = np.exp(np.log(self.eps_min) / self.num_approx_episodes)
            # In order to learn, model-based from scratch needs some adjustments
            if self.run_type == 'modelbased':
                self.bnn_learning_rate = 0.0005
                self.dqn_start = 400
        # HIV settings
        elif self.domain == 'hiv':
            from hiv_simulator.hiv import HIVTreatment as model
            if self.create_exp_batch:
                if self.num_batch_instances:
                    self.instance_count = self.num_batch_instances
                else:
                    self.instance_count = 5
            self.max_task_examples = 200
            self.min_avg_rwd_per_ep = 1e15
            self.bnn_learning_rate = 0.00025
            self.num_initial_update_iters = 10
            self.bnn_start = 200
            self.dqn_start = 200
            self.standardize_rewards = True
            self.bnn_alpha = 0.45  # alpha-divergence hyperparameter
            self.bnn_batch_size = 100  # draw 500 samples total
            self.standardize_states = True
        else:
            raise NameError('invalid domain')
        # Size of the buffer for storing the batch of experiences
        self.general_bnn_buffer_size = self.instance_count * self.max_task_examples * self.episode_count
        # Size of the experience buffer for the test instance. Note: all experiences are stored
        self.instance_buffer_size = self.max_task_examples * self.episode_count
        # Size of the fictional experience buffer
        self.instance_fictional_buffer_size = self.num_approx_episodes * self.episode_count
        if self.domain == 'grid':
            self.task = model(beta=self.grid_beta)
        else:
            self.task = model()
        self.var_params = self.task.perturb_params  # names of hidden parameters to be varied
        self.num_actions = self.task.num_actions  # number of actions
        self.num_dims = len(self.task.observe())  # number of state dimensions
        # Create the set of parameters for each experience replay instantiation
        self.experience_replay_param_set = {
            'episode_count': self.episode_count,
            'instance_count': self.instance_count,
            'max_task_examples': self.max_task_examples,
            'ddqn_batch_size': self.ddqn_batch_size,
            'num_strata_samples': self.num_strata_samples,
            'PER_alpha': self.PER_alpha,
            'PER_beta_zero': self.PER_beta_zero,
            'bnn_batch_size': self.bnn_batch_size,
            'dqn_start': self.dqn_start,
            'bnn_start': self.bnn_start
        }

    def __get_instance_param_set(self):
        """Get the preset hidden parameter setting for this instance."""
        if self.create_exp_batch:
            instance_idx = self.instance_iter
        else:
            instance_idx = self.test_inst
        self.instance_param_set = self.preset_hidden_params[instance_idx]

    def __encode_action(self, action):
        """One-hot encode the integer action supplied."""
        a = np.array([0] * self.num_actions)
        a[action] = 1
        return a

    def __load_reward_standardization(self):
        """Load the reward mean and standard deviation."""
        with open('preset_parameters/' + self.domain + '_rewards_standardization', 'r') as f:
            self.rewards_standardization = pickle.load(f)

    def __load_state_standardization(self):
        """Load the state mean and standard deviation."""
        with open('preset_parameters/' + self.domain + '_standardization_arrays', 'r') as f:
            self.state_mean, self.state_std = pickle.load(f)

    def __standardize_state(self, state):
        """Standardize and return the given state."""
        return (state - self.state_mean) / self.state_std

    def __update_target_graph(self):
        """Helper function for updating the target DQN."""
        self.op_holder = []
        total_vars = len(self.trainables)
        for idx, var in enumerate(self.trainables[0:int(total_vars / 2)]):
            self.op_holder.append(self.trainables[idx + int(total_vars / 2)].assign(
                (var.value() * self.tau) + ((1 - self.tau) * self.trainables[idx + int(total_vars / 2)].value())))
        return self.op_holder

    def __update_target(self):
        """Helper function for updating the target DQN."""
        for op in self.op_holder:
            self.sess.run(op)

    def __apply_minibatch_update(self):
        """Train the main DQN using minibatch updates."""
        if self.run_type == 'modelfree':
            exp_buffer = self.real_buffer
        else:
            exp_buffer = self.fictional_buffer
        for batch_idx in range(self.num_batch_updates):
            # Draw an experience sample and importance weights
            train_batch, is_weights, indices = exp_buffer.sample(self.instance_steps)
            # Calculate the DDQN target
            feed_dict = {self.mainDQN.s: np.vstack(train_batch[:, 3])}
            Q1 = self.sess.run(self.mainDQN.predict, feed_dict=feed_dict)
            feed_dict = {self.targetDQN.s: np.vstack(train_batch[:, 3])}
            Q2 = self.sess.run(self.targetDQN.output, feed_dict=feed_dict)
            double_Q = Q2[range(train_batch.shape[0]), Q1]
            target_Q = train_batch[:, 2] + self.discount_rate * double_Q
            # Calculate the TD errors of the sample
            feed_dict = {self.mainDQN.s: np.vstack(train_batch[:, 0]),
                         self.mainDQN.next_Q: target_Q,
                         self.mainDQN.action_array: np.vstack(train_batch[:, 1])}
            td_loss = self.sess.run(self.mainDQN.td_loss, feed_dict=feed_dict)
            # Update the priority queue with the observed td_loss from the selected minibatch
            # and reinsert the sampled batch into the priority queue
            if self.run_type == 'modelfree':
                self.real_buffer.update_priorities(np.hstack((np.reshape(td_loss, (len(td_loss), -1)),
                                                              np.reshape(indices, (len(indices), -1)))))
            else:
                self.fictional_buffer.update_priorities(np.hstack((np.reshape(td_loss, (len(td_loss), -1)),
                                                                   np.reshape(indices, (len(indices), -1)))))
            # Update the DDQN
            feed_dict = {self.mainDQN.s: np.vstack(train_batch[:, 0]),
                         self.mainDQN.next_Q: target_Q,
                         self.mainDQN.action_array: np.vstack(train_batch[:, 1]),
                         self.mainDQN.importance_weights: is_weights,
                         self.mainDQN.learning_rate: self.ddqn_learning_rate}
            self.sess.run(self.mainDQN.updateQ, feed_dict=feed_dict)

    def __initialize_BNN(self):
        """Initialize the BNN and set pretrained network weights (if supplied)."""
        # Generate BNN layer sizes
        if self.run_type != 'full_linear':
            bnn_layer_sizes = ([self.num_dims + self.num_actions + self.weight_count]
                               + [self.bnn_hidden_layer_size] * self.bnn_num_hidden_layers
                               + [self.num_dims])
        else:
            bnn_layer_sizes = ([self.num_dims + self.num_actions]
                               + [self.bnn_hidden_layer_size] * self.bnn_num_hidden_layers
                               + [self.num_dims * self.weight_count])
        # Activation function
        relu = lambda x: np.maximum(x, 0.0)
        # Gather parameters
        param_set = {
            'bnn_layer_sizes': bnn_layer_sizes,
            'weight_count': self.weight_count,
            'num_state_dims': self.num_dims,
            'bnn_num_samples': self.bnn_num_samples,
            'bnn_batch_size': self.bnn_batch_size,
            'num_strata_samples': self.num_strata_samples,
            'bnn_training_epochs': self.bnn_training_epochs,
            'bnn_v_prior': self.bnn_v_prior,
            'bnn_learning_rate': self.bnn_learning_rate,
            'bnn_alpha': self.bnn_alpha,
            'wb_learning_rate': self.wb_learning_rate,
            'wb_num_epochs': self.wb_num_epochs
        }
        if self.run_type != 'full_linear':
            self.network = BayesianNeuralNetwork(param_set, nonlinearity=relu)
        else:
            self.network = BayesianNeuralNetwork(param_set, nonlinearity=relu, linear_latent_weights=True)
        # Use previously trained network weights
        if self.bnn_network_weights is not None:
            self.network.weights = self.bnn_network_weights

    def __initialize_DDQN(self):
        """Initialize the Double DQN."""
        tf.compat.v1.reset_default_graph()
        self.mainDQN = Qnetwork(self.num_dims, self.num_actions, clip=self.grad_clip,
                                activation_fn=tf.nn.relu, hidden_layer_sizes=self.ddqn_hidden_layer_sizes)
        self.targetDQN = Qnetwork(self.num_dims, self.num_actions, clip=self.grad_clip,
                                  activation_fn=tf.nn.relu, hidden_layer_sizes=self.ddqn_hidden_layer_sizes)
        init = tf.compat.v1.global_variables_initializer()
        self.trainables = tf.compat.v1.trainable_variables()
        self.targetOps = self.__update_target_graph()
        self.sess = tf.compat.v1.Session()
        self.sess.run(init)
        self.__update_target()

    def __e_greedy_policy(self, state):
        """Select an action using the epsilon-greedy policy."""
        if np.random.rand(1) < self.eps:
            action = np.random.randint(0, self.num_actions)
        else:
            action = self.sess.run(self.mainDQN.predict,
                                   feed_dict={self.mainDQN.s: state.reshape(1, -1)})[0]
        self.action_counts += self.__encode_action(action)
        return action

    def __update_BNN(self):
        """Update the BNN using data from the test instance."""
        self.network.fit_network(self.instance_bnn_buffer, self.full_task_weights, self.instance_steps,
                                 state_diffs=self.state_diffs, use_all_exp=False)
        if self.print_output:
            print('Updated BNN after episode {}'.format(self.episode_iter))

    def __update_latent_weights(self):
        """Update the latent weights using data from the test instance."""
        self.weight_set = self.network.optimize_latent_weighting_stochastic(
            self.instance_bnn_buffer, self.weight_set, self.instance_steps,
            state_diffs=self.state_diffs, use_all_exp=False)
        self.full_task_weights[self.instance_iter, :] = self.weight_set
        if self.print_output:
            print('Updated latent weights after episode {}'.format(self.episode_iter))

    def __compute_bnn_training_error(self):
        """Compute the BNN training error on the most recent episode."""
        exp = np.reshape(self.episode_buffer_bnn, (len(self.episode_buffer_bnn), -1))
        episode_X = np.array([np.hstack([exp[tt, 0], exp[tt, 1]]) for tt in range(exp.shape[0])])
        episode_Y = np.array([exp[tt, 3] for tt in range(exp.shape[0])])
        if self.state_diffs:
            # subtract the previous state
            episode_Y -= episode_X[:, :self.num_dims]
        l2_errors = self.network.get_td_error(
            np.hstack([episode_X, np.tile(self.weight_set, (episode_X.shape[0], 1))]), episode_Y, 0.0, 1.0)
        self.mean_episode_errors[self.instance_iter, self.episode_iter] = np.mean(l2_errors)
        self.std_episode_errors[self.instance_iter, self.episode_iter] = np.std(l2_errors)
        if self.print_output:
            print('BNN Error: {}'.format(self.mean_episode_errors[self.instance_iter, self.episode_iter]))

    def __sample_start_state(self):
        """Randomly choose and return a start state from the list of all observed start states."""
        return self.start_states[np.random.randint(0, len(self.start_states))]

    def run_fictional_episode(self):
        """Perform an episode using the BNN to approximate transitions without using the environment.

        Train the DQN using this approximate experience data.
        """
        ep_steps = 0
        ep_reward = 0
        # Sample a start state from previously observed start states
        state = np.copy(self.__sample_start_state())
        r_fake = 0.0
        # Keep track of the unstandardized reward since it is more interpretable
        if self.standardize_rewards:
            un_std_reward = 0.0
        while ep_steps < self.max_task_examples:
            ep_steps += 1
            self.approx_steps += 1
            action = self.__e_greedy_policy(state)
            aug_state = np.hstack([state, self.__encode_action(action),
                                   self.weight_set.reshape(self.weight_set.shape[1],)]).reshape((1, -1))
            # Note: if self.standardize_states == True, then the BNN output is a standardized state
            next_state = self.network.feed_forward(aug_state).flatten()
            if self.state_diffs:
                next_state += state
            # Undo the state standardization in order to calculate the reward
            if self.standardize_states:
                reward_state = state * self.state_std + self.state_mean
                reward_next_state = next_state * self.state_std + self.state_mean
            else:
                reward_state = state
                reward_next_state = next_state
            if self.domain == 'grid':
                # In the grid domain, the reward is calculated as R(s,a)
                reward = self.task.calc_reward(action=action, state=reward_state,
                                               latent_code=self.instance_param_set)
            else:
                # In all other domains, the reward is calculated as R(s',a)
                reward = self.task.calc_reward(action=action, state=reward_next_state,
                                               latent_code=self.instance_param_set)
            if self.standardize_rewards:
                reward = (reward - self.reward_mean) / self.reward_std
                un_std_reward += reward
            r_fake += reward
            self.fictional_buffer.add(np.reshape(
                np.array([state, self.__encode_action(action), reward, next_state]), (1, 4)))
            state = next_state
            if self.approx_steps >= self.dqn_start and self.train_dqn:
                # Update the main DQN
                if self.approx_steps % self.policy_update_interval == 0:
                    self.__apply_minibatch_update()
                # Update the target DQN
                if self.approx_steps % self.target_update_interval == 0:
                    self.__update_target()
        if self.print_output:
            print('Completed Instance {}, Approx. Episode {}'.format(self.instance_iter, self.approx_episode_iter))
            if self.standardize_rewards:
                print('BNN Reward: {}'.format(un_std_reward))
            else:
                print('BNN Reward: {}'.format(r_fake))
            print('Action counts: {}'.format(self.action_counts))

    def run_episode(self):
        """Run an episode on the environment (and train the DQN if modelfree)."""
        self.task.reset(perturb_params=True, **self.instance_param_set)
        state = self.task.observe()
        if self.standardize_states:
            state = self.__standardize_state(state)
        self.start_states.append(state)
        self.episode_buffer_bnn = []
        ep_reward = 0
        # The task is done after max_task_examples timesteps or when the agent enters a terminal state
        while not self.task.is_done(self.max_task_examples):
            self.total_steps += 1
            self.instance_steps += 1
            action = self.__e_greedy_policy(state)
            reward, next_state = self.task.perform_action(action, perturb_params=True, **self.instance_param_set)
            ep_reward += reward
            if self.standardize_rewards:
                reward = (reward - self.reward_mean) / self.reward_std
            if self.standardize_states:
                next_state = self.__standardize_state(next_state)
            if self.run_type == 'modelfree':
                self.real_buffer.add(np.reshape(
                    np.array([state, self.__encode_action(action), reward, next_state]), [1, 4]))
            self.episode_buffer_bnn.append(np.reshape(
                np.array([state, self.__encode_action(action), reward, next_state, self.instance_iter]), [1, 5]))
            state = next_state
            # For modelfree runs, update the DQN using experience from the environment on a set interval
            if self.run_type == 'modelfree' and self.instance_steps >= self.dqn_start and self.train_dqn:
                # Update the main DQN
                if self.instance_steps % self.policy_update_interval == 0:
                    self.__apply_minibatch_update()
                # Update the target DQN
                if self.instance_steps % self.target_update_interval == 0:
                    self.__update_target()
        # Store results at the end of the episode
        self.rewards[self.instance_iter, self.episode_iter] = ep_reward
        # Calculate the moving average reward
        last_rwds = self.rewards[self.instance_iter,
                                 np.maximum(self.episode_iter - self.num_episodes_avg + 1, 0):self.episode_iter + 1]
        self.avg_rwd_per_ep[self.instance_iter, self.episode_iter] = np.mean(last_rwds)
        # If creating an experience batch, store the experience in the general buffer
        if self.create_exp_batch:
            self.general_bnn_buffer.add(np.reshape(self.episode_buffer_bnn, [-1, 5]))
        # If using a model-based approach, store in the instance buffer for updating the BNN
        # (and the latent weights for full runs)
        if self.run_type != 'modelfree':
            self.instance_bnn_buffer.add(np.reshape(self.episode_buffer_bnn, [-1, 5]))
        if self.print_output:
            print('Completed Instance {}, Episode {}'.format(self.instance_iter, self.episode_iter))
            print('Total Reward: {}'.format(ep_reward))
            print('Epsilon: {}'.format(self.eps))
            print('Moving Average Reward: {}'.format(self.avg_rwd_per_ep[self.instance_iter, self.episode_iter]))
            print('Action counts: {}'.format(self.action_counts))

    def run_instance(self):
        """Learn a policy and update the BNN (if desired) over the course of a single instance."""
        print('start of run instance')
        # Get the hidden parameter setting for this instance
        self.__get_instance_param_set()
        self.sys_param_set.append(self.instance_param_set)
        # Initialize the latent weights
        # For full HiP-MDP runs, use random latent weights (or mean latent weights)
        if self.run_type_full:
            self.weight_set = np.atleast_2d(np.random.normal(0, 0.1, self.weight_count))
        # Otherwise use ones
        else:
            self.weight_set = np.atleast_2d(np.ones(self.weight_count))
        self.full_task_weights[self.instance_iter, :] = self.weight_set
        # Initialize the DQN and BNN
        self.__initialize_DDQN()
        if self.run_type == 'modelfree':
            self.train_dqn = True
            self.train_bnn = False
            self.initial_bnn_collection = False
        else:
            self.__initialize_BNN()
            self.train_dqn = False
            self.initial_bnn_collection = True  # initial collection period for the BNN
            self.train_bnn = True
        # Initialize experience buffers
        if self.run_type == 'modelfree':
            # Prioritized experience replay used for training the modelfree DQN from the environment
            self.real_buffer = ExperienceReplay(self.experience_replay_param_set,
                                                buffer_size=self.instance_buffer_size)
        else:
            # Prioritized experience replay used for training model-based DQNs from approximate BNN rollouts
            self.fictional_buffer = ExperienceReplay(self.experience_replay_param_set,
                                                     buffer_size=self.instance_fictional_buffer_size)
            # Prioritized experience replay used for training the BNN (and latent weights) from the environment
            self.instance_bnn_buffer = ExperienceReplay(self.experience_replay_param_set, buffer_type='BNN',
                                                        buffer_size=self.instance_buffer_size)
        # Load the reward standardization for this instance
        if self.standardize_rewards:
            if self.create_exp_batch:
                instance_idx = self.instance_iter
            else:
                instance_idx = self.test_inst
            self.reward_mean, self.reward_std = self.rewards_standardization[instance_idx]
        # Other initializations
        self.eps = self.eps_max  # set epsilon
        self.start_states = []
        self.action_counts = np.zeros(self.num_actions)  # counts of each on-policy action
        self.episode_iter = 0  # episode number
        self.instance_steps = 0  # number of steps taken on the environment in this instance
        self.approx_steps = 0  # number of steps taken using approximate BNN rollouts
        self.approx_episode_iter = 0  # approximate rollout number
        just_completed_first_update = False
        # Run episodes
        while self.episode_iter < self.episode_count:
            print('before running episode')
            self.run_episode()
            if self.run_type != 'modelfree' and not self.initial_bnn_collection:
                self.run_fictional_episode()
                self.approx_episode_iter += 1
            if self.run_type != 'modelfree':
                self.__compute_bnn_training_error()
            # Update the BNN and latent weights on a set interval
            if self.run_type != 'modelfree':
                if ((self.instance_steps >= self.bnn_start and self.initial_bnn_collection)
                        or ((self.episode_iter + 1) % self.bnn_and_latent_update_interval == 0)
                        or just_completed_first_update) and self.train_bnn:
                    # For full runs: oscillate between updating latent weights and updating the BNN
                    # For all other model-based benchmarks: only update the BNN
                    # Perform additional BNN/latent-weight updates after the first update, before
                    # starting to train the DQN, and after the first set of approximate rollouts
                    if self.initial_bnn_collection or just_completed_first_update:
                        just_completed_first_update = not just_completed_first_update
                        exp = np.reshape(self.episode_buffer_bnn, (len(self.episode_buffer_bnn), -1))
                        episode_X = np.array([np.hstack([exp[tt, 0], exp[tt, 1]]) for tt in range(exp.shape[0])])
                        episode_Y = np.array([exp[tt, 3] for tt in range(exp.shape[0])])
                        if self.state_diffs:
                            # subtract the previous state
                            episode_Y -= episode_X[:, :self.num_dims]
                        for update_iter in range(self.num_initial_update_iters):
                            if self.run_type_full:
                                self.__update_latent_weights()
                                l2_errors = self.network.get_td_error(
                                    np.hstack([episode_X, np.tile(self.weight_set, (episode_X.shape[0], 1))]),
                                    episode_Y, 0.0, 1.0)
                                if self.print_output:
                                    print('BNN Error after latent update iter {}: {}'.format(update_iter, np.mean(l2_errors)))
                            self.__update_BNN()
                            l2_errors = self.network.get_td_error(
                                np.hstack([episode_X, np.tile(self.weight_set, (episode_X.shape[0], 1))]),
                                episode_Y, 0.0, 1.0)
                            if self.print_output:
                                print('BNN Error after BNN update iter {}: {}'.format(update_iter, np.mean(l2_errors)))
                    else:
                        for update_iter in range(self.num_bnn_updates):
                            if self.run_type_full:
                                self.__update_latent_weights()
                            self.__update_BNN()
                    # Start training the DQN after dqn_start steps on the real environment
                    if self.initial_bnn_collection and self.instance_steps >= self.dqn_start:
                        print("approx episodes = ", self.num_approx_episodes)
                        self.train_dqn = True
                        self.initial_bnn_collection = False
                        # Approximate episodes
                        while self.approx_episode_iter < self.num_approx_episodes:
                            self.run_fictional_episode()
                            self.approx_episode_iter += 1
                            # Decay epsilon
                            if self.eps > self.eps_min:
                                self.eps *= self.eps_decay
            # Decay epsilon
            if self.instance_steps > self.dqn_start and self.eps > self.eps_min:
                self.eps *= self.eps_decay
            # Stop training if a good policy has been learned
            if (self.avg_rwd_per_ep[self.instance_iter, self.episode_iter] >= self.min_avg_rwd_per_ep) \
                    and (self.episode_iter + 1 >= self.num_episodes_avg):
                self.train_dqn = False
                self.eps = self.eps_min
                if self.print_output:
                    print('Reached minimum average reward. Stopping training.')
            self.episode_iter += 1
        # Close the TensorFlow session
        self.sess.close()

    def run_experiment(self):
        """Run the experiment: either create a batch of experience or test a method on a single instance."""
        if self.create_exp_batch:
            self.general_bnn_buffer = ExperienceReplay(self.experience_replay_param_set, buffer_type='BNN',
                                                       buffer_size=self.general_bnn_buffer_size,
                                                       general=True, mem_priority=False)
        # Total reward per episode
        self.rewards = np.zeros((self.instance_count, self.episode_count))
        # Moving average of total rewards per episode
        self.avg_rwd_per_ep = np.zeros((self.instance_count, self.episode_count))
        self.total_steps = 0
        # Storage for the latent weighting of each instance
        self.full_task_weights = np.zeros((self.instance_count, self.weight_count))
        # Storage for the hidden parameters of each instance
        self.sys_param_set = []
        # Storage for BNN training errors
        self.mean_episode_errors = np.zeros((self.instance_count, self.episode_count))
        self.std_episode_errors = np.zeros((self.instance_count, self.episode_count))
        self.instance_iter = 0
        if self.standardize_rewards:
            self.__load_reward_standardization()
        if self.standardize_states:
            self.__load_state_standardization()
        save_filename = self.domain + '_' + self.run_type + '_results_inst'
        while self.instance_iter < self.instance_count:
            self.run_instance()
            self.instance_iter += 1
        # Save results
        networkweights = None
        if self.run_type != 'modelfree':
            networkweights = self.network.weights
        # Save off the current results
        if self.create_exp_batch:
            exp_buffer = self.general_bnn_buffer.exp_buffer
        else:
            if self.run_type != 'modelfree':
                exp_buffer = self.instance_bnn_buffer.exp_buffer
            else:
                exp_buffer = self.real_buffer.exp_buffer
        if self.save_results:
            with open(self.domain + '_' + self.run_type + '_results_inst' + str(self.default_inst)
                      + '_uaiHiP_larger_exp_replay_preload_{}'.format(self.run), 'w') as f:
                pickle.dump((exp_buffer, networkweights, self.rewards, self.avg_rwd_per_ep,
                             self.full_task_weights, self.sys_param_set, self.mean_episode_errors,
                             self.std_episode_errors), f)
        return (exp_buffer, networkweights, self.rewards, self.avg_rwd_per_ep, self.full_task_weights,
                self.sys_param_set, self.mean_episode_errors, self.std_episode_errors)
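
# Example usage (a minimal sketch, not part of the original class): this mirrors the workflow
# driven by train_function below -- create a batch of model-free experience, train a BNN
# offline on that batch (see the script further down), and then test the full HiP-MDP on a
# single instance with the pretrained weights. The grid hidden-parameter presets follow those
# used in train_function; `pretrained_bnn_weights` is a placeholder for the 1-D weight array
# produced by the offline BNN training.
#
#   preset_hidden_params = [{'latent_code': 1}, {'latent_code': 2}]
#   # 1) Generate a batch of model-free experience across instances
#   batch_gen = HiPMDP('grid', preset_hidden_params, run_type='modelfree',
#                      create_exp_batch=True, num_batch_instances=2,
#                      grid_beta=0.1, print_output=True)
#   exp_buffer = batch_gen.run_experiment()[0]
#   # 2) Train a BNN offline on exp_buffer, yielding pretrained_bnn_weights, then...
#   # 3) Test the full HiP-MDP on one test instance with those weights
#   hipmdp = HiPMDP('grid', preset_hidden_params, run_type='full',
#                   bnn_network_weights=pretrained_bnn_weights, test_inst=1,
#                   grid_beta=0.1, print_output=True)
#   results = hipmdp.run_experiment()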
    'bnn_v_prior': 1.0,
    'bnn_learning_rate': 0.00005,
    'bnn_alpha': 0.5,
    'wb_num_epochs': 1,
    'wb_learning_rate': 0.0005
}

# Initialize latent weights for each instance
full_task_weights = np.random.normal(0., 0.1, (batch_generator_hipmdp.instance_count, num_wb))

# Initialize BNN
network = BayesianNeuralNetwork(param_set, nonlinearity=relu)
# print('BNN initialized')

# Compute error before training
l2_errors = network.get_td_error(np.hstack((X, full_task_weights[inst_indices])), y,
                                 location=0.0, scale=1.0, by_dim=False)
print("Before training: Mean Error: {}, Std Error: {}".format(np.mean(l2_errors), np.std(l2_errors)))
print("L2 Difference in latent weights between instances: {}".format(
    np.sum((full_task_weights[0] - full_task_weights[1])**2)))
def train_function(config, config_suffix=None):

    config_main = config['main']
    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    domain = config_main['domain']  # 2D, acrobot, hiv
    if domain == '2D':
        domain_tk = 'grid'  # they used 'grid' instead of '2D'
    else:
        domain_tk = domain

    if config_suffix is None:
        config_filename = "config_{}.json".format(domain)
    else:
        config_filename = "config_{}{}.json".format(domain, config_suffix)
    with open('../alg/{}'.format(config_filename)) as f:
        config_domain = json.load(f)
    config_bnn = config[domain]

    dir_name = '../results/%s' % config_main['dir_name']
    if not os.path.isdir(dir_name):
        os.makedirs(dir_name)

    run_type = 'modelfree'
    num_batch_instances = config_bnn['num_batch_instances']  # 2, 8, 5
    if domain == '2D':
        preset_hidden_params = [{'latent_code': 1}, {'latent_code': 2}]
    elif domain == 'acrobot' or domain == 'hiv':
        with open('preset_parameters/%s' % config_domain['params_filename'], 'r') as f:
            preset_parameters = pickle.load(f)
        preset_hidden_params = preset_parameters[:num_batch_instances]
    ddqn_learning_rate = 0.0005
    episode_count = 500
    bnn_hidden_layer_size = config_bnn['bnn_hidden_layer_size']  # 25, 32, 32
    bnn_num_hidden_layers = config_bnn['bnn_num_hidden_layers']  # 3, 2, 2
    bnn_network_weights = None
    eps_min = 0.15
    test_inst = None
    create_exp_batch = True
    state_diffs = True
    grid_beta = 0.1

    batch_generator_hipmdp = HiPMDP(domain_tk, preset_hidden_params,
                                    ddqn_learning_rate=ddqn_learning_rate,
                                    episode_count=episode_count,
                                    run_type=run_type,
                                    eps_min=eps_min,
                                    create_exp_batch=create_exp_batch,
                                    num_batch_instances=num_batch_instances,
                                    grid_beta=grid_beta,
                                    print_output=True,
                                    config_domain=config_domain)

    t_start = time.time()
    (exp_buffer, networkweights, rewards, avg_rwd_per_ep, full_task_weights,
     sys_param_set, mean_episode_errors, std_episode_errors) = batch_generator_hipmdp.run_experiment()

    with open('{}/{}_exp_buffer'.format(dir_name, domain), 'w') as f:
        pickle.dump(exp_buffer, f)
    # with open('{}/{}_exp_buffer'.format(dir_name, domain), 'r') as f:
    #     exp_buffer = pickle.load(f)

    # Create numpy array
    exp_buffer_np = np.vstack(exp_buffer)
    # Collect the instance that each transition came from
    inst_indices = exp_buffer_np[:, 4]
    inst_indices = inst_indices.astype(int)
    # Group experiences by instance
    # Create a dictionary where keys are instance indexes and values are np.arrays of experiences
    exp_dict = {}
    for idx in xrange(batch_generator_hipmdp.instance_count):
        exp_dict[idx] = exp_buffer_np[inst_indices == idx]
    X = np.array([np.hstack([exp_buffer_np[tt, 0], exp_buffer_np[tt, 1]]) for tt in range(exp_buffer_np.shape[0])])
    y = np.array([exp_buffer_np[tt, 3] for tt in range(exp_buffer_np.shape[0])])
    num_dims = config_domain['n_state']  # 2, 4, 6
    num_actions = config_domain['n_action']  # 4, 3, 4
    num_wb = 5
    if state_diffs:
        # subtract previous state
        y -= X[:, :num_dims]

    relu = lambda x: np.maximum(x, 0.)
    param_set = {
        'bnn_layer_sizes': [num_dims + num_actions + num_wb] + [bnn_hidden_layer_size] * bnn_num_hidden_layers + [num_dims],
        'weight_count': num_wb,
        'num_state_dims': num_dims,
        'bnn_num_samples': 50,
        'bnn_batch_size': 32,
        'num_strata_samples': 5,
        'bnn_training_epochs': 1,
        'bnn_v_prior': 1.0,
        'bnn_learning_rate': config_bnn['bnn_learning_rate'],  # 5e-5, 2.5e-4, 2.5e-4
        'bnn_alpha': config_bnn['bnn_alpha'],  # 0.5, 0.5, 0.45
        'wb_num_epochs': 1,
        'wb_learning_rate': 0.0005
    }

    # Initialize latent weights for each instance
    full_task_weights = np.random.normal(0., 0.1, (batch_generator_hipmdp.instance_count, num_wb))
    # Initialize BNN
    network = BayesianNeuralNetwork(param_set, nonlinearity=relu)

    # Compute error before training
    l2_errors = network.get_td_error(np.hstack((X, full_task_weights[inst_indices])), y,
                                     location=0.0, scale=1.0, by_dim=False)
    print("Before training: Mean Error: {}, Std Error: {}".format(np.mean(l2_errors), np.std(l2_errors)))
    print("L2 Difference in latent weights between instances: {}".format(
        np.sum((full_task_weights[0] - full_task_weights[1])**2)))

    def get_random_sample(start, stop, size):
        indices_set = set()
        while len(indices_set) < size:
            indices_set.add(np.random.randint(start, stop))
        return np.array(list(indices_set))

    # Size of the sample used to compute the error
    sample_size = 10000

    for i in xrange(40):
        # Update the BNN network weights
        network.fit_network(exp_buffer_np, full_task_weights, 0, state_diffs=state_diffs, use_all_exp=True)
        print('finished BNN update ' + str(i))
        if i % 4 == 0:
            # Get a random sample of indices
            sample_indices = get_random_sample(0, X.shape[0], sample_size)
            l2_errors = network.get_td_error(
                np.hstack((X[sample_indices], full_task_weights[inst_indices[sample_indices]])),
                y[sample_indices], location=0.0, scale=1.0, by_dim=False)
            print("After BNN update: iter: {}, Mean Error: {}, Std Error: {}".format(i, np.mean(l2_errors), np.std(l2_errors)))
        # Update latent weights
        for inst in np.random.permutation(batch_generator_hipmdp.instance_count):
            full_task_weights[inst, :] = network.optimize_latent_weighting_stochastic(
                exp_dict[inst], np.atleast_2d(full_task_weights[inst, :]), 0,
                state_diffs=state_diffs, use_all_exp=True)
        print('finished wb update ' + str(i))
        # Compute the error on a sample of transitions
        if i % 4 == 0:
            # Get a random sample of indices
            sample_indices = get_random_sample(0, X.shape[0], sample_size)
            l2_errors = network.get_td_error(
                np.hstack((X[sample_indices], full_task_weights[inst_indices[sample_indices]])),
                y[sample_indices], location=0.0, scale=1.0, by_dim=False)
            print("After Latent update: iter: {}, Mean Error: {}, Std Error: {}".format(i, np.mean(l2_errors), np.std(l2_errors)))
            # Check that the latent weights are sufficiently different between instances,
            # so as to avoid (erroneously) fitting the same dynamics for every instance
            print("L2 Difference in latent weights between instances: {}".format(
                np.sum((full_task_weights[0] - full_task_weights[1])**2)))

    with open("{}/time.txt".format(dir_name), 'a') as f:
        f.write("%.5e" % (time.time() - t_start))

    network_weights = network.weights
    with open('{}/{}_network_weights'.format(dir_name, domain), 'w') as f:
        pickle.dump(network.weights, f)
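
# A minimal invocation sketch (not part of the original script). The config layout is inferred
# from the accesses above; in the real experiments this dictionary is presumably loaded from a
# configuration file, and the values shown are the 2D-domain settings noted in the inline
# comments (2 instances, hidden size 25, 3 hidden layers, learning rate 5e-5, alpha 0.5).
#
#   config = {
#       'main': {'seed': 0, 'domain': '2D', 'dir_name': 'hipmdp_2D_batch'},
#       '2D': {'num_batch_instances': 2, 'bnn_hidden_layer_size': 25,
#              'bnn_num_hidden_layers': 3, 'bnn_learning_rate': 5e-5, 'bnn_alpha': 0.5},
#   }
#   train_function(config)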
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append("..")
import BayesianNeuralNetwork as BNN
import pymc3 as pm

N = 5000
X = np.reshape(np.random.normal(0.0, 1.0, N), [-1, 1])
Y = (X**2.0) + 0.9

nn = BNN.BayesianNeuralNetwork([5, 5], output='normal', inference_method='advi')
nn.fit(X, Y, samples=250, advi_n=15000, advi_obj_optimizer=pm.adam(learning_rate=.01))

y_preds = nn.predict(X)
rmsd = nn.RMSD(X, Y)
print("Root Mean Square deviation: %s" % rmsd)

# get the 10th and 90th percentiles of certainty
y_generated = nn.generate(X, samples=250)
pct10 = np.percentile(y_generated, q=10, axis=0)
pct90 = np.percentile(y_generated, q=90, axis=0)
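
# A minimal visualization sketch (not part of the original script): plot the data, the
# predicted mean, and the 10th-90th percentile band computed above. The exact output shapes
# of nn.predict/nn.generate are assumed here, so np.ravel() is used to flatten them to 1-D.
order = np.argsort(X[:, 0])
plt.scatter(X[:, 0], Y[:, 0], s=2, alpha=0.3, label='data')
plt.plot(X[order, 0], np.ravel(y_preds)[order], color='red', label='predicted mean')
plt.fill_between(X[order, 0], np.ravel(pct10)[order], np.ravel(pct90)[order],
                 alpha=0.3, label='10th-90th percentile band')
plt.legend()
plt.show()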
    'bnn_v_prior': 1.0,
    'bnn_learning_rate': 0.000025,
    'bnn_alpha': 0.5,
    'wb_num_epochs': 1,
    'wb_learning_rate': 0.0005
}

# Initialize latent weights for each instance
full_task_weights = np.random.normal(0., 0.1, (batch_generator_hipmdp.instance_count, num_wb))

# Initialize BNN
network = BayesianNeuralNetwork(param_set, nonlinearity=relu)

# Compute error before training
l2_errors = network.get_td_error(np.hstack((X, full_task_weights[inst_indices])), y,
                                 location=0.0, scale=1.0, by_dim=False)
print("Before training: Mean Error: {}, Std Error: {}".format(np.mean(l2_errors), np.std(l2_errors)))
print("L2 Difference in latent weights between instances: {}".format(
    np.sum((full_task_weights[0] - full_task_weights[1])**2)))