def test_combine_first_dimensions(self):
    x = np.random.randint(100, size=(3, 5, 7, 9))
    result = combine_first_dimensions(x)
    assert result.shape == (15, 7, 9)
    assert np.all(result[0] == x[0, 0])
    assert np.all(result[6] == x[1, 1])
    assert np.all(result[7] == x[1, 2])

    x = np.random.randint(10, size=(3, 5))
    assert np.all(combine_first_dimensions(x) == x.flatten())
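# A minimal sketch of combine_first_dimensions that is consistent with the test above;
# the repository's actual implementation may differ. It merges the first two dimensions,
# e.g. (n_envs, n_steps, ...) -> (n_envs * n_steps, ...).
def combine_first_dimensions(x):
    first_dim = x.shape[0] * x.shape[1]
    return x.reshape(first_dim, *x.shape[2:])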
def train(self, n_step_rewards, mb_obs_combined, mb_actions_combined):
    feed_dict = {
        self.ph_value_target: n_step_rewards,
        self.ph_selected_spatial_action: mb_actions_combined["spatial_action"],
        self.ph_selected_action_id: mb_actions_combined["action_id"],
        self.ph_is_spatial_action_available: mb_actions_combined["is_spatial_action_available"]
    }
    feed_dict.update(self.obs_to_feeddict(mb_obs_combined))

    # Treat each timestep as a separate observation, so batch_size becomes batch_size * timesteps.
    feed_dict = {k: combine_first_dimensions(v) for k, v in feed_dict.items()}

    ops = [self.train_op]

    write_all_summaries = (
        (self.train_step % self.all_summary_freq == 0)
        and self.summary_path is not None
    )
    write_scalar_summaries = (
        (self.train_step % self.scalar_summary_freq == 0)
        and self.summary_path is not None
    )

    if write_all_summaries:
        ops.append(self.all_summary_op)
    elif write_scalar_summaries:
        ops.append(self.scalar_summary_op)

    r = self.sess.run(ops, feed_dict)

    if write_all_summaries or write_scalar_summaries:
        self.summary_writer.add_summary(r[-1], global_step=self.train_step)

    self.train_step += 1
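# Hedged shape illustration of the combine step above (the sizes are assumed for the
# example, not taken from the repository): every feed-dict value of shape
# (batch, time, ...) collapses to (batch * time, ...).
import numpy as np

example = np.zeros((2, 8, 64, 64), dtype=np.float32)   # hypothetical (n_envs, n_steps, H, W)
assert combine_first_dimensions(example).shape == (16, 64, 64)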
def run_batch(self):
    mb_actions = []
    mb_obs = []
    mb_values = np.zeros((self.envs.n_envs, self.n_steps + 1), dtype=np.float32)
    mb_rewards = np.zeros((self.envs.n_envs, self.n_steps), dtype=np.float32)

    latest_obs = self.latest_obs  # state(t0)
    rnn_state = self.agent.state_init

    for n in range(self.n_steps):
        action_ids, spatial_action_2ds, value_estimate, rnn_state = self.agent.step(
            latest_obs, rnn_state)
        # print('step: ', n, action_ids, spatial_action_2ds, value_estimate)  # for debugging

        # Store actions and value estimates for all steps (this is done in parallel for all envs).
        mb_values[:, n] = value_estimate
        mb_obs.append(latest_obs)
        mb_actions.append((action_ids, spatial_action_2ds))

        # Do the action, return it to the environment, get the new obs and reward, store the reward.
        actions_pp = self.action_processer.process(action_ids, spatial_action_2ds)
        obs_raw = self.envs.step(actions_pp)
        latest_obs = self.obs_processer.process(obs_raw)  # state(t+1)
        mb_rewards[:, n] = [t.reward for t in obs_raw]

        # Check for each env whether the last state has been reached (episode end).
        for t in obs_raw:
            if t.last():
                self._handle_episode_end(t)

    # Get V(t+1) and use it as the future reward from s(t+1), since we don't know the
    # actual return (bootstrapping here with the network's predictions).
    mb_values[:, -1] = self.agent.get_value(latest_obs, rnn_state)

    n_step_advantage = general_n_step_advantage(mb_rewards, mb_values, self.discount, lambda_par=1.0)

    full_input = {
        # These are transposed because action/obs processers return
        # [time, env, ...] shaped arrays.
        FEATURE_KEYS.advantage: n_step_advantage.transpose(),
        FEATURE_KEYS.value_target: (n_step_advantage + mb_values[:, :-1]).transpose()
    }
    # Combine all experiences from every env.
    full_input.update(self.action_processer.combine_batch(mb_actions))
    full_input.update(self.obs_processer.combine_batch(mb_obs))
    full_input = {k: combine_first_dimensions(v) for k, v in full_input.items()}

    if not self.do_training:
        pass
    else:
        # You might want to reset the state between the forward and backward pass.
        self.agent.train(full_input, rnn_state)

    self.latest_obs = latest_obs  # state(t) = state(t+1)
    self.batch_counter += 1
    print('Batch %d finished' % self.batch_counter)
    sys.stdout.flush()
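# Hedged sketch of general_n_step_advantage as it is used above (the repository's
# implementation may differ): generalized advantage estimation over [env, step]
# arrays, where `values` has one extra column holding the bootstrap value V(s_{t+n}).
import numpy as np

def general_n_step_advantage(rewards, values, discount, lambda_par=1.0):
    n_envs, n_steps = rewards.shape
    advantage = np.zeros((n_envs, n_steps), dtype=np.float32)
    gae = np.zeros(n_envs, dtype=np.float32)
    for t in reversed(range(n_steps)):
        delta = rewards[:, t] + discount * values[:, t + 1] - values[:, t]
        gae = delta + discount * lambda_par * gae
        advantage[:, t] = gae
    return advantage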
def run_batch(self):
    mb_actions = []
    mb_obs = []
    mb_values = np.zeros((self.envs.n_envs, self.n_steps + 1), dtype=np.float32)
    mb_rewards = np.zeros((self.envs.n_envs, self.n_steps), dtype=np.float32)
    mb_rewards_modified = np.zeros((self.envs.n_envs, self.n_steps), dtype=np.float32)

    latest_obs = self.latest_obs

    for n in range(self.n_steps):
        # We could calculate the value estimate from obs when doing training,
        # but saving values here makes the n-step reward calculation a bit easier.
        action_ids, spatial_action_2ds, value_estimate = self.agent.step(latest_obs)

        mb_values[:, n] = value_estimate
        mb_obs.append(latest_obs)
        mb_actions.append((action_ids, spatial_action_2ds))

        actions_pp = self.action_processer.process(action_ids, spatial_action_2ds)
        obs_raw = self.envs.step(actions_pp)
        latest_obs = self.obs_processer.process(obs_raw)
        mb_rewards[:, n] = [t.reward for t in obs_raw]

        # NEW: distance-based reward shaping.
        i = 0
        last_dist = self.last_min_dist_to_enemy
        # print(last_dist)
        curr_dist = min_distance_to_enemy(obs_raw[0], minimap=True)
        # print(curr_dist)
        if last_dist < INF and curr_dist < INF:
            mb_rewards_modified[:, n] = [
                t.reward + (last_dist - curr_dist) / 20 for t in obs_raw
            ]
        self.last_min_dist_to_enemy = curr_dist
        ###

        for t in obs_raw:
            if t.last():
                self._handle_episode_end(t)

    mb_values[:, -1] = self.agent.get_value(latest_obs)

    n_step_advantage = general_n_step_advantage(
        mb_rewards,
        mb_rewards_modified,
        mb_values,
        self.discount,
        lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0)

    full_input = {
        # These are transposed because action/obs processers return
        # [time, env, ...] shaped arrays.
        FEATURE_KEYS.advantage: n_step_advantage.transpose(),
        FEATURE_KEYS.value_target: (n_step_advantage + mb_values[:, :-1]).transpose()
    }
    full_input.update(self.action_processer.combine_batch(mb_actions))
    full_input.update(self.obs_processer.combine_batch(mb_obs))
    full_input = {k: combine_first_dimensions(v) for k, v in full_input.items()}

    if not self.do_training:
        pass
    elif self.agent.mode == ACMode.A2C:
        self.agent.train(full_input)
    elif self.agent.mode == ACMode.PPO:
        for epoch in range(self.ppo_par.n_epochs):
            self._train_ppo_epoch(full_input)
        self.agent.update_theta()

    self.latest_obs = latest_obs
    self.batch_counter += 1
    sys.stdout.flush()
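# Hedged worked example of the shaping term above (the distances are assumed, not
# taken from the repository): moving closer to the nearest enemy adds a small bonus,
# moving away subtracts one.
last_dist_example, curr_dist_example = 40.0, 30.0   # hypothetical minimum distances to enemy
shaping_bonus = (last_dist_example - curr_dist_example) / 20
assert shaping_bonus == 0.5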
def run_batch(self):  # (MINE) MAIN LOOP!!!
    mb_actions = []
    mb_obs = []
    mb_values = np.zeros((self.envs.num_envs, self.n_steps + 1), dtype=np.float32)
    mb_rewards = np.zeros((self.envs.num_envs, self.n_steps), dtype=np.float32)
    mb_done = np.zeros((self.envs.num_envs, self.n_steps), dtype=np.int32)

    latest_obs = self.latest_obs  # (MINE) = state(t)

    for n in range(self.n_steps):
        # We could calculate the value estimate from obs when doing training,
        # but saving values here makes the n-step reward calculation a bit easier.
        action_ids, value_estimate = self.agent.step(latest_obs)
        # (MINE) If you put this print after envs.step, the SUCCESS message appears at the
        # envs.step call, so the output looks odd.
        print('|step:', n, '|actions:', action_ids)

        # (MINE) Store actions and value estimates for all steps.
        mb_values[:, n] = value_estimate
        mb_obs.append(latest_obs)
        mb_actions.append((action_ids))

        # (MINE) Do the action, return it to the environment, get the new obs and reward, store the reward.
        # actions_pp = self.action_processer.process(action_ids)
        # Actions have changed, need to check:
        # BEFORE: actions.FunctionCall(actions.FUNCTIONS.no_op.id, [])  NOW: actions.FUNCTIONS.no_op()
        obs_raw = self.envs.step(action_ids)
        # obs_raw.reward = reward
        # (MINE) = state(t+1). Processes all inputs/obs from all timesteps (and envs).
        latest_obs = self.obs_processer.process(obs_raw[0])  # For obs_raw as a tuple!
        # print('-->|rewards:', np.round(np.mean(obs_raw[1]), 3))
        mb_rewards[:, n] = [t for t in obs_raw[1]]
        mb_done[:, n] = [t for t in obs_raw[2]]

        # Check each t (timestep/observation) in obs_raw whose done flag is True, meaning it is the last state.
        # IF MAX_STEPS OR GOAL REACHED. You can use obs_raw[4], which is success or failure, in the same way.
        # print(obs_raw[2])
        indx = 0
        for t in obs_raw[2]:
            if t == True:  # done = True
                # Put the reward in the scores.
                epis_reward = obs_raw[3][indx]['episode']['r']
                epis_length = obs_raw[3][indx]['episode']['l']
                last_step_r = obs_raw[1][indx]
                # The score printing is NOT a parallel process, apparently, as every reward (t) is fed in independently.
                self._handle_episode_end(epis_reward, epis_length, last_step_r)
            indx = indx + 1  # finished envs count
        # for t in obs_raw:
        #     if t.last():
        #         self._handle_episode_end(t)
        # print(">> Avg. Reward:", np.round(np.mean(mb_rewards), 3))

    # We bootstrap from the last step if it is not terminal (although no check is done here).
    mb_values[:, -1] = self.agent.get_value(latest_obs)

    n_step_advantage = general_n_step_advantage(
        mb_rewards,
        mb_values,
        self.discount,
        mb_done,
        lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0)

    full_input = {
        # These are transposed because action/obs processers return
        # [time, env, ...] shaped arrays.
        FEATURE_KEYS.advantage: n_step_advantage.transpose(),
        # Adding the value to the advantage gives the target for the value function training.
        # Check OneNote in APL-virtual.
        FEATURE_KEYS.value_target: (n_step_advantage + mb_values[:, :-1]).transpose()
    }
    # (MINE) Combine all experiences from every worker below.
    full_input.update(self.action_processer.combine_batch(mb_actions))
    full_input.update(self.obs_processer.combine_batch(mb_obs))
    full_input = {k: combine_first_dimensions(v) for k, v in full_input.items()}

    if not self.do_training:
        pass
    elif self.agent.mode == ACMode.A2C:
        self.agent.train(full_input)
    elif self.agent.mode == ACMode.PPO:
        for epoch in range(self.ppo_par.n_epochs):
            self._train_ppo_epoch(full_input)
        self.agent.update_theta()

    self.latest_obs = latest_obs
    # batch_counter is used only for printing; the outer while loop controls the number of batches.
    self.batch_counter += 1
    print('Batch %d finished' % self.batch_counter)
    sys.stdout.flush()
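# Hedged illustration of the Monitor-style episode-end extraction used above (values and
# structure assumed for the example; the exact wrapper may differ): obs_raw unpacks as
# (observations, rewards, dones, infos), and a finished env carries info['episode'] with
# the episode reward 'r' and length 'l'.
import numpy as np

example_dones = np.array([False, True])                       # hypothetical done flags
example_infos = [{}, {'episode': {'r': 1.0, 'l': 57}}]        # hypothetical Monitor infos
for env_idx in np.argwhere(example_dones).flatten():
    stats = example_infos[env_idx]['episode']
    print('episode reward %.1f over %d steps' % (stats['r'], stats['l']))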
def run_batch(self):
    mb_actions = []
    mb_obs = []
    mb_values = np.zeros((self.envs.n_envs, self.n_steps + 1), dtype=np.float32)
    mb_rewards = np.zeros((self.envs.n_envs, self.n_steps), dtype=np.float32)
    self.sprint.print(["values,rewards=", mb_values.shape, mb_rewards.shape])

    latest_obs = self.latest_obs

    for n in range(self.n_steps):
        # We could calculate the value estimate from obs when doing training,
        # but saving values here makes the n-step reward calculation a bit easier.
        action_ids, spatial_action_2ds, value_estimate = self.agent.step(latest_obs)

        mb_values[:, n] = value_estimate
        mb_obs.append(latest_obs)
        mb_actions.append((action_ids, spatial_action_2ds))

        actions_pp = self.action_processer.process(action_ids, spatial_action_2ds)
        self.aprint.print(["test act_print", action_ids, spatial_action_2ds, actions_pp])

        obs_raw = self.envs.step(actions_pp)
        latest_obs = self.obs_processer.process(obs_raw)
        self.oprint.print([
            "test obs_print", [(k, v.shape) for k, v in latest_obs.items()]
        ])

        mb_rewards[:, n] = [t.reward for t in obs_raw]

        for t in obs_raw:
            if t.last():
                self._handle_episode_end(t)

    mb_values[:, -1] = self.agent.get_value(latest_obs)

    n_step_advantage = general_n_step_advantage(
        mb_rewards,
        mb_values,
        self.discount,
        lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0)

    full_input = {
        # These are transposed because action/obs processers return
        # [time, env, ...] shaped arrays.
        FEATURE_KEYS.advantage: n_step_advantage.transpose(),
        FEATURE_KEYS.value_target: (n_step_advantage + mb_values[:, :-1]).transpose()
    }
    full_input.update(self.action_processer.combine_batch(mb_actions))
    full_input.update(self.obs_processer.combine_batch(mb_obs))
    full_input = {k: combine_first_dimensions(v) for k, v in full_input.items()}
    self.batch_print.print(["full_input=", [(k, v.shape) for (k, v) in full_input.items()]])

    if not self.do_training:
        pass
    elif self.agent.mode == ACMode.A2C:
        self.agent.train(full_input)
    elif self.agent.mode == ACMode.PPO:
        for epoch in range(self.ppo_par.n_epochs):
            self._train_ppo_epoch(full_input)
        self.agent.update_theta()

    self.latest_obs = latest_obs
    self.batch_counter += 1
    sys.stdout.flush()
def run_batch(self): """run_batch Run a batch of the training, building up a list of actions, observations, values of those actions and the rewards given. """ if self.number_episodes == self.episode_counter: print("Max number episodes reached. Quitting.") return False # Define variables to store the actions, observations, values and rewards in. mb_actions = [] mb_obs = [] mb_values = np.zeros((self.envs.n_envs, self.n_steps + 1), dtype=np.float32) mb_rewards = np.zeros((self.envs.n_envs, self.n_steps), dtype=np.float32) latest_obs = self.latest_obs # For the number of steps, save the relevant data each step. # When finished, deal with the episode end. for n_step in range(self.n_steps): # Save the value estimate here, to make the n step reward calculation easier. action_id, spatial_action_2d, value_estimate = self.agent.step(latest_obs) mb_values[:, n_step] = value_estimate mb_obs.append(latest_obs) mb_actions.append((action_id, spatial_action_2d)) actions_pp = self.action_processor.process(action_id, spatial_action_2d) obs_raw = self.envs.step(actions_pp) latest_obs = self.obs_processor.process(obs_raw) mb_rewards[:, n_step] = [t.reward for t in obs_raw] for timestep in obs_raw: if timestep.last(): self._handle_episode_end(timestep) mb_values[:, -1] = self.agent.get_value(latest_obs) n_step_advantage = general_n_step_advantage( mb_rewards, mb_values, self.discount, lambda_par=1.0 ) full_input = { # These are transposed because action/obs # processors return [time, env, ...] shaped arrays. FEATURE_KEYS.advantage: n_step_advantage.transpose(), FEATURE_KEYS.value_target: (n_step_advantage + mb_values[:, :-1]).transpose() } full_input.update(self.action_processor.combine_batch(mb_actions)) full_input.update(self.obs_processor.combine_batch(mb_obs)) full_input = {k: combine_first_dimensions(v) for k, v in full_input.items()} if not self.do_training: pass else: self.agent.train(full_input) self.latest_obs = latest_obs self.batch_counter += 1 sys.stdout.flush() return True
def run_meta_batch(self):
    mb_actions = []
    mb_obs = []
    mb_rnns = []
    mb_values = np.zeros((self.envs.num_envs, self.n_steps + 1), dtype=np.float32)
    mb_rewards = np.zeros((self.envs.num_envs, self.n_steps), dtype=np.float32)  # n x d array (ndarray)
    mb_done = np.zeros((self.envs.num_envs, self.n_steps), dtype=np.int32)  # EVERYTHING HAPPENS IN PARALLEL!!!
    r_ = np.zeros((self.envs.num_envs, 1), dtype=np.float32)  # Instead of 1 you might use n_steps
    a_ = np.zeros((self.envs.num_envs), dtype=np.int32)

    latest_obs = self.latest_obs  # (MINE) = state(t)
    # rnn_state = self.agent.theta.state_init
    rnn_state = self.agent.theta.state_init

    for n in range(self.n_steps):
        # step_recurrent automatically returns [num_envs, out_x] for each output out_x you want.
        action_ids, value_estimate, rnn_state_new = self.agent.step_recurrent(latest_obs, rnn_state, r_, a_)
        # print('|step:', n, '|actions:', action_ids)

        # (MINE) Store actions and value estimates for all steps.
        mb_values[:, n] = value_estimate
        mb_obs.append(latest_obs)
        mb_actions.append((action_ids))

        # (MINE) Do the action, return it to the environment, get the new obs and reward, store the reward.
        obs_raw = self.envs.step(action_ids)
        # (MINE) = state(t+1). Processes all inputs/obs from all timesteps (and envs).
        latest_obs = self.obs_processer.process(obs_raw[0])  # For obs_raw as a tuple!
        mb_rnns.append(rnn_state)
        rnn_state = rnn_state_new
        r_ = obs_raw[1]  # (nenvs,) but you need (nenvs, 1)
        r_ = np.reshape(r_, [self.envs.num_envs, 1])  # enters the recurrency as [nenvs, 1]; the 1 might be used as the timestep
        a_ = action_ids
        mb_rewards[:, n] = [t for t in obs_raw[1]]
        mb_done[:, n] = [t for t in obs_raw[2]]

        # Shouldn't this part below be OUT of the n-step loop? NO: you check whether done=True
        # and extract the additional info that Monitor outputs.
        indx = 0  # env count
        for t in obs_raw[2]:
            # Monitor returns additional stuff such as epis_reward and epis_length apart from obs, r, done, info.
            # obs_raw[2] = done = [True, False, False, True, ...]; each element corresponds to an env (the index gives the env).
            if t == True:  # done = True
                # Put the reward in the scores.
                epis_reward = obs_raw[3][indx]['episode']['r']
                epis_length = obs_raw[3][indx]['episode']['l']  # EPISODE LENGTH HERE!!!
                last_step_r = obs_raw[1][indx]
                # The score printing is NOT a parallel process, apparently, as every reward (t) is fed in independently.
                self._handle_episode_end(epis_reward, epis_length, last_step_r)
                # Here you have to reset the rnn_state of that env (rnn_state has h and c, EACH with dims [batch_size x hidden dims]).
                rnn_state[0][indx] = np.zeros(256)
                rnn_state[1][indx] = np.zeros(256)
                # Reset the relevant r_ and a_.
                r_[indx] = 0
                a_[indx] = 0
            indx = indx + 1  # finished envs count

    # Below: if a row is all zeros then you get n_steps.
    # The last obs s --> s_done do not get into the training!!! This is because s_done --> ? and R = 0.
    mb_l = self.first_nonzero(mb_done, axis=1)
    # Substitute 0s with 1, declaring 1 step.
    # mb_l[mb_l == 0] = 1
    for b in range(mb_l.shape[0]):
        if mb_l[b] < self.n_steps:
            mb_l[b] = mb_l[b] + 1  # You start stepping from step 0 to 1.
    mask = ~(np.ones(mb_rewards.shape).cumsum(axis=1).T > mb_l).T
    mb_rewards = mb_rewards * mask

    # Below: r + gamma*V(s')*(1-done) - V(s), with V(s') = mb_values[:, -1]. If the episode is done we don't
    # care whether we use the last n-step V or the actual V(s_done+1), as the whole term will be zero.
    # From V(nstep) estimate the expected reward. What if we finished the sequence earlier, though?
    # This is for the last obs, which you do not store. All other values that will be used as targets are available.
    # mb_values[:, -1] = self.agent.get_recurrent_value(latest_obs, rnn_state, r_, a_)
    # Put at the last slot the estimated future expected reward to bootstrap V after the n steps.
    mask_v = ~(np.ones(mb_values.shape).cumsum(axis=1).T > mb_l).T
    mb_values = mb_values * mask_v
    # Take the vector of values after n steps and use ONLY the valid ones in the loop below.
    vec = self.agent.get_recurrent_value(latest_obs, rnn_state, r_, a_)
    for b in range(mb_l.shape[0]):
        if mb_l[b] == self.n_steps:
            # If the sequence didn't end within n steps, add the value so the term gamma*V(t+1)*(1-done) is not 0.
            mb_values[b, -1] = vec[b]

    # Mask below the values that enter the n-step advantage.
    n_step_advantage = general_nstep_adv_sequential(
        mb_rewards,
        mb_values,
        self.discount,
        mb_done,
        lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0,
        nenvs=self.n_envs,
        maxsteps=self.n_steps
    )
    # n_step_advantage = general_n_step_advantage(
    #     mb_rewards,
    #     mb_values,
    #     self.discount,
    #     mb_done,
    #     lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0
    # )

    # prev_rewards = [0] + mb_rewards[:, :-1]  # .tolist()  # drop the last reward and prepend 0
    prev_rewards = np.c_[np.zeros((self.envs.num_envs, 1), dtype=np.float32), mb_rewards[:, :-1]]
    # Below we add one zero action element and drop a_t, so we get a_{0:t-1}.
    # You probably have to pad this to have equal lengths with your data in terms of n steps.
    prev_actions = [np.zeros((self.envs.num_envs), dtype=np.int32)] + mb_actions[:-1]

    full_input = {
        FEATURE_KEYS.advantage: n_step_advantage.transpose(),
        FEATURE_KEYS.value_target: (n_step_advantage + mb_values[:, :-1]).transpose()  # NETWORK TARGET (A + V = R)
    }
    full_input.update(self.action_processer.combine_batch(mb_actions))
    full_input.update(self.obs_processer.combine_batch(mb_obs))
    # full_input.update(self.action_processer.combine_batch(prev_actions))  # THIS FEEDS AGAIN THE SELECTED_ID_ACTION PLACEHOLDER!!!!
    # full_input.update(self.action_processer.combine_batch(prev_rewards))  # THIS FEEDS AGAIN THE SELECTED_ID_ACTION PLACEHOLDER!!!!

    # Below: [maxsteps x batch] --> [batch x maxsteps].
    # The obs should first be transposed and then combined, else you lose the time order.
    full_input = {k: np.swapaxes(v, 0, 1) for k, v in full_input.items()}
    full_input = {k: combine_first_dimensions(v) for k, v in full_input.items()}  # YOU CAN COMMENT THIS AND UNCOMMENT BELOW
    # full_input = {k: np.swapaxes(v, 0, 1) for k, v in full_input.items()}

    if not self.do_training:
        pass
    elif self.agent.mode == ACMode.A2C:
        if self.policy_type == MetaPolicy:
            self.agent.train_recurrent(full_input, mb_l, prev_rewards, prev_actions)
        else:
            self.agent.train(full_input)
    elif self.agent.mode == ACMode.PPO:
        for epoch in range(self.ppo_par.n_epochs):
            self._train_ppo_epoch(full_input)
        self.agent.update_theta()

    self.latest_obs = latest_obs
    self.batch_counter += 1
    print('Batch %d finished' % self.batch_counter)
    sys.stdout.flush()
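# Hedged sketch of the first_nonzero helper used as self.first_nonzero in run_meta_batch
# above (the repository's version may differ): returns, per row, the index of the first
# non-zero entry along `axis`, and the axis length (i.e. n_steps) when a row is all zeros,
# matching the "if all are zeros then you get n_steps" comment.
import numpy as np

def first_nonzero(arr, axis):
    mask = arr != 0
    return np.where(mask.any(axis=axis), mask.argmax(axis=axis), arr.shape[axis])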
def run_factored_batch(self):  # (MINE) MAIN LOOP!!!
    # The reset happens through Monitor (except the very first one of the first batch, which is in run_agent).
    mb_actions = []
    mb_obs = []
    mb_values = np.zeros((self.envs.num_envs, self.n_steps + 1), dtype=np.float32)
    mb_values_goal = np.zeros((self.envs.num_envs, self.n_steps + 1), dtype=np.float32)
    mb_values_fire = np.zeros((self.envs.num_envs, self.n_steps + 1), dtype=np.float32)
    mb_rewards = np.zeros((self.envs.num_envs, self.n_steps), dtype=np.float32)
    mb_rewards_goal = np.zeros((self.envs.num_envs, self.n_steps), dtype=np.float32)
    mb_rewards_fire = np.zeros((self.envs.num_envs, self.n_steps), dtype=np.float32)
    mb_done = np.zeros((self.envs.num_envs, self.n_steps), dtype=np.int32)

    latest_obs = self.latest_obs  # (MINE) = state(t)

    for n in range(self.n_steps):
        # We could calculate the value estimate from obs when doing training,
        # but saving values here makes the n-step reward calculation a bit easier.
        action_ids, value_estimate_goal, value_estimate_fire, value_estimate = self.agent.step_factored(latest_obs)
        # print('|step:', n, '|actions:', action_ids)
        # (MINE) If you put the print after envs.step, the SUCCESS message appears at the envs.step call, so it looks odd.

        if np.random.random() < 0.20:
            action_ids = np.array([self.envs.action_space.sample() for _ in range(self.envs.num_envs)])

        # (MINE) Store actions and value estimates for all steps.
        mb_values[:, n] = value_estimate
        mb_values_goal[:, n] = value_estimate_goal
        mb_values_fire[:, n] = value_estimate_fire
        mb_obs.append(latest_obs)
        mb_actions.append((action_ids))

        # (MINE) Do the action, return it to the environment, get the new obs and reward, store the reward.
        # actions_pp = self.action_processer.process(action_ids)
        # Actions have changed, need to check:
        # BEFORE: actions.FunctionCall(actions.FUNCTIONS.no_op.id, [])  NOW: actions.FUNCTIONS.no_op()
        obs_raw = self.envs.step(action_ids)
        # obs_raw.reward = reward
        # (MINE) = state(t+1). Processes all inputs/obs from all timesteps (and envs).
        latest_obs = self.obs_processer.process(obs_raw[0])  # For obs_raw as a tuple!
        # print('-->|rewards:', np.round(np.mean(obs_raw[1]), 3))
        mb_rewards[:, n] = [t for t in obs_raw[1]]
        temp = [t for t in obs_raw[3]]
        mb_rewards_goal[:, n] = [d['goal'] for d in temp]
        mb_rewards_fire[:, n] = [d['fire'] for d in temp]
        mb_done[:, n] = [t for t in obs_raw[2]]

        # IF MAX_STEPS OR GOAL REACHED
        if obs_raw[2].any():  # At least one env has done=True
            for i in np.argwhere(obs_raw[2]):  # Run the loop ONLY for the envs that have finished.
                indx = i[0]
                epis_reward = obs_raw[3][indx]['episode']['r']
                epis_length = obs_raw[3][indx]['episode']['l']
                last_step_r = obs_raw[1][indx]
                self._handle_episode_end(epis_reward, epis_length, last_step_r)
        # indx = 0  # env count
        # for t in obs_raw[2]:
        #     # Monitor returns additional stuff such as epis_reward and epis_length apart from obs, r, done, info.
        #     # obs_raw[2] = done = [True, False, False, True, ...]; each element corresponds to an env.
        #     if t == True:  # done = True
        #         # Put the reward in the scores.
        #         epis_reward = obs_raw[3][indx]['episode']['r']
        #         epis_length = obs_raw[3][indx]['episode']['l']
        #         last_step_r = obs_raw[1][indx]
        #         # The score printing is NOT a parallel process, apparently, as every reward (t) is fed in independently.
        #         self._handle_episode_end(epis_reward, epis_length, last_step_r)
        #     indx = indx + 1  # finished envs count
        # for t in obs_raw:
        #     if t.last():
        #         self._handle_episode_end(t)
        # print(">> Avg. Reward:", np.round(np.mean(mb_rewards), 3))

    # We bootstrap from the last step, if not terminal (although no check is done here).
    mb_values_goal[:, -1], mb_values_fire[:, -1], mb_values[:, -1] = self.agent.get_factored_value(latest_obs)

    # R, G = calc_rewards(self, mb_rewards, mb_values[:, :-1], mb_values[:, 1:], mb_done, self.discount)
    # R = R.reshape((self.n_envs, self.n_steps))  # self.n_envs returns 1, self.envs.num_envs
    # G = G.reshape((self.n_envs, self.n_steps))
    # The last value is replaced with a bootstrap value, e.g. s_t --> V(s_t), s_{lastnstep} --> V(s_{lastnstep}), s_{lastnstep+1},
    # and we replace V(s_{lastnstep}) with V(s_{lastnstep+1}).
    n_step_advantage_goal = general_nstep_adv_sequential(
        mb_rewards_goal,
        mb_values_goal,
        self.discount,
        mb_done,
        lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0,
        nenvs=self.n_envs,
        maxsteps=self.n_steps
    )
    n_step_advantage_fire = general_nstep_adv_sequential(
        mb_rewards_fire,
        mb_values_fire,
        self.discount,
        mb_done,
        lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0,
        nenvs=self.n_envs,
        maxsteps=self.n_steps
    )
    # n_step_advantage = n_step_advantage_goal + n_step_advantage_fire
    n_step_advantage = general_nstep_adv_sequential(
        mb_rewards,
        mb_values,
        self.discount,
        mb_done,
        lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0,
        nenvs=self.n_envs,
        maxsteps=self.n_steps
    )
    # disc_ret = calculate_n_step_reward(
    #     mb_rewards,
    #     self.discount,
    #     mb_values,
    # )

    full_input = {
        # These are transposed because action/obs processers return [time, env, ...] shaped arrays.
        # CHECK THIS OUT!!! YOU ALREADY HAVE THE TIMESTEP DIM!!!
        FEATURE_KEYS.advantage: n_step_advantage.transpose(),  # G.transpose()
        # Adding the value (we left out the last element) to the advantage gives the target for the
        # value function training. Check OneNote in cmu_gym/Next Steps.
        FEATURE_KEYS.value_target: (n_step_advantage + mb_values[:, :-1]).transpose(),
        FEATURE_KEYS.value_target_goal: (n_step_advantage_goal + mb_values_goal[:, :-1]).transpose(),  # R.transpose()
        FEATURE_KEYS.value_target_fire: (n_step_advantage_fire + mb_values_fire[:, :-1]).transpose()
    }
    # full_input = {
    #     FEATURE_KEYS.advantage: n_step_advantage.transpose(),
    #     FEATURE_KEYS.value_target: (n_step_advantage + mb_values[:, :-1]).transpose()
    # }

    # (MINE) Combine all experiences from every worker below.
    full_input.update(self.action_processer.combine_batch(mb_actions))
    full_input.update(self.obs_processer.combine_batch(mb_obs))
    # Comment the line below out for LSTM. The function takes nsteps x nenvs x [dims] and combines them into nsteps*nenvs x [dims].
    full_input = {k: combine_first_dimensions(v) for k, v in full_input.items()}

    if not self.do_training:
        pass
    elif self.agent.mode == ACMode.A2C:
        self.agent.train(full_input)
    elif self.agent.mode == ACMode.PPO:
        for epoch in range(self.ppo_par.n_epochs):
            self._train_ppo_epoch(full_input)
        self.agent.update_theta()

    self.latest_obs = latest_obs
    # batch_counter is used only for printing; the outer while loop controls the number of batches.
    self.batch_counter += 1
    print('Batch %d finished' % self.batch_counter)
    sys.stdout.flush()