class Trainer(QtDisplay):
    """
    Inherits all the UI elements from QtDisplay and contains the PPO
    training algorithm.
    """

    def __init__(self):
        """
        Notes:
            Dependencies are created from the settings file so the trainer
            can be reinitialized.
        """
        super(Trainer, self).__init__()
        self.setWindowTitle('OpenAI gym GUI')
        self.updateTimer = None
        self.startTimer = time.time()
        self.images = []

        # Placeholders, populated by initializeEnv()
        self.scaler = None
        self.env = None
        self.obs = None
        self.actions = None
        self.valueFunction = None
        self.policy = None
        self.trajectories = []
        self.policyLoss = None
        self.episode = None
        self.mean_reward = None
        self.sums = None
        self.mean_actions = None
        self.done = True
        self.testState = None  # checked by test() before the env is reset
        self.observes, self.rewards, self.unscaled_obs = None, None, None
        self.step = 0
        self.discrete = False

    def initializeEnv(self):
        """
        Initializes the actor and critic neural networks and the variables
        related to training. Can be called again to reset the networks to
        their original state.
        """
        # Set the random seed
        self.statusBox.setText('Creating environment...')
        s = self.parameters['Learning']['random_seed']
        from random import seed
        if s != 0:
            seed(s)
            tf.random.set_random_seed(s)

        # Create the environment: try the gym registry first, then fall
        # back to the project's own environments
        envName = self.envSelectionDropdown.currentText().strip()
        try:
            self.env = gym.make(envName)
        except Exception:
            import rl
            from rl.baselines import get_parameters
            config = get_parameters(envName)
            self.env = getattr(rl.environments, envName)(config=config)

        # Show the environment window if rendering is supported
        try:
            self.env.render(mode="human")
        except Exception:
            pass
        self.env.reset()
        self.done = False
        self.gamma = self.parameters['Learning']['gamma']
        self.lam = self.parameters['Learning']['lambda']
        self.policy_logvar = self.parameters['Learning']['log_variance']
        self.trajectories = []
        self.obs = self.env.observation_space.shape[0]
        try:
            # Continuous action space
            self.actions = self.env.action_space.shape[0]
            self.actionWidget.setYRange(self.env.action_space.low[0] - .4,
                                        self.env.action_space.high[0] + .4)
        except Exception:
            # Discrete action space
            self.actions = self.env.action_space.n
            self.discrete = True

        # List of deques used for averaging out the outputs of the actor
        # network during training
        self.testAction = [deque(maxlen=5) for _ in range(self.actions)]

        self.valueFunction = NNValueFunction(self.obs, self.actions,
                                             self.parameters['Learning'],
                                             self.parameters['Networks'])
        self.policy = Policy(self.obs, self.actions,
                             self.parameters['Learning'],
                             self.parameters['Networks'], self.policy_logvar)
        self.policyLoss = [0]
        self.episode = 0
        self.mean_reward = []
        self.sums = 0.0
        self.mean_actions = np.zeros(
            [self.parameters['Learning']['batch_size'], 3])
        self.scaler = Scaler(self.env.observation_space.shape[0])
        self.observes, self.rewards, self.unscaled_obs = None, None, None
        self.step = 0
        self.statusBox.setText('Created {} environment.'.format(envName))
        self.buttonStatus('initialized')

    def test(self):
        """
        Test loop: runs the current policy one step at a time and shows
        its performance.
        """
        if self.done or self.testState is None:
            self.testState = self.env.reset()
        scale, offset = self.scaler.get()

        # Scale the observation the same way as during training
        obs = self.testState.astype(np.float32).reshape((1, -1))
        obs = (obs - offset) * scale
        action = self.policy.sample(obs).reshape((1, -1)).astype(np.float32)
        self.testState, _, self.done, _ = self.env.step(
            np.squeeze(action, axis=0))

        # Registered gym environments render to their own window; custom
        # environments are drawn inside the GUI instead
        if 'EnvSpec' in str(self.env.spec):
            if self.recording:
                image = self.env.render('rgb_array')
                self.images.append(image)
                print('Recording...')
            else:
                self.env.render()
        else:
            try:
                self.updateImage()
            except Exception:
                pass

        # Update the action plot with the smoothed actions
        action = list(np.squeeze(action, axis=0))
        for i in range(self.actions):
            self.testAction[i].append(action[i])
        self.updateActions(
            [np.mean(self.testAction[x]) for x in range(self.actions)])
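    # Standalone sketch of the smoothing used above (illustrative only;
    # `history` is a made-up name, not part of the trainer): each deque in
    # self.testAction keeps the five most recent values of one action
    # dimension, so the plot shows a moving average over at most five steps.
    #
    #     from collections import deque
    #     history = deque(maxlen=5)
    #     for a in [0.2, 0.4, 0.9]:      # sampled action values
    #         history.append(a)
    #     smoothed = np.mean(history)    # -> 0.5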
    def train(self):
        """
        Training loop for running a single episode. All gathered data is
        stored in class attributes; the network updates are performed after
        collecting a full batch of episodes.
        """
        if self.updateTimer is None:
            self.updateTimer = time.time()
        obs = self.env.reset()
        # Some environments occasionally return a malformed observation on
        # reset; retry until the shape matches the observation space
        while True:
            if obs.shape[0] != self.env.observation_space.shape[0]:
                obs = self.env.reset()
            else:
                break

        observes, actions, rewards, unscaled_obs = [], [], [], []
        done = False
        step = 0.0
        scale, offset = self.scaler.get()

        # Run a single episode with the current policy
        while not done:
            obs = obs.astype(np.float32).reshape((1, -1))
            unscaled_obs.append(obs)
            obs = (obs - offset) * scale
            observes.append(obs)
            action = self.policy.sample(obs).reshape(
                (1, -1)).astype(np.float32)
            actions.append(action)
            obs, reward, done, _ = self.env.step(np.squeeze(action, axis=0))
            if not isinstance(reward, float):
                reward = np.asarray(reward).item()
            rewards.append(reward)
            step += 1e-3  # step counter, in thousands of steps

        # Store the episode as a trajectory
        self.episode += 1
        trajectory = {
            'observes': np.concatenate(observes),
            'actions': np.concatenate(actions),
            'rewards': np.array(rewards, dtype=np.float64),
            'unscaled_obs': np.concatenate(unscaled_obs)
        }
        self.trajectories.append(trajectory)

        # Update the observation scaler with the episodes collected so far
        # in the current batch
        unscaled = np.concatenate([
            t['unscaled_obs'] for t in
            self.trajectories[:(self.episode - 1) %
                              self.parameters['Learning']['batch_size'] + 1]
        ])
        self.scaler.update(unscaled)

        # Running mean of the episode rewards within the batch
        episode_reward = np.sum(rewards)
        self.sums += episode_reward / self.parameters['Learning']['batch_size']

        # Network updating procedure: update once a full batch of episodes
        # has been collected
        if self.episode > 0 and self.episode % self.parameters['Learning'][
                'batch_size'] == 0:
            self.statusBox.setText('Updating policy network...')
            self.add_value(self.trajectories,
                           self.valueFunction)  # Add estimated values
            self.add_disc_sum_rew(self.trajectories,
                                  self.gamma)  # Discounted sums of rewards
            self.add_gae(self.trajectories, self.gamma,
                         self.lam)  # Calculate advantages
            # Concatenate all episodes into single numpy arrays
            observes, actions, advantages, disc_sum_rew = \
                self.build_train_set(self.trajectories)
            loss = self.policy.update(observes, actions,
                                      advantages)  # Update policy
            self.valueFunction.fit(observes,
                                   disc_sum_rew)  # Update value function
            self.trajectories = []
            self.policyLoss.append(loss)  # Update the policy loss widget

            # Reward plots
            self.mean_reward.append(self.sums)
            nSteps = np.shape(observes)[0]
            print('Updating... Batch size: {}, Steps/s: {}, '
                  'Learning rates (a / c): {}, {}'.format(
                      nSteps, nSteps / (time.time() - self.updateTimer),
                      self.parameters['Learning']['lr_actor'],
                      self.parameters['Learning']['lr_critic']))
            self.updateReward(self.mean_reward)
            self.updateLoss(self.policyLoss)

            # Reset the per-batch statistics
            self.updateTimer = time.time()
            self.sums = 0
            self.mean_actions = np.zeros(
                [self.parameters['Learning']['batch_size'], 3])

        # Draw images
        if self.parameters['Rendering']['draw_images']:
            if self.episode % self.parameters['Rendering']['draw_every'] == 0:
                if 'EnvSpec' in str(self.env.spec):
                    self.env.render()
                else:
                    try:
                        self.updateImage()
                    except Exception:
                        pass

        # Print out a summary
        if self.parameters['Rendering']['print_stats']:
            print('Episode: {}, Reward: {:.1f}, Steps: {}'.format(
                self.episode, episode_reward, int(step * 1000)))
        self.statusBox.setText('Episode {}/{}, Reward: {}'.format(
            self.episode % self.parameters['Learning']['batch_size'] + 1,
            self.parameters['Learning']['batch_size'], self.sums))
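    # Shape sketch of the trajectory dictionaries built in train() above
    # (illustrative only; T is the episode length, and the dimensions assume
    # a hypothetical environment with 4 observations and 1 continuous action):
    #
    #     trajectory = {
    #         'observes':     (T, 4) float32 array,  # scaled observations
    #         'actions':      (T, 1) float32 array,
    #         'rewards':      (T,)   float64 array,
    #         'unscaled_obs': (T, 4) float32 array,
    #     }
    #
    # add_value(), add_disc_sum_rew() and add_gae() below extend each
    # trajectory with 'values', 'disc_sum_rew' and 'advantages' before
    # build_train_set() flattens the whole batch into single arrays.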
    def terminate(self):
        """
        Ends the training and closes the tensorflow sessions.
        """
        self.policy.close_sess()
        self.valueFunction.close_sess()

    def discount(self, x, gamma):
        """
        Calculates the discounted forward sum of a sequence at each point.
        """
        # lfilter over the reversed sequence computes
        # y[t] = x[t] + gamma * y[t + 1]
        return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]

    def add_disc_sum_rew(self, trajectories, gamma):
        """
        Adds the discounted sum of rewards to all time steps of all
        trajectories.
        """
        for trajectory in trajectories:
            if gamma < 0.999:  # don't scale for gamma ~= 1
                rewards = trajectory['rewards'] * (1 - gamma)
            else:
                rewards = trajectory['rewards']
            disc_sum_rew = self.discount(rewards, gamma)
            trajectory['disc_sum_rew'] = disc_sum_rew

    def add_value(self, trajectories, valueFunction):
        """
        Adds the estimated value to all time steps of all trajectories.
        """
        for trajectory in trajectories:
            observes = trajectory['observes']
            values = valueFunction.predict(observes)
            trajectory['values'] = values

    def add_gae(self, trajectories, gamma, lam):
        """
        Adds the generalized advantage estimator (GAE) to all time steps of
        all trajectories.
        """
        for trajectory in trajectories:
            if gamma < 0.999:  # don't scale for gamma ~= 1
                rewards = trajectory['rewards'] * (1 - gamma)
            else:
                rewards = trajectory['rewards']
            values = trajectory['values']
            # Temporal differences: delta[t] = r[t] + gamma * V[t+1] - V[t]
            tds = rewards - values + np.append(values[1:] * gamma, 0)
            advantages = self.discount(tds, gamma * lam)
            trajectory['advantages'] = advantages

    def build_train_set(self, trajectories):
        """
        Concatenates the lists of trajectory dictionaries into single arrays.
        """
        observes = np.concatenate([t['observes'] for t in trajectories])
        actions = np.concatenate([t['actions'] for t in trajectories])
        disc_sum_rew = np.concatenate(
            [t['disc_sum_rew'] for t in trajectories])
        advantages = np.concatenate([t['advantages'] for t in trajectories])
        # Normalize advantages
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-6)
        return observes, actions, advantages, disc_sum_rew
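
# Minimal standalone check (not part of the trainer) of the discount()
# recursion used above: filtering the reversed sequence with lfilter
# computes y[t] = x[t] + gamma * y[t + 1], the discounted forward sum.
if __name__ == '__main__':
    import numpy as np
    import scipy.signal
    x = np.array([1.0, 2.0, 3.0])
    gamma = 0.5
    out = scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]
    # Manual forward sums: 3.0, then 2.0 + 0.5 * 3.0, then 1.0 + 0.5 * 3.5
    assert np.allclose(out, [2.75, 3.5, 3.0])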