def get_action(self, state):
    a = np.zeros(self.env.num_stations)
    fs = np.reshape(featurize_cont(state, dow_one_hot=True), (1, -1))
    out = self.model.predict(fs)
    # Per-station offsets into the flattened Q-value output.
    idxs = np.cumsum([0] + [self.env.config.NUM_POWER_STEPS] * self.env.num_stations)
    if np.random.rand() < 0.997**self.epsilon:
        # Explore: pick a random power level for every occupied, not-yet-full station.
        for i, station in enumerate(state['stations']):
            if station['is_car'] and station['per_char'] < 1.0:
                a[i] = np.random.choice(self.env.actions[i])
    else:
        # Exploit: pick the power level with the highest predicted Q-value per station.
        for i, station in enumerate(state['stations']):
            if station['is_car'] and station['per_char'] < 1.0:
                a_idx = np.argmax(out[0][idxs[i]:idxs[i + 1]])
                a[i] = self.env.actions[i][a_idx]
    return self.rev_action_map[tuple(a)]
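
# Exploration note: the random branch in get_action fires with probability
# 0.997 ** self.epsilon, and train() below increments self.epsilon once per
# batch, so the chance of a random action decays geometrically (roughly
# 0.74 after 100 batches and 0.05 after 1000).
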
def train(self):
    """
    Performs training.
    """
    last_eval = 0
    last_record = 0
    scores_eval = []  # list of scores computed at iteration time

    self.init_averages()

    for t in range(self.config.num_batches):
        self.total_train_steps += 1
        last_record += 1

        # Collect a minibatch of samples.
        paths, total_rewards = self.sample_gameplay(
            self.env,
            max_ep_len=self.config.max_ep_len,
        )
        scores_eval = scores_eval + total_rewards

        # Build (s, a, r, s') arrays by shifting the observations one step.
        observations = np.concatenate(
            [path["observation"] for path in paths])
        observations_p = np.copy(observations)[1:]
        observations = observations[:-1]
        actions = np.concatenate([path["action"] for path in paths])[:-1]
        rewards = np.concatenate([path["reward"] for path in paths])[:-1]

        # Run training operations.
        fs_list = [
            featurize_cont(s, dow_one_hot=True) for s in observations
        ]
        fsp_list = [
            featurize_cont(sp, dow_one_hot=True) for sp in observations_p
        ]
        pred_fs = self.model.predict(fs_list)
        pred_fsp = self.model.predict(fsp_list)
        pred_fs2 = self.model2.predict(fs_list)
        pred_fsp2 = self.model2.predict(fsp_list)

        # One-hot masks selecting the taken action index, and (shifted one
        # step, with the last row repeated) the next action index.
        masks = np.zeros((len(actions),
                          np.prod([self.env.config.NUM_POWER_STEPS] *
                                  self.env.num_stations)))
        for i, a in enumerate(actions):
            masks[i, a] = 1
        masks_p = np.vstack(
            (np.copy(masks)[1:, :], np.copy(masks)[-1:, :]))

        # Double-estimator update: pick one model at random and move it toward
        # targets bootstrapped from the other model.
        if np.random.rand() < 0.5:
            self.model.partial_fit(
                fs_list,
                pred_fs + self.lr * masks *
                ((np.array(rewards) + self.gamma *
                  np.sum(pred_fsp2 * masks_p, axis=1)).reshape(-1, 1) -
                 pred_fs))
        else:
            self.model2.partial_fit(
                fs_list,
                pred_fs2 + self.lr * masks *
                ((np.array(rewards) + self.gamma *
                  np.sum(pred_fsp * masks_p, axis=1)).reshape(-1, 1) -
                 pred_fs2))

        # Periodic summary bookkeeping.
        if t % self.config.summary_freq == 0:
            self.update_averages(total_rewards, scores_eval)

        # Compute reward statistics for this batch and log them.
        avg_reward = np.mean(total_rewards)
        sigma_reward = np.sqrt(
            np.var(total_rewards) / len(total_rewards))
        msg = "Average reward: {:9.1f} +/- {:.2f}; batch {}/{}; epsilon={:.2f}".format(
            avg_reward, sigma_reward, t, self.config.num_batches,
            0.997**self.epsilon)
        self.logger.info(msg)

        if self.config.record and (last_record >= self.config.record_freq):
            self.logger.info("Recording...")
            last_record = 0
            self.record()

        # Step the exploration schedule (probability 0.997 ** epsilon).
        self.epsilon += 1

    self.logger.info("- Training done.")
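
# Update rule used in train(): the two value regressors are trained in a
# double-estimator fashion. On each batch one model is chosen at random and
# nudged toward a target computed from the other model, evaluated at the
# action actually taken in the next state:
#
#     target  = r + gamma * Q_other(s', a')
#     Q(s, a) <- Q(s, a) + lr * (target - Q(s, a))
#
# For illustration (made-up numbers): with r = 1.0, gamma = 0.99,
# Q_other(s', a') = 2.0 and Q(s, a) = 1.5, the target is 1.0 + 0.99 * 2.0 = 2.98
# and the stored value moves toward it by lr * (2.98 - 1.5).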