import tensorflow as tf

import env_wrapper
import rollout

# ENV_GAME_NAME, ACTORS, _process_obs and _clip_reward are defined elsewhere
# in this file, as are the module-level `global_step` and `writer` returned by
# create_tensorboard() below.


# NOTE: the opening of this callback is missing from the excerpt; the signature
# below is assumed from the variables used in the body and from the
# `end_episode=_end_episode` argument passed to EnvWrapper further down.
def _end_episode(episode, episode_step, max_step, data):
    global_step.assign_add(1)
    print('Episode: ', episode,
          ' entropy: ', data[0] / float(episode_step),
          ' reward: ', data[1],
          ' global_step: ', max_step,
          ' episode_step: ', episode_step)
    with writer.as_default(), tf.contrib.summary.always_record_summaries():
        tf.contrib.summary.scalar("entropy", data[0] / float(episode_step))
        tf.contrib.summary.scalar("reward", data[1])
        tf.contrib.summary.scalar("episode_step", episode_step)


envs = env_wrapper.EnvWrapper(ENV_GAME_NAME, ACTORS,
                              update_obs=_process_obs,
                              update_reward=_clip_reward,
                              end_episode=_end_episode)
rollouts = [rollout.Rollout() for _ in range(ACTORS)]

# Module-level statistics; `global` has no effect at module scope but is kept
# from the original.
global ep_ave_max_q_value
ep_ave_max_q_value = 0
global total_reward
total_reward = 0


def create_tensorboard():
    global_step = tf.train.get_or_create_global_step()
    logdir = "./logs/"
    writer = tf.contrib.summary.create_file_writer(logdir)
    writer.set_as_default()
    return global_step, writer
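# --- Illustrative sketch (assumption, not from the source) -------------------
# The episode-logging callback above relies on module-level `global_step` and
# `writer` objects. Under TF 1.x eager execution (assumed here because
# tf.contrib.summary is used in its eager style), they could be wired up once
# at startup like this:
if __name__ == '__main__':
    tf.enable_eager_execution()
    global_step, writer = create_tensorboard()
    # ... the training loop of the original file would follow here.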
# Method of the episode-generation class (class definition and imports are not
# shown in this excerpt; `np` is numpy, `rl` the rollout module).
def generate_episode(self):
    """
    Generate and save a single rollout with the current policy.

    Updates:
        self.rollouts
    Calls:
        _normalize, _denormalize,
        gym.environment.reset, gym.environment.step,
        gym.environment.render, gym.environment.close

    :return: None
    """
    norm_prev_obs = self.env.reset()

    # Normalize the observations
    for i in range(np.shape(self.state_dimensions)[0]):
        norm_prev_obs[i] = self._normalize(norm_prev_obs[i],
                                           self.state_dimensions[i][0],
                                           self.state_dimensions[i][1])

    norm_states = ()
    norm_actions = ()
    norm_next_states = ()
    rewards = ()

    done = False
    steps = 0
    while not done and steps < MAX_EPISODE_LENGTH:
        if self.actor is None:
            # No trained actor yet: explore with clipped Gaussian noise ~ N(0, 1)
            action = self._denormalize(np.clip(np.random.normal(0, 1), -1, 1),
                                       self.action_dimension[0],
                                       self.action_dimension[1])
        else:
            action = self._denormalize(self.actor.act(norm_prev_obs),
                                       self.action_dimension[0],
                                       self.action_dimension[1])

        norm_obs, reward, done, info = self.env.step(np.array([action]))

        # Normalize the observations
        for i in range(np.shape(self.state_dimensions)[0]):
            norm_obs[i] = self._normalize(norm_obs[i],
                                          self.state_dimensions[i][0],
                                          self.state_dimensions[i][1])

        norm_states += (norm_prev_obs, )
        norm_actions += (self._normalize(action,
                                         self.action_dimension[0],
                                         self.action_dimension[1]), )
        norm_next_states += (norm_obs, )
        rewards += (reward, )

        norm_prev_obs = norm_obs
        self.env.render()
        steps += 1

    self.env.close()

    number_of_samples = np.shape(norm_states)[0]
    # Reshape arrays to catch shape of (n,) and replace it with (n, 1)
    rollout = rl.Rollout(
        np.array(np.reshape(norm_states, (number_of_samples, -1))),
        np.array(np.reshape(norm_actions, (number_of_samples, -1))),
        np.array(np.reshape(norm_next_states, (number_of_samples, -1))),
        np.array(np.reshape(rewards, (number_of_samples, -1))))
    self.rollouts += (rollout, )
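# --- Illustrative sketch (assumption, not from the source) -------------------
# generate_episode() maps observations and actions into [-1, 1] through
# self._normalize and back through self._denormalize, using the (low, high)
# bounds stored in state_dimensions / action_dimension. A minimal linear
# scaling consistent with that usage could look like the methods below, which
# would belong to the same class; the original implementations are not shown
# in this excerpt.
def _normalize(self, value, low, high):
    # Map `value` from [low, high] onto [-1, 1].
    return 2.0 * (value - low) / (high - low) - 1.0


def _denormalize(self, norm_value, low, high):
    # Map `norm_value` from [-1, 1] back onto [low, high].
    return (norm_value + 1.0) / 2.0 * (high - low) + low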
import time

import numpy as np
import torch

import env_wrapper
import rollout as roll   # module name inferred from `roll.Rollout()` usage
import agent

# `device`, `train` and `_end_episode` are defined elsewhere in this file.


def main(arglist):
    envs = env_wrapper.EnvWrapper(arglist.scenario, arglist.actors,
                                  arglist.saved_episode,
                                  end_episode=_end_episode)
    rollouts = [roll.Rollout() for _ in range(arglist.actors)]

    # Fully-connected variant kept for reference:
    # actorCriticOld = agent.ActorCritic(envs.action_shape, arglist.learning_rate,
    #                                    arglist.epsilon, arglist.final_step,
    #                                    envs.observation_shape, envs.continious,
    #                                    envs.upper_bound).to(device)
    actorCriticOld = agent.ActorCriticCNN(envs.action_shape, arglist.learning_rate,
                                          arglist.epsilon, arglist.final_step,
                                          [1, 41, 41], envs.continious,
                                          envs.upper_bound).to(device)
    try:
        actorCriticOld.load_model('./saved_actor/' + arglist.scenario + '_'
                                  + str(arglist.load_episode_saved))
        print('Successfully loaded!')
    except Exception:
        print('FAILED TO LOAD.')

    if not arglist.eval:
        envs.set_wall_size(5)
        envs.render()

    t = 0
    batch_obs = envs.reset()
    batch_stack = []
    for obs in batch_obs:
        stack = np.array(obs)  # np.concatenate([obs, obs], axis=1)
        batch_stack.append(stack)

    while t < arglist.final_step:
        if not arglist.eval:
            time.sleep(0.05)

        actions_t = []
        values_t = []
        dist_cat_t = []
        entropy_t = []
        for stack in batch_stack:
            state = torch.FloatTensor(stack).to(device)
            dist, value = actorCriticOld(state)
            action = dist.sample()[0]
            if envs.continious:
                action = action * envs.upper_bound
            entropy_t.append(dist.entropy().mean())
            actions_t.append(action.cpu().numpy())  # FIX
            dist_cat_t.append(dist.log_prob(action).cpu().detach().numpy()[0])
            values_t.append(value.cpu().detach().numpy()[0][0])  # [0] FIX

        obs2s_t, rewards_t, dones_t = envs.step(actions_t)

        for i in range(arglist.actors):
            data = envs.get_variables_at_index(i)
            if len(data) < 4:
                data = [0, 0, 0, 0]
            envs.add_variables_at_index(i, [
                entropy_t[i].cpu().detach().numpy() + data[0],
                rewards_t[i] + data[1],
                actorCriticOld.learning_rate_decay,
                actorCriticOld.clip_param
            ])

        if t > 0 and (t / arglist.actors) % arglist.time_horizon == 0 and arglist.eval:
            next_values = np.reshape(values_t, [-1])
            train(next_values, actorCriticOld, rollouts, arglist)

        if arglist.eval and envs.can_saved:
            actorCriticOld.save_model('./saved_actor/' + arglist.scenario + '_'
                                      + str(envs.episode))

        if arglist.eval:
            for i, rollout in enumerate(rollouts):
                rollout.add(batch_stack[i][0, :], actions_t[i], rewards_t[i],
                            values_t[i], dist_cat_t[i], 1 - dones_t[i])

        t += arglist.actors
        for i, stack in enumerate(batch_stack):
            # Frame stacking kept for reference:
            # stack = stack[:, 1:, :, :]
            # batch_stack[i] = np.concatenate([stack, obs2s_t[i]], axis=1)
            batch_stack[i] = np.array(obs2s_t[i])

        if arglist.learning_rate_decay == 'linear' and arglist.eval:
            progress = t / arglist.final_step
            actorCriticOld.decay_clip_param(progress)
            actorCriticOld.decay_learning_rate(progress)
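# --- Illustrative sketch (assumption, not from the source) -------------------
# main(arglist) reads a number of attributes from `arglist`. A command-line
# parser consistent with those accesses could look like the function below;
# the option names mirror the attributes used above, but every default value
# is a placeholder guess.
def parse_args():
    import argparse
    parser = argparse.ArgumentParser(description='PPO training options')
    parser.add_argument('--scenario', type=str, default='simple')
    parser.add_argument('--actors', type=int, default=8)
    parser.add_argument('--saved-episode', type=int, default=0)
    parser.add_argument('--load-episode-saved', type=int, default=0)
    parser.add_argument('--learning-rate', type=float, default=2.5e-4)
    parser.add_argument('--epsilon', type=float, default=0.1)
    parser.add_argument('--final-step', type=int, default=1000000)
    parser.add_argument('--time-horizon', type=int, default=128)
    parser.add_argument('--learning-rate-decay', type=str, default='linear')
    parser.add_argument('--eval', action='store_true')
    return parser.parse_args()

# Typical invocation (hypothetical):
# if __name__ == '__main__':
#     main(parse_args())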
import numpy as np
import tensorflow as tf

import env_wrapper
import rollout as roll   # module name inferred from `roll.Rollout()` usage
import ppo

# TRAIN_MODE, `train`, `_process_obs`, `_clip_reward` and `_end_episode` are
# defined elsewhere in this file.


def main(arglist):
    envs = env_wrapper.EnvWrapper(arglist.scenario, arglist.actors,
                                  update_obs=_process_obs,
                                  update_reward=_clip_reward,
                                  end_episode=_end_episode)
    rollouts = [roll.Rollout() for _ in range(arglist.actors)]

    actorCritic = ppo.ActorCritic(envs.action_shape, arglist.learning_rate,
                                  arglist.epsilon, arglist.final_step,
                                  envs.observation_shape)
    actorCriticOld = ppo.ActorCritic(envs.action_shape, arglist.learning_rate,
                                     arglist.epsilon, arglist.final_step,
                                     envs.observation_shape)
    print(envs.observation_shape)

    try:
        actorCriticOld.load('./saved_1100/actor.h5')
    except Exception:
        print('failed to load')

    t = 0
    batch_obs = envs.reset()
    update_episode = 0

    while True:
        if not TRAIN_MODE:
            envs.render(0)

        actions_t = []
        dists_t = []
        values_t = []
        dist_cat_t = []
        entropy_t = []
        for stack in batch_obs:
            dist, value = actorCriticOld.model.predict(stack)
            distCat = tf.distributions.Categorical(probs=tf.nn.softmax(dist))
            action = distCat.sample(1)[0]

            entropy_t.append(distCat.entropy())
            actions_t.append(action)
            dists_t.append(dist)
            dist_cat_t.append(distCat)
            values_t.append(value)

        obs2s_t, rewards_t, dones_t = envs.step(actions_t)

        for i in range(arglist.actors):
            data = envs.get_variables_at_index(i)
            if len(data) < 2:
                data = [0, 0]
            envs.add_variables_at_index(i, [np.mean(entropy_t[i]) + data[0],
                                            rewards_t[i] + data[1]])

        if t > 0 and (t / arglist.actors) % arglist.time_horizon == 0 and TRAIN_MODE:
            next_values = np.reshape(values_t, [-1])
            train(next_values, actorCritic, actorCriticOld, rollouts, arglist)
            # if update_episode % 50 == 0:
            actorCritic.save()
            # update_episode += 1

        if TRAIN_MODE:
            for i, rollout in enumerate(rollouts):
                log_prob = dist_cat_t[i].log_prob(actions_t[i])
                rollout.add(batch_obs[i][0, :], actions_t[i][0], rewards_t[i],
                            values_t[i][0][0], log_prob[0], 1 - dones_t[i])

        t += arglist.actors
        for i, stack in enumerate(batch_obs):
            batch_obs[i] = obs2s_t[i]

        if arglist.learning_rate_decay == 'linear':
            actorCritic.decay_clip_param(t)
            actorCritic.decay_learning_rate(t)
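# --- Illustrative sketch (assumption, not from the source) -------------------
# The EnvWrapper instances above receive `update_obs=_process_obs` and
# `update_reward=_clip_reward`, but neither callback is shown in this excerpt.
# Typical PPO preprocessing clips the reward to its sign and rescales pixel
# observations; the bodies below are plausible stand-ins, not the original code.
def _clip_reward(reward):
    # Keep only the sign of the raw reward (-1, 0 or +1).
    return np.sign(reward)


def _process_obs(obs):
    # Rescale pixel observations into [0, 1]; any resizing or grayscale
    # conversion used by the original code would also go here.
    return np.array(obs, dtype=np.float32) / 255.0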