def main():
    # try: parse_cmd_args()
    sess = tf.Session()
    K.set_session(sess)
    db = Database()
    env = Environment(db, argus)
    actor_critic = ActorCritic(env, sess,
                               learning_rate=argus['learning_rate'],
                               train_min_size=argus['train_min_size'],
                               size_mem=argus['maxlen_mem'],
                               size_predict_mem=argus['maxlen_predict_mem'])

    num_trials = argus['num_trial']  # ?
    # trial_len = 500  # ?
    # ntp
    env.preheat()

    # First iteration
    cur_state = env._get_obs()  # np.array (inner_metric + sql)
    cur_state = cur_state.reshape((1, env.state.shape[0]))
    # action = env.action_space.sample()
    action = env.fetch_action()  # np.array
    action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
    # apply the action -> to steady state -> return the reward
    new_state, reward, done, _ = env.step(action, 0, 1)
    new_state = new_state.reshape((1, env.state.shape[0]))
    reward_np = np.array([reward])
    print("0-shape-")
    print(new_state.shape)
    actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
    actor_critic.train()  # len < 32, useless
    cur_state = new_state

    for i in range(num_trials):
        # env.render()
        cur_state = cur_state.reshape((1, env.state.shape[0]))
        action, isPredicted = actor_critic.act(cur_state)
        print(action)
        action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
        # action.tolist()  # to execute
        new_state, reward, done, _ = env.step(action, isPredicted, i + 1)
        new_state = new_state.reshape((1, env.state.shape[0]))
        reward_np = np.array([reward])
        print("%d-shape-" % i)
        print(new_state.shape)
        actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
        actor_critic.train()
        cur_state = new_state


# Commented-out legacy PPO rollout-collection loop, kept for reference:
'''
early_stop = False
# init = tf.global_variables_initializer()
with tf.Session() as sess:
    writer = tf.summary.FileWriter('./log/train', sess.graph)
    sess.run(tf.global_variables_initializer())
    while not early_stop:
        log_probs, values, states, actions, rewards, masks = [], [], [], [], [], []
        # each PPO step generates actions, states and rewards
        for q in range(PPO_STEPS):
            print("PPO_steps:{}".format(q))
            action, value, norm_dist = model.act(state)
            # each state, reward, done is a list of results from each parallel environment
            next_state, reward, done, _ = env.step(action)
            if render:
                env.render()
            log_prob_ = norm_dist.log_prob(action)
            # store the transition
            log_probs.append(log_prob_)
            values.append(value)
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            masks.append(1 - done)
            state = next_state
            frame_idx += 1
'''
# Variant of the warm-up step and training loop from another entry point:
# here env.step() also returns a score, and rewards obtained from predicted
# actions are collected in predicted_rewardList.
action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
# apply the action -> to steady state -> return the reward
new_state, reward, done, score, _ = env.step(action, 0, 1)
new_state = new_state.reshape((1, env.state.shape[0]))
reward_np = np.array([reward])
print("0-shape")
print(new_state.shape)
actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
actor_critic.train()  # len < 32, useless
cur_state = new_state

predicted_rewardList = []
for epoch in range(num_trials):
    # env.render()
    cur_state = cur_state.reshape((1, env.state.shape[0]))
    action, isPredicted = actor_critic.act(cur_state)
    print(action)
    action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
    # action.tolist()  # to execute
    new_state, reward, done, score, _ = env.step(action, isPredicted, epoch + 1)
    new_state = new_state.reshape((1, env.state.shape[0]))
    if isPredicted == 1:
        predicted_rewardList.append([epoch, reward])
    reward_np = np.array([reward])
    print("%d-shape" % epoch)
    print(new_state.shape)
    actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
    actor_critic.train()
def train(self):
    self.NUM_AGENTS = 1
    # self.NUM_AGENTS = len(dict_model)
    # print("train", dict_model)
    # actor_critics = []
    # local_brains = []
    # rollouts = []
    if DEBUG:
        print(self.config)

    actor_critic = ActorCritic(self.n_in, self.n_out)
    global_brain = Brain(actor_critic, self.config)
    rollout = RolloutStorage(self.NUM_ADVANCED_STEP, self.NUM_PARALLEL,
                             self.obs_shape, self.device)

    current_obs = torch.zeros(self.NUM_PARALLEL, self.obs_shape).to(self.device)
    episode_rewards = torch.zeros([self.NUM_PARALLEL, 1])
    final_rewards = torch.zeros([self.NUM_PARALLEL, 1])
    episode = np.zeros(self.NUM_PARALLEL)

    obs = self.envs.reset()
    obs = np.array(obs)
    obs = torch.from_numpy(obs).float()
    current_obs = obs
    rollout.observations[0].copy_(current_obs)

    while True:
        # for step in range(self.NUM_ADVANCED_STEP):
        for step in range(self.max_step):
            print("step", step)
            with torch.no_grad():
                # action = actor_critic.act(rollouts.observations[step])  # decide the action here
                # one action per agent for each parallel observation
                action = torch.zeros(self.NUM_PARALLEL, self.NUM_AGENTS).long().to(self.device)
                if DEBUG:
                    print("action size", self.NUM_PARALLEL, self.NUM_AGENTS)
                # for i, (k, v) in enumerate(dict_model.items()):
                #     if k == training_target:
                #         tmp_action = v.act(current_obs)
                #         target_action = copy.deepcopy(tmp_action)
                #     else:
                #         tmp_action = v.act_greedy(current_obs)
                #     action[:, i] = tmp_action.squeeze()
                action = actor_critic.act(obs)
            if DEBUG:
                print("action", action)
            if DEBUG:
                print("action shape before step", action.shape)

            obs, reward, done, infos = self.envs.step(action)  # advance the environments by one step
            print("reward(train)", reward)
            episode_rewards += reward

            # if done, clear the history of observations
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            if DEBUG:
                print("done.shape", done.shape)
            if DEBUG:
                print("masks.shape", masks.shape)
            if DEBUG:
                print("obs.shape", obs.shape)

            with open(self.resdir + "/episode_reward.txt", "a") as f:
                for i, info in enumerate(infos):
                    if 'episode' in info:
                        f.write("{:}\t{:}\t{:}\n".format(
                            episode[i], info['env_id'], info['episode']['r']))
                        print(episode[i], info['env_id'], info['episode']['r'])
                        episode[i] += 1

            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            current_obs *= masks
            current_obs = obs  # update the observation here

            rollout.insert(current_obs, action.data, reward, masks, self.NUM_ADVANCED_STEP)

            with open(self.resdir + "/reward_log.txt", "a") as f:
                # this log is only needed when an episode ends -> to be fixed
                f.write("{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\n".format(
                    episode.mean(), step,
                    reward.max().numpy(), reward.min().numpy(), reward.mean().numpy(),
                    episode_rewards.max().numpy(), episode_rewards.min().numpy(),
                    episode_rewards.mean().numpy()))
                print(episode.mean(), step, reward.mean().numpy(),
                      episode_rewards.mean().numpy())

        with torch.no_grad():
            next_value = actor_critic.get_value(rollout.observations[-1]).detach()

        rollout.compute_returns(next_value, self.gamma)
        value_loss, action_loss, total_loss, entropy = global_brain.update(rollout)

        with open(self.resdir + "/loss_log.txt", "a") as f:
            f.write("{:}\t{:}\t{:}\t{:}\t{:}\n".format(
                episode.mean(), value_loss, action_loss, entropy, total_loss))
        print("value_loss {:.4f}\taction_loss {:.4f}\tentropy {:.4f}\ttotal_loss {:.4f}"
              .format(value_loss, action_loss, entropy, total_loss))

        rollout.after_update()

        if int(episode.mean()) + 1 > self.NUM_EPISODES:
            # print("leaving the training loop")
            break

        obs = self.envs.reset()

    if self.args.save:
        save_model(actor_critic, self.resdir + "/model")  # the best model used to be saved here (note to self)

    # print("finished training agent %s" % training_target)
    # dict_model[training_target] = actor_critic  # {}
    return actor_critic
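# --- Illustrative sketch (not from the original source) ---
# RolloutStorage is used in train() above but not defined in this excerpt. The class
# below is a minimal, assumed version of it, showing how compute_returns() presumably
# works: n-step discounted returns bootstrapped from next_value, with masks zeroing
# the bootstrap across episode boundaries. All attribute names here are assumptions.
import torch


class RolloutStorageSketch:
    def __init__(self, num_steps, num_parallel, obs_shape, device):
        self.observations = torch.zeros(num_steps + 1, num_parallel, obs_shape).to(device)
        self.rewards = torch.zeros(num_steps, num_parallel, 1).to(device)
        self.masks = torch.ones(num_steps + 1, num_parallel, 1).to(device)
        self.returns = torch.zeros(num_steps + 1, num_parallel, 1).to(device)

    def compute_returns(self, next_value, gamma):
        # Bootstrap from the value estimate of the last observation, then walk backwards,
        # discounting by gamma and cutting the bootstrap where masks mark an episode end.
        self.returns[-1] = next_value
        for step in reversed(range(self.rewards.size(0))):
            self.returns[step] = (self.rewards[step]
                                  + gamma * self.returns[step + 1] * self.masks[step + 1])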
class PPO:
    def __init__(self, state_dim, action_dim, action_std, lr, betas, gamma,
                 K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        try:
            self.policy.load_state_dict(
                torch.load('./PPO_continuous_drone.pth', map_location=device))
            self.policy_old.load_state_dict(
                torch.load('./PPO_continuous_old_drone.pth', map_location=device))
            print('Saved models loaded')
        except:
            print('New models generated')

        self.MseLoss = nn.MSELoss()

    def select_action(self, state, memory):
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        return self.policy_old.act(state, memory).cpu().data.numpy().flatten()

    def update(self, memory):
        # Monte Carlo estimate of rewards:
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards),
                                       reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards:
        rewards = torch.tensor(rewards).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # Convert lists to tensors:
        old_states = torch.squeeze(torch.stack(memory.states).to(device), 1).detach()
        old_actions = torch.squeeze(torch.stack(memory.actions).to(device), 1).detach()
        old_logprobs = torch.squeeze(torch.stack(memory.logprobs), 1).to(device).detach()

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluate old actions and values:
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # Find the ratio (pi_theta / pi_theta_old):
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Find the clipped surrogate loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            loss = (-torch.min(surr1, surr2)
                    + 0.5 * self.MseLoss(state_values, rewards)
                    - 0.01 * dist_entropy)

            # Take a gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into the old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
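# --- Illustrative usage sketch (not from the original source) ---
# The PPO class above expects a memory object exposing states, actions, logprobs,
# rewards and is_terminals, and select_action() presumably appends the sampled
# state/action/logprob inside policy_old.act(). The Memory class, the "Pendulum-v1"
# environment, the classic 4-tuple gym step API and the hyperparameter values below
# are assumptions for illustration only.
import gym


class Memory:
    def __init__(self):
        self.actions, self.states, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear(self):
        self.__init__()


env = gym.make("Pendulum-v1")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

memory = Memory()
agent = PPO(state_dim, action_dim, action_std=0.5, lr=3e-4, betas=(0.9, 0.999),
            gamma=0.99, K_epochs=80, eps_clip=0.2)

update_timestep = 4000  # run a PPO update every N environment steps
timestep = 0
for episode_idx in range(10):
    state = env.reset()
    for t in range(1500):
        timestep += 1
        action = agent.select_action(state, memory)
        state, reward, done, _ = env.step(action)
        memory.rewards.append(reward)
        memory.is_terminals.append(done)
        if timestep % update_timestep == 0:
            agent.update(memory)
            memory.clear()
            timestep = 0
        if done:
            break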