traintest = Dataloader(
    "/home/zhangjian/workspace/dataset/NUS-WIDE/resize64-rf-noncrop/train",
    0, 500, 1, 22, 'nus')
flag = False
if dataset == 'flk':
    traintest = Dataloader("None", 0, 500, 1, 1, 'flk')
    flag = False
if flag:
    print('undefined_dataset')
    quit()

### model
model = ActorCritic(bit_len, batch_size)
model.cuda()
print('model over')

### train
episode_length = 1
while True:
    if episode_length % steps == 0:
        model.low_lr(rate)
    if (episode_length % 1000 == 0) and (episode_length > 20000):
        if dataset == 'cifar':
            model.eval()
            map = test_util.test(Dtest, model, batch_size, bit_len)
            file = open(logpath, "a")
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    torch.manual_seed(args.seed + rank)
    # CUDA
    if args.use_cuda:
        torch.cuda.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space, args.hidden_size)
    gpu_id = 0 if args.use_cuda else -1  # TODO: 0 refers to the first GPU
    if gpu_id >= 0:
        model = model.cuda()
    model.train()

    if not args.on_policy:
        # Normalise memory capacity by number of training processes
        memory = EpisodicReplayMemory(args.memory_capacity // args.num_processes,
                                      args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    model.load_state_dict(shared_model.state_dict())
            else:
                model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                avg_hx = torch.zeros(1, args.hidden_size)
                avg_cx = torch.zeros(1, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(1, args.hidden_size).cuda()
                        cx = torch.zeros(1, args.hidden_size).cuda()
                else:
                    hx = torch.zeros(1, args.hidden_size)
                    cx = torch.zeros(1, args.hidden_size)
                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                if gpu_id >= 0:
                    state = state.cuda()
                done, episode_length = False, 0
            else:
                # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                policy, Q, V, (hx, cx) = model(state, (hx, cx))
                # The shared average model lives on the CPU, so move the state there
                if gpu_id >= 0:
                    to_avg_state = state.cpu()
                else:
                    to_avg_state = state
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    to_avg_state, (avg_hx, avg_cx))

                # Sample action
                action = torch.multinomial(policy, 1)[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action.item())
                next_state = state_to_tensor(next_state)
                if gpu_id >= 0:
                    next_state = next_state.cuda()
                reward = args.reward_clip and min(max(reward, -1), 1) or reward  # Optionally clamp rewards
                done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline training
                    memory.append(state, action, reward, policy.detach())  # Save just tensors
                # Save outputs for online training
                [arr.append(el) for arr, el in
                 zip((policies, Qs, Vs, actions, rewards, average_policies),
                     (policy, Q, V, torch.LongTensor([[action]]),
                      torch.Tensor([[reward]]), average_policy))]

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = torch.zeros(1, 1)
                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(state, None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(state, (hx, cx))
                Qret = Qret.detach().cpu()

            # Train the network on-policy
            if gpu_id >= 0:
                Qs = list(map(lambda x: x.cpu(), Qs))
                Vs = list(map(lambda x: x.cpu(), Vs))
                policies = list(map(lambda x: x.cpu(), policies))
            _train(args, T, model, shared_model, shared_average_model, optimiser,
                   policies, Qs, Vs, actions, rewards, Qret, average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episodes
                trajectories = memory.sample_batch(args.batch_size, maxlen=args.t_max)

                # Reset hidden state
                avg_hx = torch.zeros(args.batch_size, args.hidden_size)
                avg_cx = torch.zeros(args.batch_size, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(args.batch_size, args.hidden_size).cuda()
                        cx = torch.zeros(args.batch_size, args.hidden_size).cuda()
                else:
                    hx = torch.zeros(args.batch_size, args.hidden_size)
                    cx = torch.zeros(args.batch_size, args.hidden_size)

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards, old_policies, average_policies = [], [], [], [], [], [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    # Unpack first half of transition
                    state = torch.cat(tuple(trajectory.state for trajectory in trajectories[i]), 0)
                    action = torch.LongTensor([trajectory.action for trajectory in trajectories[i]]).unsqueeze(1)
                    reward = torch.Tensor([trajectory.reward for trajectory in trajectories[i]]).unsqueeze(1)
                    old_policy = torch.cat(tuple(trajectory.policy for trajectory in trajectories[i]), 0)

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(state, (hx, cx))
                    average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(state, (avg_hx, avg_cx))

                    # Save outputs for offline training
                    [arr.append(el) for arr, el in
                     zip((policies, Qs, Vs, actions, rewards, average_policies, old_policies),
                         (policy, Q, V, action, reward, average_policy, old_policy))]

                    # Unpack second half of transition
                    next_state = torch.cat(tuple(trajectory.state for trajectory in trajectories[i + 1]), 0)
                    done = torch.Tensor([trajectory.action is None
                                         for trajectory in trajectories[i + 1]]).unsqueeze(1)

                # Do forward pass for all transitions
                _, _, Qret, _ = model(next_state, (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach().cpu()

                # Train the network off-policy
                if gpu_id >= 0:
                    Qs = list(map(lambda x: x.cpu(), Qs))
                    Vs = list(map(lambda x: x.cpu(), Vs))
                    policies = list(map(lambda x: x.cpu(), policies))
                _train(args, T, model, shared_model, shared_average_model, optimiser,
                       policies, Qs, Vs, actions, rewards, Qret, average_policies,
                       old_policies=old_policies)

        done = True

    env.close()
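# _poisson is used above to draw the number of off-policy updates per on-policy
# episode; a minimal sketch assuming Knuth's Poisson sampler, as found in common
# ACER implementations (the imported version may differ).
import random
from math import exp

def _poisson(lmbd):
    L, k, p = exp(-lmbd), 0, 1.
    while p > L:
        k += 1
        p *= random.uniform(0, 1)
    return max(k - 1, 0)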
def test_multi(testing_scene, rank, shared_model, results, config, arguments=dict()):
    torch.manual_seed(arguments['seed'] + rank)

    env = MultiSceneEnv(testing_scene, config, arguments, arguments['seed'] + rank)

    # gpu_id = arguments['gpu_ids'][rank % len(arguments['gpu_ids'])]
    gpu_id = -1

    print("Done initializing process {}: {}! Use gpu: {}".format(
        rank, testing_scene, 'yes' if gpu_id >= 0 else 'no'))

    if shared_model is not None:
        model = ActorCritic(config, arguments, gpu_id)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                model = model.cuda()
                model.load_state_dict(shared_model.state_dict())
        else:
            model.load_state_dict(shared_model.state_dict())
        model.eval()
    else:
        model = None

    state, score, target = env.reset()
    done = True

    for ep in range(1000):
        state, score, target = env.reset()
        agent_step = 0
        starting = env.current_state_id

        for step in range(arguments['num_iters']):
            if model is not None:
                with torch.no_grad():
                    value, logit = model(state, score, target)
                prob = F.softmax(logit, dim=-1)
                action = prob.max(1, keepdim=True)[1].cpu().numpy()
                # action = prob.multinomial(num_samples=1).detach().cpu().numpy()[0, 0]
            else:
                action = np.random.choice(range(arguments['action_size']))
            state, score, reward, done = env.step(action)
            ending = env.current_state_id

            if action < 2:
                agent_step += 1
            if done:
                break

        if not done:
            tm = results[target]
            tm.append(0)
            results[target] = tm
        else:
            if max(agent_step, env.shortest[ending, starting]) > 0:
                tm = results[target]
                tm.append(env.shortest[ending, starting] /
                          max(agent_step, env.shortest[ending, starting]))
                results[target] = tm
    default=2, help='number of non sampling processes (default: 2)')

mp = _mp.get_context('spawn')
print("Cuda: " + str(torch.cuda.is_available()))

if __name__ == '__main__':
    os.environ['OMP_NUM_THREADS'] = '1'
    args = parser.parse_args()
    env = setup_env(args.env_name)

    shared_model = ActorCritic(1, env.action_space.n)
    if args.use_cuda:
        shared_model.cuda()
    shared_model.share_memory()

    if os.path.isfile(args.save_path):
        print('Loading A3C parameters ...')
        shared_model.load_state_dict(torch.load(args.save_path))

    optimizer = SharedAdam(shared_model.parameters(), lr=args.lr)
    optimizer.share_memory()

    print("No of available cores : {}".format(mp.cpu_count()))

    processes = []
    counter = mp.Value('i', 0)
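# SharedAdam is imported elsewhere; a minimal sketch of the usual A3C-style
# implementation, assuming it is Adam with its state buffers placed in shared
# memory so every worker process updates the same statistics. The real class
# typically also overrides step() to read the shared step counter; that part
# is omitted here for brevity.
import torch
import torch.optim as optim

class SharedAdam(optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        # move optimizer state into shared memory, mirroring model.share_memory()
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()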
def train(rank, args, shared_model, counter, lock, optimizer=None, select_sample=True):
    FloatTensor = torch.cuda.FloatTensor if args.use_cuda else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if args.use_cuda else torch.LongTensor

    env = setup_env(args.env_name)

    model = ActorCritic(1, env.action_space.n)
    if args.use_cuda:
        model.cuda()

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = prepro(env.reset())
    state = torch.from_numpy(state)
    done = True
    episode_length = 0

    for num_iter in count():
        if rank == 0 and num_iter % args.save_interval == 0 and num_iter > 0:
            # print("Saving model at :" + args.save_path)
            torch.save(shared_model.state_dict(), args.save_path)
        # Second saver in case the first process crashes (note: this check must
        # not be nested under rank == 0, or it would never run)
        if rank == 1 and num_iter % (args.save_interval * 2.5) == 0 and num_iter > 0:
            # print("Saving model for process 1 at :" + args.save_path)
            torch.save(shared_model.state_dict(), args.save_path)

        model.load_state_dict(shared_model.state_dict())

        if done:
            cx = Variable(torch.zeros(1, 512)).type(FloatTensor)
            hx = Variable(torch.zeros(1, 512)).type(FloatTensor)
        else:
            cx = Variable(cx.data).type(FloatTensor)
            hx = Variable(hx.data).type(FloatTensor)

        values, log_probs, rewards, entropies = [], [], [], []
        actions, forwards, vec_st1s, inverses = [], [], [], []

        for step in range(args.num_steps):
            episode_length += 1
            state_inp = Variable(state.unsqueeze(0)).type(FloatTensor)
            value, logit, (hx, cx) = model((state_inp, (hx, cx)), False)
            s_t = state

            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(-1, keepdim=True)
            entropies.append(entropy)

            if select_sample:
                action = prob.multinomial(num_samples=1).data
            else:
                action = prob.max(-1, keepdim=True)[1].data
            log_prob = log_prob.gather(-1, Variable(action))
            action_out = action.to(torch.device("cpu"))

            # One-hot encode the chosen action for the ICM heads
            oh_action = torch.Tensor(1, env.action_space.n).type(LongTensor)
            oh_action.zero_()
            oh_action.scatter_(1, action, 1)
            a_t = oh_action.type(FloatTensor)

            state, reward, done, _ = env.step(action_out.numpy()[0][0])
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)  # clamp the extrinsic reward

            state = torch.from_numpy(prepro(state))
            s_t1 = state

            # Curiosity: the forward-model prediction error is the intrinsic reward
            vec_st1, inverse, forward = model(
                (Variable(s_t.unsqueeze(0)).type(FloatTensor),
                 Variable(s_t1.unsqueeze(0)).type(FloatTensor), a_t), True)
            reward_intrinsic = args.eta * ((vec_st1 - forward).pow(2)).sum(1) / 2.
            reward_intrinsic = reward_intrinsic.to(torch.device("cpu"))
            reward += reward_intrinsic
            reward1 = reward_intrinsic

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = torch.from_numpy(prepro(env.reset()))

            values.append(value)
            log_probs.append(log_prob)
            reward1 = reward1.type(FloatTensor)
            rewards.append(reward1)
            forwards.append(forward)
            vec_st1s.append(vec_st1)
            inverses.append(inverse)
            actions.append(a_t)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            state_inp = Variable(state.unsqueeze(0)).type(FloatTensor)
            value, _, _ = model((state_inp, (hx, cx)), False)
            R = value.data

        values.append(Variable(R).type(FloatTensor))

        policy_loss = 0
        value_loss = 0
        forward_loss = 0
        inverse_loss = 0
        R = Variable(R).type(FloatTensor)
        gae = torch.zeros(1, 1).type(FloatTensor)

        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            delta_t = rewards[i] + args.gamma * values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - log_probs[i] * Variable(gae).type(FloatTensor) \
                - args.entropy_coef * entropies[i]

            forward_err = forwards[i] - vec_st1s[i]
            forward_loss = forward_loss + 0.5 * (forward_err.pow(2)).sum(1)
            cross_entropy = -(actions[i] * torch.log(inverses[i] + 1e-15)).sum(1)
            inverse_loss = inverse_loss + cross_entropy

        optimizer.zero_grad()
        ((1 - args.beta) * inverse_loss + args.beta * forward_loss).backward(retain_graph=True)
        (args.lmbda * (policy_loss + 0.5 * value_loss)).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
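# The intrinsic reward computed above follows the Intrinsic Curiosity Module
# (Pathak et al., 2017). In LaTeX form, with \phi the feature embedding and
# \hat{\phi} the forward model's prediction:
#   r^{i}_{t} = \frac{\eta}{2} \lVert \hat{\phi}(s_{t+1}) - \phi(s_{t+1}) \rVert_2^2
# and the two backward passes above minimise
#   (1-\beta) L_{inverse} + \beta L_{forward}  and  \lambda (L_{policy} + \tfrac{1}{2} L_{value}).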
def test(testing_scene, test_object, rank, shared_model, results, config, arguments=dict()):
    torch.manual_seed(arguments['seed'] + rank)

    env = AI2ThorDumpEnv(testing_scene, test_object, config, arguments,
                         arguments['seed'] + rank)
    print("Finding {} in {}, {}".format(test_object, testing_scene, env.target_locs))

    if shared_model is not None:
        gpu_id = arguments['gpu_ids'][rank % len(arguments['gpu_ids'])]
        # gpu_id = -1
        model = ActorCritic(config, arguments, gpu_id)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                model = model.cuda()
                model.load_state_dict(shared_model.state_dict())
            print("[P{}] loaded model into cuda {}".format(rank, gpu_id))
        else:
            model.load_state_dict(shared_model.state_dict())
            print("[P{}] loaded model".format(rank))
        model.eval()
    else:
        model = None  # fall back to a random policy below

    state, score, target = env.reset()
    done = True
    starting = env.current_state_id

    results[rank] = 0
    for ep in range(1000):
        agent_step = 0
        for step in range(arguments['num_iters']):
            if model is not None:
                with torch.no_grad():
                    value, logit = model(state, score, target)
                prob = F.softmax(logit, dim=-1)
                action = prob.max(1, keepdim=True)[1].cpu().numpy()
                # action = prob.multinomial(num_samples=1).detach().cpu().numpy()[0, 0]
            else:
                action = np.random.choice(range(arguments['action_size']))
            state, score, reward, done = env.step(action)
            ending = env.current_state_id

            if action < 2:
                agent_step += 1
            if done:
                results[rank] += env.shortest[ending, starting] / max(
                    agent_step, env.shortest[ending, starting])
                state, score, target = env.reset()
                break

    results[rank] = results[rank] / 1000
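# The quantity accumulated in results[rank] above is an SPL-style score
# (Success weighted by Path Length, Anderson et al., 2018):
#   SPL = \frac{1}{N} \sum_{i=1}^{N} S_i \, \frac{l_i}{\max(p_i, l_i)}
# with l_i the shortest-path length, p_i the agent's path length, and S_i the
# success indicator (episodes that never reach `done` contribute 0 here).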
def train(rank, args, shared_model, counter, lock, optimizer=None, select_sample=True):
    FloatTensor = torch.cuda.FloatTensor if args.use_cuda else torch.FloatTensor

    env = setup_env(args.env_name)

    model = ActorCritic(1, env.action_space.n)
    if args.use_cuda:
        model.cuda()

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = prepro(env.reset())
    state = torch.from_numpy(state)
    done = True
    episode_length = 0

    for num_iter in count():
        if rank == 0 and num_iter % args.save_interval == 0 and num_iter > 0:
            # print("Saving model at :" + args.save_path)
            torch.save(shared_model.state_dict(), args.save_path)
        # Second saver in case the first process crashes (must not be nested
        # under rank == 0, or it would never run)
        if rank == 1 and num_iter % (args.save_interval * 2.5) == 0 and num_iter > 0:
            # print("Saving model for process 1 at :" + args.save_path)
            torch.save(shared_model.state_dict(), args.save_path)

        model.load_state_dict(shared_model.state_dict())

        if done:
            cx = Variable(torch.zeros(1, 512)).type(FloatTensor)
            hx = Variable(torch.zeros(1, 512)).type(FloatTensor)
        else:
            cx = Variable(cx.data).type(FloatTensor)
            hx = Variable(hx.data).type(FloatTensor)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            state_inp = Variable(state.unsqueeze(0)).type(FloatTensor)
            value, logit, (hx, cx) = model((state_inp, (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(-1, keepdim=True)
            entropies.append(entropy)

            if select_sample:
                action = prob.multinomial(num_samples=1).data
            else:
                action = prob.max(-1, keepdim=True)[1].data
            log_prob = log_prob.gather(-1, Variable(action))
            action_out = action.to(torch.device("cpu"))

            state, reward, done, _ = env.step(action_out.numpy()[0][0])
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                # env.change_level(0)
                state = env.reset()  # raw frame; converted below
                # print("Process {} has completed.".format(rank))
                env.locked_levels = [False] + [True] * 31

            state = torch.from_numpy(prepro(state))
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(0.001 * reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            state_inp = Variable(state.unsqueeze(0)).type(FloatTensor)
            value, _, _ = model((state_inp, (hx, cx)))
            R = value.data

        values.append(Variable(R).type(FloatTensor))

        policy_loss = 0
        value_loss = 0
        R = Variable(R).type(FloatTensor)
        gae = torch.zeros(1, 1).type(FloatTensor)

        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            delta_t = rewards[i] + args.gamma * values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - log_probs[i] * Variable(gae).type(FloatTensor) \
                - args.entropy_coef * entropies[i]

        total_loss = policy_loss + args.value_loss_coef * value_loss

        optimizer.zero_grad()
        total_loss.backward(retain_graph=True)
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
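# ensure_shared_grads is called by all the train() workers above but not shown;
# a minimal sketch of the usual A3C idiom (the AI2Thor trainer below passes an
# extra gpu flag, so its version presumably also moves gradients to the CPU).
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            # after the first call the shared parameters already alias this
            # worker's gradient tensors, so there is nothing left to do
            return
        shared_param._grad = param.grad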
def test(rank, args, shared_model, counter):
    FloatTensor = torch.cuda.FloatTensor if args.use_cuda else torch.FloatTensor

    env = setup_env(args.env_name)

    model = ActorCritic(1, env.action_space.n)
    if args.use_cuda:
        model.cuda()
    model.eval()

    state = prepro(env.reset())
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    savefile = os.getcwd() + '/save/mario_curves.csv'
    title = ['Time', 'No. Steps', 'Total Reward', 'Episode Length']
    with open(savefile, 'a', newline='') as sfile:
        writer = csv.writer(sfile)
        writer.writerow(title)

    start_time = time.time()

    actions = deque(maxlen=4000)
    episode_length = 0
    while True:
        episode_length += 1
        ep_start_time = time.time()
        if done:
            model.load_state_dict(shared_model.state_dict())
            with torch.no_grad():
                cx = Variable(torch.zeros(1, 512)).type(FloatTensor)
                hx = Variable(torch.zeros(1, 512)).type(FloatTensor)
        else:
            with torch.no_grad():
                cx = Variable(cx.data).type(FloatTensor)
                hx = Variable(hx.data).type(FloatTensor)

        with torch.no_grad():
            state_inp = Variable(state.unsqueeze(0)).type(FloatTensor)
            value, logit, (hx, cx) = model((state_inp, (hx, cx)), False)
        prob = F.softmax(logit, dim=-1)
        action = prob.max(-1, keepdim=True)[1].data
        action_out = action.to(torch.device("cpu"))

        state, reward, done, _ = env.step(action_out.numpy()[0][0])
        env.render()
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # deadlock check: if the agent repeats one action for the whole window,
        # end the episode
        actions.append(action[0][0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))

            data = [time.time() - ep_start_time, counter.value, reward_sum, episode_length]
            with open(savefile, 'a', newline='') as sfile:
                writer = csv.writer(sfile)
                writer.writerows([data])

            reward_sum = 0
            episode_length = 0
            actions.clear()
            time.sleep(180)
            state = env.reset()  # raw frame; preprocessed below

        state = torch.from_numpy(prepro(state))
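# prepro is used throughout these Mario scripts but not shown; a plausible
# minimal sketch, assuming it converts a raw RGB frame into a 1x84x84 grayscale
# float array in [0, 1] (the actual crop/resize parameters may differ).
import numpy as np
import cv2

def prepro(frame):
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)  # drop colour channels
    frame = cv2.resize(frame, (84, 84))              # downsample
    return np.expand_dims(frame.astype(np.float32) / 255.0, 0)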
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])

    actor_critic = ActorCritic(envs.observation_space.shape[0] * args.num_stack,
                               envs.action_space)

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                              eps=args.eps, alpha=args.alpha)
    # optimizer = KFACOptimizer(actor_critic, damping=1e-2, kl_clip=0.01, stat_decay=0.99)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, obs_shape[1], obs_shape[2])
    states = torch.zeros(args.num_steps + 1, args.num_processes, *obs_shape)
    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
        state = torch.from_numpy(np.stack(state)).float()
        current_state[:, :-1] = current_state[:, 1:]
        current_state[:, -1] = state

    state = envs.reset()
    update_current_state(state)

    rewards = torch.zeros(args.num_steps, args.num_processes, 1)
    returns = torch.zeros(args.num_steps + 1, args.num_processes, 1)
    actions = torch.LongTensor(args.num_steps, args.num_processes)
    masks = torch.zeros(args.num_steps, args.num_processes, 1)

    # These variables are used to compute average rewards for all processes.
    # Note that rewards are clipped, so you need to use a monitor (see envs.py)
    # to get true rewards.
    total_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        states = states.cuda()
        current_state = current_state.cuda()
        rewards = rewards.cuda()
        returns = returns.cuda()
        actions = actions.cuda()
        masks = masks.cuda()

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            _, logits = actor_critic(Variable(states[step], volatile=True))
            probs = F.softmax(logits)
            actions[step] = probs.multinomial(num_samples=1).data
            cpu_actions = actions[step].cpu().numpy()

            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            total_rewards += reward

            # If done, then clean the history of observations
            np_masks = np.array([0.0 if done_ else 1.0 for done_ in done])
            pt_masks = torch.from_numpy(np_masks.reshape(np_masks.shape[0], 1, 1, 1)).float()
            if args.cuda:
                pt_masks = pt_masks.cuda()
            current_state *= pt_masks

            update_current_state(state)
            states[step + 1].copy_(current_state)
            rewards[step].copy_(reward)
            masks[step].copy_(torch.from_numpy(np_masks))

            final_rewards *= masks[step].cpu()
            final_rewards += (1 - masks[step].cpu()) * total_rewards
            total_rewards *= masks[step].cpu()

        # Reshape to do a single forward pass for all steps
        values, logits = actor_critic(Variable(states.view(-1, *states.size()[-3:])))

        # Un-reshape
        logits_size = (args.num_steps + 1, args.num_processes, logits.size(-1))
        log_probs = F.log_softmax(logits).view(logits_size)[:-1]
        probs = F.softmax(logits).view(logits_size)[:-1]
        values = values.view(args.num_steps + 1, args.num_processes, 1)
        logits = logits.view(logits_size)[:-1]

        action_log_probs = log_probs.gather(2, Variable(actions.unsqueeze(2)))
        dist_entropy = -(log_probs * probs).sum(-1).mean()

        returns[-1] = values[-1].data
        for step in reversed(range(args.num_steps)):
            returns[step] = returns[step + 1] * args.gamma * masks[step] + rewards[step]

        value_loss = (values[:-1] - Variable(returns[:-1])).pow(2).mean()
        advantages = returns[:-1] - values[:-1].data
        action_loss = -(Variable(advantages) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
        optimizer.step()

        states[0].copy_(states[-1])

        if j % args.log_interval == 0:
            print("Updates {}, num frames {}, mean clipped reward {:.5f}, "
                  "max clipped reward {:.1f}, entropy {:.5f}, value loss {:.5f}, "
                  "policy loss {:.5f}".format(
                      j, j * args.num_processes * args.num_steps,
                      final_rewards.mean(), final_rewards.max(),
                      -dist_entropy.data[0], value_loss.data[0], action_loss.data[0]))
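# `num_updates` is referenced in main() but never defined in this excerpt; in
# comparable A2C scripts it is derived from the frame budget at module level,
# e.g. (an assumption, with args.num_frames hypothetical):
num_updates = int(args.num_frames) // args.num_steps // args.num_processes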
def train(training_scene, train_object, rank, shared_model, scheduler, counter, lock,
          config, arguments=dict(), optimizer=None):
    torch.manual_seed(arguments['seed'] + rank)

    # To prevent running out of GPU memory, push some workers to the CPU
    if arguments['train_cnn'] and rank < 10:
        arguments.update({"gpu_ids": [-1]})

    gpu_id = arguments['gpu_ids'][rank % len(arguments['gpu_ids'])]

    if gpu_id >= 0:
        torch.cuda.manual_seed(arguments['seed'] + rank)

    if optimizer is None:
        optimizer = optim.RMSprop(shared_model.parameters(), lr=arguments['lr'],
                                  alpha=0.99, eps=0.1)

    env = AI2ThorDumpEnv(training_scene, train_object, config, arguments,
                         seed=arguments['seed'] + rank)

    state, score, target = env.reset()
    starting = env.current_state_id
    done = True

    print("Done initializing process {}. Now find {} in {}! Use gpu: {}".format(
        rank, env.target, env.scene, 'yes' if gpu_id >= 0 else 'no'))

    model = ActorCritic(config, arguments, gpu_id)
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            model = model.cuda()
        dtype = torch.cuda.FloatTensor
    else:
        dtype = torch.FloatTensor
    model.train()

    # Monitoring
    total_reward_for_num_steps_list = []
    redundancies = []
    success = []
    avg_entropies = []
    learning_rates = []
    dist_to_goal = []

    start = time.time()
    episode_length = 0

    for epoch in range(arguments['num_epochs']):
        # Sync with the shared model
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                model.load_state_dict(shared_model.state_dict())
        else:
            model.load_state_dict(shared_model.state_dict())

        if arguments['lstm']:
            if done:
                cx = torch.zeros(1, 512).type(dtype)
                hx = torch.zeros(1, 512).type(dtype)
            else:
                cx = cx.detach()
                hx = hx.detach()

        if scheduler is not None:
            scheduler.step()

        learning_rates.append(optimizer.param_groups[0]['lr'])

        values = []
        log_probs = []
        rewards = []
        entropies = []

        starting = env.current_state_id
        dist_to_goal.append(min([env.shortest[starting][t] for t in env.target_ids]))

        for step in range(arguments['num_iters']):
            episode_length += 1
            if arguments['lstm']:
                value, logit, (hx, cx) = model((state, (hx, cx)), score, target)
            else:
                value, logit = model(state, score, target)
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)
            action_int = action.cpu().numpy()[0][0].item()
            state, score, reward, done = env.step(action_int)

            if done:
                success.append(1)
            elif episode_length >= arguments['max_episode_length']:
                success.append(0)

            done = done or episode_length >= arguments['max_episode_length']

            with lock:
                counter.value += 1

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            ending = env.current_state_id
            if done:
                state, score, target = env.reset()
                print('[P-{}] Epoch: {}. Episode length: {}. Total reward: {:.3f}. Time elapsed: {:.3f}'
                      .format(rank, epoch + 1, episode_length, sum(rewards),
                              (time.time() - start) / 3600))
                episode_length = 0
                break

        if not done:
            success.append(0)

        # No interaction with the environment below this point.

        # Monitoring
        total_reward_for_num_steps_list.append(sum(rewards))
        redundancies.append(step + 1 - env.shortest[ending, starting])
        avg_entropies.append(torch.cat(entropies).detach().cpu().numpy().mean())

        # Backprop and optimisation
        R = torch.zeros(1, 1)
        if not done:
            # Bootstrap: use the model's value prediction for the last state
            if arguments['lstm']:
                value, _, (hx, cx) = model((state, (hx, cx)), score, target)
            else:
                value, _ = model(state, score, target)
            R = value.detach()

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()

        for i in reversed(range(len(rewards))):
            R = arguments['gamma'] * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            if arguments['use_gae']:
                # Generalized Advantage Estimation
                delta_t = rewards[i] + arguments['gamma'] * values[i + 1] - values[i]
                gae = gae * arguments['gamma'] * arguments['tau'] + delta_t

            policy_loss = policy_loss - log_probs[i] * gae.detach() - \
                arguments['ec'] * entropies[i]

        optimizer.zero_grad()
        (policy_loss + arguments['vc'] * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), arguments['max_grad_norm'])
        ensure_shared_grads(model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()

        if (epoch + 1) % 1000 == 0 and np.mean(success[-500:]) >= 0.8 and \
                not os.path.isfile("training-history/{}/net_good.pth".format(arguments['about'])):
            torch.save(model.state_dict(),
                       "training-history/{}/net_good.pth".format(arguments['about']))

        if (epoch + 1) % 2000 == 0:
            with open('training-history/{}/{}_{}_{}.pkl'.format(
                    arguments['about'], training_scene, train_object, rank), 'wb') as f:
                pickle.dump({
                    "rewards": total_reward_for_num_steps_list,
                    "dist_to_goal": dist_to_goal,
                    "success_rate": success,
                    "redundancies": redundancies,
                    "entropies": avg_entropies,
                    "lrs": learning_rates,
                }, f, pickle.HIGHEST_PROTOCOL)

            torch.save(model.state_dict(),
                       "training-history/{}/net_{}.pth".format(arguments['about'], train_object))
def train(rank, params, shared_model, optimizer):
    torch.manual_seed(params.seed + rank)  # shifting the seed with rank to asynchronize each training agent
    env = create_atari_env(params.env_name)  # creating an optimized environment thanks to the create_atari_env function
    env.seed(params.seed + rank)  # aligning the seed of the environment on the seed of the agent
    model = ActorCritic(env.observation_space.shape[0], env.action_space)  # creating the model from the ActorCritic class
    if params.cuda:
        model.cuda()
    state = env.reset()  # state is a numpy array of size 1*42*42, in black & white
    state = torch.from_numpy(state)  # converting the numpy array into a torch tensor
    done = True  # when the game is done
    episode_length = 0  # initializing the length of an episode to 0
    while True:  # repeat
        episode_length += 1  # incrementing the episode length by one
        model.load_state_dict(shared_model.state_dict())  # synchronizing with the shared model - the agent gets the shared model to do an exploration on num_steps
        if done:  # if it is the first iteration of the while loop or if the game was just done, then:
            cx = Variable(torch.zeros(1, 256))  # the cell states of the LSTM are reinitialized to zero
            hx = Variable(torch.zeros(1, 256))  # the hidden states of the LSTM are reinitialized to zero
            cx2 = Variable(torch.zeros(1, 256))
            hx2 = Variable(torch.zeros(1, 256))
            if params.cuda:
                cx = cx.cuda()  # note: .cuda() is not in-place, so the result must be assigned
                hx = hx.cuda()
        else:
            cx = Variable(cx.data)  # we keep the old cell states, making sure they are in a torch variable
            hx = Variable(hx.data)  # we keep the old hidden states, making sure they are in a torch variable
            cx2 = Variable(cx2.data)
            hx2 = Variable(hx2.data)
            if params.cuda:
                cx = cx.cuda()
                hx = hx.cuda()
        values = []  # initializing the list of values (V(S))
        log_probs = []  # initializing the list of log probabilities
        rewards = []  # initializing the list of rewards
        entropies = []  # initializing the list of entropies
        for step in range(params.num_steps):  # going through the num_steps exploration steps
            if params.cuda:
                value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)).cuda(), (hx, cx)))  # getting from the model the output V(S) of the critic, the output Q(S,A) of the actor, and the new hidden & cell states
            else:
                value, action_values, (hx, cx), (hx2, cx2) = model((Variable(state.unsqueeze(0)), (hx, cx), (hx2, cx2)))
            prob = F.softmax(action_values, 1)  # generating a distribution of probabilities of the Q-values according to the softmax: prob(a) = exp(q(a)) / sum_b exp(q(b))
            log_prob = F.log_softmax(action_values, 1)  # generating a distribution of log probabilities of the Q-values according to the log softmax: log_prob(a) = log(prob(a))
            entropy = -(log_prob * prob).sum(1)  # H(p) = - sum_x p(x) * log(p(x))
            entropies.append(entropy)  # storing the computed entropy
            action = prob.multinomial(num_samples=1).data  # selecting an action by taking a random draw from the prob distribution
            log_prob = log_prob.gather(1, Variable(action))  # getting the log prob associated to this selected action
            values.append(value)  # storing the value V(S) of the state
            log_probs.append(log_prob)  # storing the log prob of the action
            state, reward, done, _ = env.step(action.numpy())  # playing the selected action, reaching the new state, and getting the new reward
            done = (done or episode_length >= params.max_episode_length)  # if the episode lasts too long (the agent is stuck), then it is done
            reward = max(min(reward, 1), -1)  # clamping the reward between -1 and +1
            if done:  # if the episode is done:
                episode_length = 0  # we reset the episode length
                state = env.reset()  # we restart the environment
                state = torch.from_numpy(state)  # tensorizing the new state
            rewards.append(reward)  # storing the new observed reward
            if done:  # if we are done
                break  # we stop the exploration and we directly move on to the next step: the update of the shared model
        R = torch.zeros(1, 1)  # initializing the cumulative reward
        if not done:  # if we are not done:
            if params.cuda:
                value, _, _, _ = model((Variable(state.unsqueeze(0)).cuda(), (hx, cx)))
            else:
                value, _, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx), (hx2, cx2)))
            R = value.data  # we initialize the cumulative reward with the value of the last reached state
        values.append(Variable(R))  # storing the value V(S) of the last reached state S
        policy_loss = 0  # initializing the policy loss
        value_loss = 0  # initializing the value loss
        R = Variable(R)  # making sure the cumulative reward R is a torch Variable
        gae = torch.zeros(1, 1)  # initializing the Generalized Advantage Estimation to 0
        for i in reversed(range(len(rewards))):  # starting from the last exploration step and going back in time
            R = params.gamma * R + rewards[i]  # R = r_0 + gamma*r_1 + gamma^2*r_2 + ... + gamma^(n-1)*r_(n-1) + gamma^n*V(last_state)
            advantage = R - values[i]  # R is an estimator of Q at time t = i, so advantage_i = Q_i - V(state_i) = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)  # computing the value loss
            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data  # computing the temporal difference
            gae = gae * params.gamma * params.tau + TD  # gae = sum_i (gamma*tau)^i * TD(i), i.e. gae_i = gae_(i+1)*gamma*tau + (r_i + gamma*V(state_(i+1)) - V(state_i))
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]  # computing the policy loss
        optimizer.zero_grad()  # zeroing the optimizer's gradients
        (policy_loss + 0.5 * value_loss).backward()  # we give 2x more importance to the policy loss than to the value loss because the policy loss is smaller
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)  # clipping the gradient norm to at most 40, to prevent the gradient from taking huge values and degenerating the algorithm
        ensure_shared_grads(model, shared_model)  # making sure the model of the agent and the shared model share the same gradient
        optimizer.step()  # running the optimization step
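# The backward loop above implements Generalized Advantage Estimation (GAE); in
# LaTeX form:
#   \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)
#   \hat{A}_t^{GAE(\gamma,\tau)} = \sum_{l=0}^{\infty} (\gamma\tau)^l \delta_{t+l}
# which the loop evaluates right-to-left via gae = gae * gamma * tau + TD.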
class BehavioralEmbeddedAgent(Agent):
    def __init__(self, load_dataset=True):
        super(BehavioralEmbeddedAgent, self).__init__()
        self.meta, self.data = preprocess_demonstrations()

        if load_dataset:
            # demonstration source
            self.meta = divide_dataset(self.meta)

            # datasets
            self.train_dataset = DemonstrationMemory("train", self.meta, self.data)
            self.val_dataset = DemonstrationMemory("val", self.meta, self.data)
            self.test_dataset = DemonstrationMemory("test", self.meta, self.data)
            self.full_dataset = DemonstrationMemory("full", self.meta, self.data)

            self.train_sampler = DemonstrationBatchSampler(self.train_dataset, train=True)
            self.val_sampler = DemonstrationBatchSampler(self.train_dataset, train=False)
            self.test_sampler = DemonstrationBatchSampler(self.test_dataset, train=False)
            self.episodic_sampler = SequentialDemonstrationSampler(self.full_dataset)

            self.train_loader = torch.utils.data.DataLoader(
                self.train_dataset, batch_sampler=self.train_sampler,
                num_workers=args.cpu_workers, pin_memory=True, drop_last=False)
            self.test_loader = torch.utils.data.DataLoader(
                self.test_dataset, batch_sampler=self.test_sampler,
                num_workers=args.cpu_workers, pin_memory=True, drop_last=False)

        self.loss_v_beta = torch.nn.KLDivLoss()
        self.loss_q_beta = torch.nn.KLDivLoss()
        self.loss_v_pi = torch.nn.KLDivLoss()
        self.loss_q_pi = torch.nn.KLDivLoss()

        self.histogram = torch.from_numpy(self.meta['histogram']).float()
        w_f, w_v, w_h = calc_hist_weights(self.histogram)
        w_f = torch.clamp(w_f, 0, 10).cuda()
        w_v = torch.clamp(w_v, 0, 10).cuda()
        w_h = torch.clamp(w_h, 0, 10).cuda()

        self.loss_beta_f = torch.nn.CrossEntropyLoss(size_average=True, weight=w_f)
        self.loss_beta_v = torch.nn.CrossEntropyLoss(size_average=True, weight=w_v)
        self.loss_beta_h = torch.nn.CrossEntropyLoss(size_average=True, weight=w_h)

        self.loss_pi_f = torch.nn.CrossEntropyLoss(size_average=False)
        self.loss_pi_v = torch.nn.CrossEntropyLoss(size_average=False)
        self.loss_pi_h = torch.nn.CrossEntropyLoss(size_average=False)

        self.behavioral_model = BehavioralDistEmbedding()
        self.behavioral_model.cuda()

        # actor-critic setting
        self.actor_critic_model = ActorCritic()
        self.actor_critic_model.cuda()
        self.actor_critic_target = ActorCritic()
        self.actor_critic_target.cuda()

        # configure learning
        cnn_params = [p[1] for p in self.behavioral_model.named_parameters() if "cnn" in p[0]]
        emb_params = [p[1] for p in self.behavioral_model.named_parameters() if "emb" in p[0]]
        v_beta_params = [p[1] for p in self.behavioral_model.named_parameters() if "fc_v" in p[0]]
        a_beta_params = [p[1] for p in self.behavioral_model.named_parameters() if "fc_adv" in p[0]]
        beta_f_params = [p[1] for p in self.behavioral_model.named_parameters() if "fc_beta_f" in p[0]]
        beta_v_params = [p[1] for p in self.behavioral_model.named_parameters() if "fc_beta_v" in p[0]]
        beta_h_params = [p[1] for p in self.behavioral_model.named_parameters() if "fc_beta_h" in p[0]]

        v_pi_params = [p[1] for p in self.actor_critic_model.named_parameters() if "critic_v" in p[0]]
        a_pi_params = [p[1] for p in self.actor_critic_model.named_parameters() if "critic_adv" in p[0]]
        pi_f_params = [p[1] for p in self.actor_critic_model.named_parameters() if "fc_actor_f" in p[0]]
        pi_v_params = [p[1] for p in self.actor_critic_model.named_parameters() if "fc_actor_v" in p[0]]
        pi_h_params = [p[1] for p in self.actor_critic_model.named_parameters() if "fc_actor_h" in p[0]]

        # IT IS IMPORTANT TO ASSIGN MODEL TO CUDA/PARALLEL BEFORE DEFINING OPTIMIZER
        self.optimizer_critic_v = BehavioralEmbeddedAgent.set_optimizer(v_pi_params, 0.0008)
        self.scheduler_critic_v = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_critic_v, self.decay)

        self.optimizer_critic_q = BehavioralEmbeddedAgent.set_optimizer(
            v_pi_params + a_pi_params, 0.0008)
        self.scheduler_critic_q = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_critic_q, self.decay)

        self.optimizer_v_beta = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + v_beta_params, 0.0008)
        self.scheduler_v_beta = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_v_beta, self.decay)

        self.optimizer_q_beta = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + v_beta_params + a_beta_params, 0.0008)
        self.scheduler_q_beta = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_q_beta, self.decay)

        self.optimizer_beta_f = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + beta_f_params, 0.0008)
        self.scheduler_beta_f = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_beta_f, self.decay)

        self.optimizer_beta_v = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + beta_v_params, 0.0008)
        self.scheduler_beta_v = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_beta_v, self.decay)

        self.optimizer_beta_h = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + beta_h_params, 0.0008)
        self.scheduler_beta_h = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_beta_h, self.decay)

        self.optimizer_pi_f = BehavioralEmbeddedAgent.set_optimizer(pi_f_params, 0.0008)
        self.scheduler_pi_f = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_f, self.decay)

        self.optimizer_pi_v = BehavioralEmbeddedAgent.set_optimizer(pi_v_params, 0.0008)
        self.scheduler_pi_v = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_v, self.decay)

        self.optimizer_pi_h = BehavioralEmbeddedAgent.set_optimizer(pi_h_params, 0.0008)
        self.scheduler_pi_h = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_h, self.decay)

        actions = torch.LongTensor(consts.hotvec_matrix).cuda()
        self.actions_matrix = actions.unsqueeze(0)

        self.q_bins = consts.q_bins[args.game][:-1] / self.meta['avg_score']  # the long bins are already normalized
        self.v_bins = consts.v_bins[args.game][:-1] / self.meta['avg_score']

        self.q_bins_torch = Variable(torch.from_numpy(
            consts.q_bins[args.game] / self.meta['avg_score']), requires_grad=False).cuda()
        self.v_bins_torch = Variable(torch.from_numpy(
            consts.v_bins[args.game] / self.meta['avg_score']), requires_grad=False).cuda()

        self.batch_range = np.arange(self.batch)
        self.zero = Variable(torch.zeros(1))

    def flip_grad(self, parameters):
        for p in parameters:
            p.requires_grad = not p.requires_grad

    @staticmethod
    def individual_loss_fn_l2(argument):
        return abs(argument.data.cpu().numpy()) ** 2

    @staticmethod
    def individual_loss_fn_l1(argument):
        return abs(argument.data.cpu().numpy())

    def save_checkpoint(self, path, aux=None):
        state = {
            'behavioral_model': self.behavioral_model.state_dict(),
            'actor_critic_model': self.actor_critic_model.state_dict(),
            'optimizer_critic_v': self.optimizer_critic_v.state_dict(),
            'optimizer_critic_q': self.optimizer_critic_q.state_dict(),
            'optimizer_v_beta': self.optimizer_v_beta.state_dict(),
            'optimizer_q_beta': self.optimizer_q_beta.state_dict(),
            'optimizer_beta_f': self.optimizer_beta_f.state_dict(),
            'optimizer_beta_v': self.optimizer_beta_v.state_dict(),
            'optimizer_beta_h': self.optimizer_beta_h.state_dict(),
            'optimizer_pi_f': self.optimizer_pi_f.state_dict(),
            'optimizer_pi_v': self.optimizer_pi_v.state_dict(),
            'optimizer_pi_h': self.optimizer_pi_h.state_dict(),
            'aux': aux,
        }
        torch.save(state, path)

    def load_checkpoint(self, path):
        state = torch.load(path)
        self.behavioral_model.load_state_dict(state['behavioral_model'])
        self.actor_critic_model.load_state_dict(state['actor_critic_model'])
        self.optimizer_critic_v.load_state_dict(state['optimizer_critic_v'])
        self.optimizer_critic_q.load_state_dict(state['optimizer_critic_q'])
        self.optimizer_v_beta.load_state_dict(state['optimizer_v_beta'])
        self.optimizer_q_beta.load_state_dict(state['optimizer_q_beta'])
        self.optimizer_beta_f.load_state_dict(state['optimizer_beta_f'])
        self.optimizer_beta_v.load_state_dict(state['optimizer_beta_v'])
        self.optimizer_beta_h.load_state_dict(state['optimizer_beta_h'])
        self.optimizer_pi_f.load_state_dict(state['optimizer_pi_f'])
        self.optimizer_pi_v.load_state_dict(state['optimizer_pi_v'])
        self.optimizer_pi_h.load_state_dict(state['optimizer_pi_h'])
        return state['aux']

    def resume(self, model_path):
        aux = self.load_checkpoint(model_path)
        # self.update_target()
        return aux

    def update_target(self):
        self.actor_critic_target.load_state_dict(self.actor_critic_model.state_dict())

    def batched_interp(self, x, xp, fp):
        # implemented with numpy
        x = x.data.cpu().numpy()
        xp = xp.data.cpu().numpy()
        fp = fp.data.cpu().numpy()

        y = np.zeros(x.shape)
        for i, (xl, xpl, fpl) in enumerate(zip(x, xp, fp)):
            y[i] = np.interp(xl, xpl, fpl)

        # return the interpolated values (the original returned an empty tensor
        # and discarded y, which looks like a bug)
        return Variable(torch.from_numpy(y).float().cuda(), requires_grad=False)

    def new_distribution(self, q, beta, r, bin):
        # NOTE: this method is unfinished in the original source; the inputs to
        # batched_interp were never constructed, so it is left unimplemented.
        bin = bin.repeat(self.batch, self.global_action_space, 1)
        r = r.unsqueeze(1).repeat(1, bin.shape[0])
        beta = beta.unsqueeze(1)
        # dimensions:
        #   bin  [batch, actions, bins]
        #   beta [batch, 1, actions]
        # new_bin = torch.baddbmm(r, beta, q_back, alpha=self.discount)
        raise NotImplementedError

    def learn(self, n_interval, n_tot):
        self.behavioral_model.train()
        self.actor_critic_model.train()
        self.actor_critic_target.eval()

        # keys must match what is appended below
        results = {'n': [], 'loss_vs': [], 'loss_vl': [], 'loss_b': [],
                   'loss_qs': [], 'loss_ql': [], 'loss_pi_s': [], 'loss_pi_l': [],
                   'loss_pi_s_tau': [], 'loss_pi_l_tau': []}

        # the original referenced train_net before assignment; start by training
        # the net, then alternate with policy training below (assumed intent)
        train_net = True

        for n, sample in tqdm(enumerate(self.train_loader)):
            s = Variable(sample['s'].cuda(), requires_grad=False)
            a = Variable(sample['a'].cuda(), requires_grad=False)
            a_index = Variable(sample['a_index'].cuda(non_blocking=True), requires_grad=False)

            rl = np.digitize(sample['score'].numpy(), self.long_bins, right=True)
            rs = np.digitize(sample['f'].numpy(), self.short_bins, right=True)

            Rl = Variable(sample['score'].cuda(), requires_grad=False)
            Rs = Variable(sample['f'].cuda(), requires_grad=False)
            rl = Variable(torch.from_numpy(rl).cuda(), requires_grad=False)
            rs = Variable(torch.from_numpy(rs).cuda(), requires_grad=False)

            vs, vl, beta, qs, ql, phi, pi_s, pi_l, pi_s_tau, pi_l_tau = self.model(s, a)

            # policy learning
            if self.alpha_vs and train_net:
                loss_vs = self.alpha_vs * self.loss_fn_vs(vs, rs)
                self.optimizer_vs.zero_grad()
                loss_vs.backward(retain_graph=True)
                self.optimizer_vs.step()
            else:
                loss_vs = self.zero

            if self.alpha_vl and train_net:
                loss_vl = self.alpha_vl * self.loss_fn_vl(vl, rl)
                self.optimizer_vl.zero_grad()
                loss_vl.backward(retain_graph=True)
                self.optimizer_vl.step()
            else:
                loss_vl = self.zero

            if self.alpha_b and train_net:
                loss_b = self.alpha_b * self.loss_fn_beta(beta, a_index)
                self.optimizer_beta.zero_grad()
                loss_b.backward(retain_graph=True)
                self.optimizer_beta.step()
            else:
                loss_b = self.zero

            if self.alpha_qs and train_net:
                loss_qs = self.alpha_qs * self.loss_fn_qs(qs, rs)
                self.optimizer_qs.zero_grad()
                loss_qs.backward(retain_graph=True)
                self.optimizer_qs.step()
            else:
                loss_qs = self.zero

            if self.alpha_ql and train_net:
                loss_ql = self.alpha_ql * self.loss_fn_ql(ql, rl)
                self.optimizer_ql.zero_grad()
                loss_ql.backward(retain_graph=True)
                self.optimizer_ql.step()
            else:
                loss_ql = self.zero

            a_index_np = sample['a_index'].numpy()
            self.batch_range = np.arange(self.batch)

            beta_sfm = F.softmax(beta, 1)
            pi_s_sfm = F.softmax(pi_s, 1)
            pi_l_sfm = F.softmax(pi_l, 1)
            pi_s_tau_sfm = F.softmax(pi_s, 1)
            pi_l_tau_sfm = F.softmax(pi_l, 1)

            beta_fix = Variable(beta_sfm.data[self.batch_range, a_index_np], requires_grad=False)
            pi_s_fix = Variable(pi_s_sfm.data[self.batch_range, a_index_np], requires_grad=False)
            pi_l_fix = Variable(pi_l_sfm.data[self.batch_range, a_index_np], requires_grad=False)
            pi_s_tau_fix = Variable(pi_s_tau_sfm.data[self.batch_range, a_index_np], requires_grad=False)
            pi_l_tau_fix = Variable(pi_l_tau_sfm.data[self.batch_range, a_index_np], requires_grad=False)

            if self.alpha_pi_s and not train_net:
                loss_pi_s = self.alpha_pi_s * self.loss_fn_pi_s(pi_s, a_index)
                loss_pi_s = (loss_pi_s * Rs * self.off_factor(pi_s_fix, beta_fix)).mean()
                self.optimizer_pi_s.zero_grad()
                loss_pi_s.backward(retain_graph=True)
                self.optimizer_pi_s.step()
            else:
                loss_pi_s = self.zero

            if self.alpha_pi_l and not train_net:
                loss_pi_l = self.alpha_pi_l * self.loss_fn_pi_l(pi_l, a_index)
                loss_pi_l = (loss_pi_l * Rl * self.off_factor(pi_l_fix, beta_fix)).mean()
                self.optimizer_pi_l.zero_grad()
                loss_pi_l.backward(retain_graph=True)
                self.optimizer_pi_l.step()
            else:
                loss_pi_l = self.zero

            if self.alpha_pi_s_tau and not train_net:
                loss_pi_s_tau = self.alpha_pi_s_tau * self.loss_fn_pi_s_tau(pi_s_tau, a_index)
                w = self.get_weighted_loss(F.softmax(qs, 1), self.short_bins_torch)
                loss_pi_s_tau = (loss_pi_s_tau * w * self.off_factor(pi_s_tau_fix, beta_fix)).mean()
                self.optimizer_pi_s_tau.zero_grad()
                loss_pi_s_tau.backward(retain_graph=True)
                self.optimizer_pi_s_tau.step()
            else:
                loss_pi_s_tau = self.zero

            if self.alpha_pi_l_tau and not train_net:
                loss_pi_l_tau = self.alpha_pi_l_tau * self.loss_fn_pi_l_tau(pi_l_tau, a_index)
                w = self.get_weighted_loss(F.softmax(ql, 1), self.long_bins_torch)
                loss_pi_l_tau = (loss_pi_l_tau * w * self.off_factor(pi_l_tau_fix, beta_fix)).mean()
                self.optimizer_pi_l_tau.zero_grad()
                loss_pi_l_tau.backward()
                self.optimizer_pi_l_tau.step()
            else:
                loss_pi_l_tau = self.zero

            # add results
            results['loss_vs'].append(loss_vs.data.cpu().numpy()[0])
            results['loss_vl'].append(loss_vl.data.cpu().numpy()[0])
            results['loss_b'].append(loss_b.data.cpu().numpy()[0])
            results['loss_qs'].append(loss_qs.data.cpu().numpy()[0])
            results['loss_ql'].append(loss_ql.data.cpu().numpy()[0])
            results['loss_pi_s'].append(loss_pi_s.data.cpu().numpy()[0])
            results['loss_pi_l'].append(loss_pi_l.data.cpu().numpy()[0])
            results['loss_pi_s_tau'].append(loss_pi_s_tau.data.cpu().numpy()[0])
            results['loss_pi_l_tau'].append(loss_pi_l_tau.data.cpu().numpy()[0])
            results['n'].append(n)

            # if not n % self.update_target_interval:
            #     self.update_target()

            # if an index is rolled more than once during an update_memory_interval
            # period, only the last occurrence affects the sampling probabilities
            if not (n + 1) % self.update_memory_interval and self.prioritized_replay:
                self.train_dataset.update_probabilities()

            # update a global n_step parameter
            if not (n + 1) % self.update_n_steps_interval:
                # self.train_dataset.update_n_step(n + 1)
                d = np.divmod(n + 1, self.update_n_steps_interval)[0]
                if d % 10 == 1:
                    self.flip_grad(self.parameters_group_b + self.parameters_group_a)
                    train_net = not train_net
                if d % 10 == 2:
                    self.flip_grad(self.parameters_group_b + self.parameters_group_a)
                    train_net = not train_net
                    self.scheduler_pi_s.step()
                    self.scheduler_pi_l.step()
                    self.scheduler_pi_s_tau.step()
                    self.scheduler_pi_l_tau.step()
                else:
                    self.scheduler_vs.step()
                    self.scheduler_beta.step()
                    self.scheduler_vl.step()
                    self.scheduler_qs.step()
                    self.scheduler_ql.step()

            if not (n + 1) % n_interval:
                yield results
                self.model.train()
                # self.target.eval()
                results = {key: [] for key in results}

    def off_factor(self, pi, beta):
        return torch.clamp(pi / beta, 0, 1)

    def test(self, n_interval, n_tot):
        self.model.eval()
        # self.target.eval()

        results = {'n': [], 'loss_vs': [], 'loss_b': [], 'loss_vl': [],
                   'loss_qs': [], 'loss_ql': [], 'act_diff': [], 'a_agent': [],
                   'a_player': [], 'loss_pi_s': [], 'loss_pi_l': [],
                   'loss_pi_s_tau': [], 'loss_pi_l_tau': []}

        for n, sample in tqdm(enumerate(self.test_loader)):
            s = Variable(sample['s'].cuda(), requires_grad=False)
            a = Variable(sample['a'].cuda().unsqueeze(1), requires_grad=False)
            a_index = Variable(sample['a_index'].cuda(non_blocking=True), requires_grad=False)

            rl = np.digitize(sample['score'].numpy(), self.long_bins, right=True)
            rs = np.digitize(sample['f'].numpy(), self.short_bins, right=True)

            Rl = Variable(sample['score'].cuda(), requires_grad=False)
            Rs = Variable(sample['f'].cuda(), requires_grad=False)
            rl = Variable(torch.from_numpy(rl).cuda(), requires_grad=False)
            rs = Variable(torch.from_numpy(rs).cuda(), requires_grad=False)

            vs, vl, beta, qs, ql, phi, pi_s, pi_l, pi_s_tau, pi_l_tau = self.model(s, a)
            qs = qs.squeeze(1)
            ql = ql.squeeze(1)

            # policy learning
            loss_vs = self.alpha_vs * self.loss_fn_vs(vs, rs)
            loss_vl = self.alpha_vl * self.loss_fn_vl(vl, rl)
            loss_b = self.alpha_b * self.loss_fn_beta(beta, a_index)
            loss_qs = self.alpha_qs * self.loss_fn_qs(qs, rs)
            loss_ql = self.alpha_ql * self.loss_fn_ql(ql, rl)

            a_index_np = sample['a_index'].numpy()
            self.batch_range = np.arange(self.batch)

            beta_sfm = F.softmax(beta, 1)
            pi_s_sfm = F.softmax(pi_s, 1)
            pi_l_sfm = F.softmax(pi_l, 1)
            pi_s_tau_sfm = F.softmax(pi_s, 1)
            pi_l_tau_sfm = F.softmax(pi_l, 1)

            beta_fix = Variable(beta_sfm.data[self.batch_range, a_index_np], requires_grad=False)
            pi_s_fix = Variable(pi_s_sfm.data[self.batch_range, a_index_np], requires_grad=False)
            pi_l_fix = Variable(pi_l_sfm.data[self.batch_range, a_index_np], requires_grad=False)
            pi_s_tau_fix = Variable(pi_s_tau_sfm.data[self.batch_range, a_index_np], requires_grad=False)
            pi_l_tau_fix = Variable(pi_l_tau_sfm.data[self.batch_range, a_index_np], requires_grad=False)

            loss_pi_s = self.alpha_pi_s * self.loss_fn_pi_s(pi_s, a_index)
            loss_pi_s = (loss_pi_s * Rs * self.off_factor(pi_s_fix, beta_fix)).mean()

            loss_pi_l = self.alpha_pi_l * self.loss_fn_pi_l(pi_l, a_index)
            loss_pi_l = (loss_pi_l * Rl * self.off_factor(pi_l_fix, beta_fix)).mean()

            loss_pi_s_tau = self.alpha_pi_s_tau * self.loss_fn_pi_s_tau(pi_s_tau, a_index)
            w = self.get_weighted_loss(F.softmax(qs, 1), self.short_bins_torch)
            loss_pi_s_tau = (loss_pi_s_tau * w * self.off_factor(pi_s_tau_fix, beta_fix)).mean()

            loss_pi_l_tau = self.alpha_pi_l_tau * self.loss_fn_pi_l_tau(pi_l_tau, a_index)
            w = self.get_weighted_loss(F.softmax(ql, 1), self.long_bins_torch)
            loss_pi_l_tau = (loss_pi_l_tau * w * self.off_factor(pi_l_tau_fix, beta_fix)).mean()

            # collect action statistics
            a_index_np = a_index.data.cpu().numpy()
            _, beta_index = beta.data.cpu().max(1)
            beta_index = beta_index.numpy()
            act_diff = (a_index_np != beta_index).astype(int)

            # add results
            results['act_diff'].append(act_diff)
            results['a_agent'].append(beta_index)
            results['a_player'].append(a_index_np)
            results['loss_vs'].append(loss_vs.data.cpu().numpy()[0])
            results['loss_vl'].append(loss_vl.data.cpu().numpy()[0])
            results['loss_b'].append(loss_b.data.cpu().numpy()[0])
            results['loss_qs'].append(loss_qs.data.cpu().numpy()[0])
            results['loss_ql'].append(loss_ql.data.cpu().numpy()[0])
            results['loss_pi_s'].append(loss_pi_s.data.cpu().numpy()[0])
            results['loss_pi_l'].append(loss_pi_l.data.cpu().numpy()[0])
            results['loss_pi_s_tau'].append(loss_pi_s_tau.data.cpu().numpy()[0])
            results['loss_pi_l_tau'].append(loss_pi_l_tau.data.cpu().numpy()[0])
            results['n'].append(n)

            if not (n + 1) % n_interval:
                results['s'] = s.data.cpu()
                results['act_diff'] = np.concatenate(results['act_diff'])
                results['a_agent'] = np.concatenate(results['a_agent'])
                results['a_player'] = np.concatenate(results['a_player'])
                yield results
                self.model.eval()
                # self.target.eval()
                results = {key: [] for key in results}

    def play_stochastic(self, n_tot):
        raise NotImplementedError

    def play_episode(self, n_tot):
        self.model.eval()
        env = Env()

        n_human = 120
        humans_trajectories = iter(self.data)
        softmax = torch.nn.Softmax()
        # mask = torch.FloatTensor(consts.actions_mask[args.game])
        # mask = Variable(mask.cuda(), requires_grad=False)

        vsx = torch.FloatTensor(consts.short_bins[args.game])
        vlx = torch.FloatTensor(consts.long_bins[args.game])

        for i in range(n_tot):
            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]
            choices = np.arange(self.global_action_space, dtype=int)

            j = 0
            while not env.t:
                s = Variable(env.s.cuda(), requires_grad=False)
                vs, vl, beta, qs, ql, phi, pi_s, pi_l, pi_s_tau, pi_l_tau = self.model(
                    s, self.actions_matrix)
                beta = beta.squeeze(0)
                pi_l = pi_l.squeeze(0)
                pi_s = pi_s.squeeze(0)
                pi_l_tau = pi_l_tau.squeeze(0)
                pi_s_tau = pi_s_tau.squeeze(0)

                temp = 1

                # consider only the 3 most frequent actions
                beta_np = beta.data.cpu().numpy()
                indices = np.argsort(beta_np)
                maskb = Variable(torch.FloatTensor(
                    [0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                    requires_grad=False).cuda()
                # maskb = Variable(torch.FloatTensor(
                #     [0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                #     requires_grad=False).cuda()

                # pi = maskb * (beta / beta.max())
                pi = beta
                self.greedy = True
                beta_prob = pi

                if j < n_human:
                    a = trajectory[j, self.meta['action']]
                else:
                    eps = np.random.rand()
                    # a = np.random.choice(choices)
                    if self.greedy and eps > 0.1:
                        a = pi.data.cpu().numpy()
                        a = np.argmax(a)
                    else:
                        a = softmax(pi / temp).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                env.step(a)

                vs = softmax(vs)
                vl = softmax(vl)
                vs = torch.sum(vsx * vs.data.cpu())
                vl = torch.sum(vlx * vl.data.cpu())

                yield {
                    'o': env.s.cpu().numpy(),
                    'vs': np.array([vs]),
                    'vl': np.array([vl]),
                    's': phi.data.cpu().numpy(),
                    'score': env.score,
                    'beta': beta_prob.data.cpu().numpy(),
                    'phi': phi.squeeze(0).data.cpu().numpy(),
                    'qs': qs.squeeze(0).data.cpu().numpy(),
                    'ql': ql.squeeze(0).data.cpu().numpy(),
                }

                j += 1

        # the original raised StopIteration here, which PEP 479 turns into a
        # RuntimeError inside a generator; returning is the correct way to finish
        return

    def policy(self, vs, vl, beta, qs, ql):
        pass
# Training setup
env = Environment1(train)
env.reset()

input_size = env.history_t + 1
output_size = 3
USE_CUDA = False
LR = 0.001

torch.manual_seed(0)
Q = ActorCritic(input_size, output_size)
Q_ast = copy.deepcopy(Q)  # target network
if USE_CUDA:
    Q = Q.cuda()
loss_function = nn.MSELoss()
optimizer = optim.Adam(list(Q.parameters()), lr=LR)

epoch_num = 50
step_max = len(env.data) - 1
memory_size = 200
batch_size = 50
gamma = 0.97

obs, reward, done = env.step(5)

memory = []
total_step = 0
total_rewards = []
total_losses = []
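# The training loop for this setup is not part of the excerpt; a minimal sketch
# of the DQN-style update the variables above point at. Everything here is an
# assumption: that memory holds (obs, action, reward, next_obs, done) tuples,
# that acting is epsilon-greedy (elided), and that Q_ast is refreshed per epoch.
import random
import numpy as np

for epoch in range(epoch_num):
    obs = env.reset()
    for step in range(step_max):
        # ... pick an action epsilon-greedily, call env.step, append the
        #     (obs, action, reward, next_obs, done) tuple to memory ...
        if len(memory) >= batch_size:
            batch = random.sample(memory, batch_size)
            b_obs, b_act, b_rew, b_next, b_done = map(np.asarray, zip(*batch))
            q = Q(torch.as_tensor(b_obs, dtype=torch.float32))
            with torch.no_grad():
                q_next = Q_ast(torch.as_tensor(b_next, dtype=torch.float32)).max(1)[0]
            # regress the taken action's Q-value towards r + gamma * max Q'(s')
            target = q.detach().clone()
            mask = torch.as_tensor(1.0 - b_done.astype(np.float32))
            rows = torch.arange(batch_size)
            cols = torch.as_tensor(b_act, dtype=torch.long)
            target[rows, cols] = torch.as_tensor(b_rew, dtype=torch.float32) + gamma * q_next * mask
            loss = loss_function(q, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    Q_ast = copy.deepcopy(Q)  # refresh the target network each epoch (assumption)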
def train(rank, args, shared_model, optimizer=None): mse_loss = torch.nn.MSELoss() nll_loss = torch.nn.NLLLoss() torch.manual_seed(args.seed + rank) env = env_wrapper.create_doom(args.record, outdir=args.outdir) num_outputs = env.action_space.n model = ActorCritic(env.observation_space.shape[0], env.action_space) model.cuda() if optimizer is None: optimizer = optim.Adam(shared_model.parameters(), lr=args.lr) model.train() state = env.reset() state = torch.from_numpy(state) done = True episode_length = 0 while True: episode_length += 1 # Sync with the shared model model.load_state_dict(shared_model.state_dict()) if done: cx = Variable(torch.zeros(1, 256)) hx = Variable(torch.zeros(1, 256)) else: cx = Variable(cx.data) hx = Variable(hx.data) values = [] log_probs = [] rewards = [] entropies = [] inverses = [] forwards = [] actions = [] vec_st1s = [] for step in range(args.num_steps): value, logit, (hx, cx) = model( (Variable(state.unsqueeze(0)).cuda(), (hx.cuda(), cx.cuda())), icm=False) s_t = state prob = F.softmax(logit) log_prob = F.log_softmax(logit) entropy = -(log_prob * prob).sum(1) entropies.append(entropy) # sample an action action = prob.multinomial().data log_prob = log_prob.gather(1, Variable(action)) oh_action = torch.Tensor(1, num_outputs) oh_action.zero_() oh_action.scatter_(1, action.cpu(), 1) oh_action = Variable(oh_action).cuda() a_t = oh_action actions.append(oh_action) state, reward, done, _ = env.step(action.cpu().numpy()[0][0]) if done: #print 'total reward', _['TOTAL_REWARD'] print 'reward ', reward #print 'kill count', _['KILLCOUNT'] state = torch.from_numpy(state) done = done or episode_length >= args.max_episode_length reward = max(min(reward, 1), -1) s_t1 = state vec_st1, inverse, forward = model( (Variable(s_t.unsqueeze(0)).cuda(), Variable( s_t1.unsqueeze(0)).cuda(), a_t), icm=True) reward_intrinsic = args.eta * ( (vec_st1 - forward).pow(2)).sum(1) / 2. #reward_intrinsic = args.eta * ((vec_st1 - forward).pow(2)).sum(1).sqrt() / 2. 
            reward_intrinsic = reward_intrinsic.cpu().data.numpy()[0]
            reward += reward_intrinsic
            if done:
                print('done at', episode_length * args.num_steps)
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            vec_st1s.append(vec_st1)
            inverses.append(inverse)
            forwards.append(forward)
            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model(
                (Variable(state.unsqueeze(0)).cuda(), (hx.cuda(), cx.cuda())),
                icm=False)
            R = value.cpu().data

        values.append(Variable(R).cuda())
        policy_loss = 0
        value_loss = 0
        inverse_loss = 0
        forward_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i].cpu()
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].cpu().data - values[i].cpu().data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                log_probs[i].cpu() * Variable(gae) - 0.01 * entropies[i].cpu()

            # ICM losses: the inverse model is scored with a cross-entropy over
            # actions, the forward model with an L2 error in feature space
            cross_entropy = -(actions[i] * torch.log(inverses[i] + 1e-15)).sum(1)
            inverse_loss = inverse_loss + cross_entropy
            forward_err = forwards[i] - vec_st1s[i]
            forward_loss = forward_loss + 0.5 * (forward_err.pow(2)).sum(1)

        optimizer.zero_grad()
        # Backpropagate the ICM losses first, keeping the graph alive for the
        # actor-critic losses that share it
        ((1 - args.beta) * inverse_loss + args.beta * forward_loss).backward(retain_graph=True)
        (args.lmbda * (policy_loss + 0.5 * value_loss)).backward()
        # (((1 - args.beta) * inverse_loss + args.beta * forward_loss) + args.lmbda * (policy_loss + 0.5 * value_loss)).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

        if (episode_length + 1) % 50 == 0:
            log = 'step %d: forward loss %.5f, inverse loss %.5f, cross_entropy %.5f \n' % (
                episode_length, forward_loss, inverse_loss, cross_entropy)
            print(log)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
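# A hedged launch sketch for the train() worker above, in the usual A3C layout:
# a shared model kept in shared memory and one process per worker. The
# ActorCritic constructor and the train() signature come from the snippet;
# `launch` and the args fields num_processes/record/outdir are assumptions.
import torch.multiprocessing as mp

def launch(args):
    probe_env = env_wrapper.create_doom(args.record, outdir=args.outdir)
    shared_model = ActorCritic(probe_env.observation_space.shape[0],
                               probe_env.action_space)
    shared_model.share_memory()  # workers push grads back via ensure_shared_grads
    processes = []
    for rank in range(args.num_processes):
        # Each worker builds its own Adam when optimizer is None, as above
        p = mp.Process(target=train, args=(rank, args, shared_model))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()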
def main():
    time_str = time.strftime("%Y%m%d-%H%M%S")
    print('time_str: ', time_str)

    # Find the first unused experiment index under the experiment directory
    exp_count = 0
    if args.experiment == 'a|s':
        direc_name_ = '_'.join([args.env, args.experiment])
    else:
        direc_name_ = '_'.join(
            [args.env, args.experiment, 'bp2VAE', str(args.bp2VAE)])
    direc_name_exist = True
    while direc_name_exist:
        exp_count += 1
        direc_name = '/'.join([direc_name_, str(exp_count)])
        direc_name_exist = os.path.exists(direc_name)
    try:
        os.makedirs(direc_name)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    if args.tensorboard_dir is None:
        logger = Logger('/'.join([direc_name, time_str]))
    else:
        logger = Logger(args.tensorboard_dir)

    env = gym.make(args.env)
    if args.wrapper:
        if args.video_dir is None:
            args.video_dir = '/'.join([direc_name, 'videos'])
        env = gym.wrappers.Monitor(env, args.video_dir, force=True)
    print('observation_space: ', env.observation_space)
    print('action_space: ', env.action_space)

    env.seed(args.seed)
    torch.manual_seed(args.seed)

    if args.experiment == 'a|s':
        dim_x = env.observation_space.shape[0]
    elif args.experiment == 'a|z(s)' or args.experiment == 'a|z(s, s_next)' or \
            args.experiment == 'a|z(a_prev, s, s_next)':
        dim_x = args.z_dim

    policy = ActorCritic(input_size=dim_x,
                         hidden1_size=3 * dim_x,
                         hidden2_size=6 * dim_x,
                         action_size=env.action_space.n)
    if args.use_cuda:
        Tensor = torch.cuda.FloatTensor
        torch.cuda.manual_seed_all(args.seed)
        policy.cuda()
    else:
        Tensor = torch.FloatTensor
    policy_optimizer = optim.Adam(policy.parameters(), lr=args.policy_lr)

    if args.experiment != 'a|s':
        from util import ReplayBuffer, vae_loss_function
        dim_s = env.observation_space.shape[0]
        if args.experiment == 'a|z(s)' or args.experiment == 'a|z(s, s_next)':
            from model import VAE
            vae = VAE(input_size=dim_s,
                      hidden1_size=3 * args.z_dim,
                      hidden2_size=args.z_dim)
        elif args.experiment == 'a|z(a_prev, s, s_next)':
            from model import CVAE
            vae = CVAE(input_size=dim_s,
                       class_size=1,
                       hidden1_size=3 * args.z_dim,
                       hidden2_size=args.z_dim)
        if args.use_cuda:
            vae.cuda()
        vae_optimizer = optim.Adam(vae.parameters(), lr=args.vae_lr)

        if args.experiment == 'a|z(s)':
            from util import Transition_S2S as Transition
        elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)':
            from util import Transition_S2SNext as Transition
        buffer = ReplayBuffer(args.buffer_capacity, Transition)
        update_vae = True

    if args.experiment == 'a|s':
        from util import Record_S
    elif args.experiment == 'a|z(s)':
        from util import Record_S2S
    elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)':
        from util import Record_S2SNext

    def train_actor_critic(n):
        saved_info = policy.saved_info

        # Discounted returns, normalised for variance reduction
        R = 0
        cum_returns_ = []
        for r in policy.rewards[::-1]:
            R = r + args.gamma * R
            cum_returns_.insert(0, R)
        cum_returns = Tensor(cum_returns_)
        cum_returns = (cum_returns - cum_returns.mean()) \
            / (cum_returns.std() + np.finfo(np.float32).eps)
        cum_returns = Variable(cum_returns, requires_grad=False).unsqueeze(1)

        batch_info = SavedInfo(*zip(*saved_info))
        batch_log_prob = torch.cat(batch_info.log_prob)
        batch_value = torch.cat(batch_info.value)
        # The advantage is a constant in the policy-gradient term, so detach
        # it from the value head's graph; the value head is trained only
        # through the smooth-L1 loss below
        batch_adv = (cum_returns - batch_value).detach()

        policy_loss = -torch.sum(batch_log_prob * batch_adv)
        value_loss = F.smooth_l1_loss(batch_value, cum_returns,
                                      size_average=False)

        policy_optimizer.zero_grad()
        total_loss = policy_loss + value_loss
        total_loss.backward()
        policy_optimizer.step()

        # .item() works for both CPU and CUDA tensors, so no device branch is needed
        logger.scalar_summary('value_loss', value_loss.item(), n)
        logger.scalar_summary('policy_loss', policy_loss.item(), n)
        all_value_loss.append(value_loss.item())
        all_policy_loss.append(policy_loss.item())

        del policy.rewards[:]
        del policy.saved_info[:]

    if args.experiment != 'a|s':
        def train_vae(n):
            train_times = (n // args.vae_update_frequency - 1) * args.vae_update_times
            for i in range(args.vae_update_times):
                train_times += 1
                sample = buffer.sample(args.batch_size)
                batch = Transition(*zip(*sample))
                state_batch = torch.cat(batch.state)
                if args.experiment == 'a|z(s)':
                    recon_batch, mu, log_var = vae(state_batch)
                    mse_loss, kl_loss = vae_loss_function(
                        recon_batch, state_batch, mu, log_var, logger,
                        train_times, kl_discount=args.kl_weight,
                        mode=args.experiment)
                elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)':
                    next_state_batch = Variable(torch.cat(batch.next_state),
                                                requires_grad=False)
                    predicted_batch, mu, log_var = vae(state_batch)
                    mse_loss, kl_loss = vae_loss_function(
                        predicted_batch, next_state_batch, mu, log_var, logger,
                        train_times, kl_discount=args.kl_weight,
                        mode=args.experiment)
                vae_loss = mse_loss + kl_loss
                vae_optimizer.zero_grad()
                vae_loss.backward()
                vae_optimizer.step()

                logger.scalar_summary('vae_loss', vae_loss.item(), train_times)
                all_vae_loss.append(vae_loss.item())
                all_mse_loss.append(mse_loss.item())
                all_kl_loss.append(kl_loss.item())

    # To store cum_reward, value_loss and policy_loss from each episode
    all_cum_reward = []
    all_last_hundred_average = []
    all_value_loss = []
    all_policy_loss = []
    if args.experiment != 'a|s':
        # Store each vae_loss calculated
        all_vae_loss = []
        all_mse_loss = []
        all_kl_loss = []

    for episode in count(1):
        done = False
        state_ = torch.Tensor([env.reset()])
        cum_reward = 0
        if args.experiment == 'a|z(a_prev, s, s_next)':
            # Take one random action first so there is a previous action to condition on
            action = random.randint(0, 2)
            state_, reward, done, info = env.step(action)
            cum_reward += reward
            state_ = torch.Tensor([np.append(state_, action)])

        while not done:
            if args.experiment == 'a|s':
                state = Variable(state_, requires_grad=False)
            elif args.experiment == 'a|z(s)' or args.experiment == 'a|z(s, s_next)' \
                    or args.experiment == 'a|z(a_prev, s, s_next)':
                state_ = Variable(state_, requires_grad=False)
                mu, log_var = vae.encode(state_)
                if args.bp2VAE and update_vae:
                    state = vae.reparametrize(mu, log_var)
                else:
                    state = vae.reparametrize(mu, log_var).detach()

            action_ = policy.select_action(state)
            if args.use_cuda:
                action = action_.cpu()[0, 0]
            else:
                action = action_[0, 0]
            next_state_, reward, done, info = env.step(action)
            next_state_ = torch.Tensor([next_state_])
            cum_reward += reward
            if args.render:
                env.render()
            policy.rewards.append(reward)

            if args.experiment == 'a|z(s)':
                buffer.push(state_)
            elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)':
                if not done:
                    buffer.push(state_, next_state_)
            if args.experiment == 'a|z(a_prev, s, s_next)':
                next_state_ = torch.cat(
                    [next_state_, torch.Tensor([action])], 1)
            state_ = next_state_

        train_actor_critic(episode)
        last_hundred_average = sum(all_cum_reward[-100:]) / 100
        logger.scalar_summary('cum_reward', cum_reward, episode)
        logger.scalar_summary('last_hundred_average', last_hundred_average, episode)
        all_cum_reward.append(cum_reward)
        all_last_hundred_average.append(last_hundred_average)

        # update_vae only exists for the latent-state experiments, so test the
        # experiment type before reading it (checking update_vae first would
        # raise a NameError in the 'a|s' case)
        if args.experiment != 'a|s' and update_vae and episode % args.vae_update_frequency == 0:
            assert len(buffer) >= args.batch_size
            train_vae(episode)
            # Freeze the VAE once its loss curve flattens out
            if len(all_vae_loss) > 1000:
                if abs(sum(all_vae_loss[-500:]) / 500 -
                       sum(all_vae_loss[-1000:-500]) / 500) < args.vae_update_threshold:
                    update_vae = False

        if episode % args.log_interval == 0:
            print('Episode {}\tLast cum return: {:.5f}\t'
                  '100-episode average cum return: {:.2f}'.format(
                      episode, cum_reward, last_hundred_average))

        if episode > args.num_episodes:
            print('100-episode average cum return is now {} and '
                  'the last episode ran for {} time steps!'.format(
                      last_hundred_average, cum_reward))
            env.close()
            torch.save(policy, '/'.join([direc_name, 'model']))

            if args.experiment == 'a|s':
                record = Record_S(policy_loss=all_policy_loss,
                                  value_loss=all_value_loss,
                                  cum_reward=all_cum_reward,
                                  last_hundred_average=all_last_hundred_average)
            elif args.experiment == 'a|z(s)':
                record = Record_S2S(policy_loss=all_policy_loss,
                                    value_loss=all_value_loss,
                                    cum_reward=all_cum_reward,
                                    last_hundred_average=all_last_hundred_average,
                                    mse_recon_loss=all_mse_loss,
                                    kl_loss=all_kl_loss,
                                    vae_loss=all_vae_loss)
            elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)':
                record = Record_S2SNext(policy_loss=all_policy_loss,
                                        value_loss=all_value_loss,
                                        cum_reward=all_cum_reward,
                                        last_hundred_average=all_last_hundred_average,
                                        mse_pred_loss=all_mse_loss,
                                        kl_loss=all_kl_loss,
                                        vae_loss=all_vae_loss)
            pickle.dump(record, open('/'.join([direc_name, 'record']), 'wb'))
            break
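# A sketch of reading the pickled record back for offline analysis. It assumes
# the Record_* namedtuples are importable from util (pickle must be able to
# resolve them) and that 'path/to/record' stands in for the run's direc_name.
import pickle

with open('path/to/record', 'rb') as f:  # hypothetical path
    record = pickle.load(f)
print('episodes logged:', len(record.cum_reward))
print('final 100-episode average:', record.last_hundred_average[-1])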