def optimize(self, params, optimizer, shared_model, gpu_id):
    if 'Unreal' in self.args.env:
        self.gate_ids = self.env.env.env.env.gate_ids
    else:
        self.gate_ids = self.env.gate_ids
    self.random_ids = self.env.random_ids

    R = torch.zeros(self.num_agents, 1).to(self.device)
    if not self.done:
        # predict value
        state = self.state
        value_multi, *others = self.model((
            (Variable(state, requires_grad=True), (self.img_hxs, self.img_cxs)),
            (Variable(torch.Tensor(self.cam_info), requires_grad=True),
             (self.pose_hxs, self.pose_cxs)),
            (Variable(torch.Tensor(self.pre_actions), requires_grad=True),
             Variable(torch.Tensor(self.gate_ids)),
             Variable(torch.Tensor(self.random_ids)))
        ))
        for i in range(self.num_agents):
            R[i][0] = value_multi[i].data
    self.values.append(Variable(R).to(self.device))

    policy_loss = torch.zeros(self.num_agents, 1).to(self.device)
    value_loss = torch.zeros(self.num_agents, 1).to(self.device)
    pred_loss = torch.zeros(1, 1).to(self.device)
    entropies = torch.zeros(self.num_agents, 1).to(self.device)
    w_entropies = torch.Tensor([[float(self.args.entropy)]
                                for i in range(self.num_agents)]).to(self.device)

    R = Variable(R, requires_grad=True).to(self.device)
    gae = torch.zeros(1, 1).to(self.device)
    for i in reversed(range(len(self.rewards))):
        R = self.args.gamma * R + self.rewards[i]
        advantage = R - self.values[i]
        value_loss = value_loss + 0.5 * advantage.pow(2)
        # Generalized Advantage Estimation
        delta_t = self.rewards[i] + self.args.gamma * self.values[i + 1].data - \
            self.values[i].data
        gae = gae * self.args.gamma * self.args.tau + delta_t
        policy_loss = policy_loss - \
            (self.log_probs[i] * Variable(gae)) - \
            (w_entropies * self.entropies[i])
        entropies += self.entropies[i]

    policy_loss = policy_loss[self.env.random_ids]
    value_loss = value_loss[self.env.random_ids]
    loss = policy_loss.sum() + 0.5 * value_loss.sum()

    self.model.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(params, 50)
    ensure_shared_grads(self.model, shared_model, gpu=gpu_id >= 0)
    optimizer.step()
    values0 = self.values[0].data
    self.clear_actions()
    return policy_loss, value_loss, entropies, pred_loss, values0
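# Every trainer in this collection copies the worker's gradients into the
# shared model through an ensure_shared_grads helper that is not shown here.
# A minimal sketch, assuming the common pytorch-a3c style helper (the versions
# used above differ in detail, e.g. some take a use_gpu flag or explicit
# devices instead of a boolean gpu argument):
def ensure_shared_grads(model, shared_model, gpu=False):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            # CPU workers share gradient storage after the first copy.
            return
        elif not gpu:
            shared_param._grad = param.grad
        else:
            # GPU workers: move the local gradient to the CPU-resident shared model.
            shared_param._grad = param.grad.cpu()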
def train(shared_model, optimizer, wholes, scaffolds, whole_conditions,
          scaffold_conditions, pid, retval_list, args):
    """\
    Target function for the multiprocessed training.
    In addition to updating model parameters, loss values are collected
    by `retval_list` after each `forward`.

    Parameters
    ----------
    shared_model: torch.nn.Module
        A shared model to be trained.
    optimizer: torch.optim.Optimizer
        A shared optimizer.
    wholes: list[str]
        A list of whole-molecule SMILES strings.
    scaffolds: list[str]
        A list of scaffold SMILES strings.
    whole_conditions: list[ list[float] ]
        [ [ value1, value2, ... ],   # condition values of whole 1
          [ value1, value2, ... ],   # condition values of whole 2
          ]
    scaffold_conditions: list[ list[float] ]
        Similar to `whole_conditions`, but with scaffold values.
    pid: int
        CPU index.
    retval_list: list[multiprocessing.managers.ListProxy]
        A list of lists to collect loss floats from CPUs.
        In each cycle, the final shape will be:
        (ncpus, minibatch_size, num_of_losses)
    args: argparse.Namespace
        Delivers parameters from command arguments to the model.
    """
    # Each worker process builds its own local model.
    model = ggm(args)
    for idx in range(len(wholes)):
        # Set the local model's parameters to those of the shared (reference) model.
        model.load_state_dict(shared_model.state_dict())
        model.zero_grad()
        optimizer.zero_grad()

        # forward
        retval = model(wholes[idx], scaffolds[idx], whole_conditions[idx],
                       scaffold_conditions[idx], args.shuffle_order)
        # If retval is None, some error occurred; this is usually due to an invalid SMILES.
        if retval is None:
            continue

        # train model
        g_gen, h_gen, loss1, loss2, loss3 = retval
        loss = loss1 + loss2 * args.beta1 + loss3  # torch.autograd.Variable of shape (1,)
        retval_list[pid].append((loss.data.cpu().numpy()[0],
                                 loss1.data.cpu().numpy()[0],
                                 loss2.data.cpu().numpy()[0],
                                 loss3.data.cpu().numpy()[0]))
        loss.backward()
        #torch.nn.utils.clip_grad_norm(model.parameters(), 0.5)
        utils.ensure_shared_grads(model, shared_model, True)
        optimizer.step()
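# A hypothetical driver for the train() target above (launch_training and
# ncpus are illustrative names; the real main script of this repository may
# shard wholes/scaffolds per worker rather than passing the full lists):
import torch.multiprocessing as mp

def launch_training(shared_model, optimizer, wholes, scaffolds,
                    whole_conditions, scaffold_conditions, args, ncpus=4):
    shared_model.share_memory()
    manager = mp.Manager()
    retval_list = [manager.list() for _ in range(ncpus)]
    processes = []
    for pid in range(ncpus):
        p = mp.Process(target=train,
                       args=(shared_model, optimizer, wholes, scaffolds,
                             whole_conditions, scaffold_conditions,
                             pid, retval_list, args))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    return retval_list  # roughly (ncpus, minibatch_size, num_of_losses)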
def work(self):
    """ Worker training procedure """
    self.step = 0
    self.model_state = copy.deepcopy(
        self.local_model.init_state(self.device))
    while True:
        self.step += 1
        # update local variables with the weights
        # of the global net
        if self.cfg.USE_GPU:
            with torch.cuda.device(self.gpu_id):
                self.local_model.load_state_dict(
                    self.global_model.state_dict())
        else:
            self.local_model.load_state_dict(
                self.global_model.state_dict())

        # accumulate some experience
        # and build the loss
        loss = self.process_rollout()

        # backward pass and
        # update the global model weights
        self.local_model.zero_grad()
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, self.local_model.parameters()), 40.0)
        ut.ensure_shared_grads(self.local_model, self.global_model,
                               use_gpu=self.cfg.USE_GPU)
        self.optimizer.step()

        self.logger.log_value('loss', self.step, loss.item(),
                              print_value=False, to_file=False)

        if (self.step % self.cfg.SAVE_STEP) == 0 and (
                self.ident % 4 == 0):  #self.name == 'a3c_train_worker_0':
            torch.save(self.global_model.state_dict(), self.ckpt_path)
            print('Variables saved')

        if self.episode_count > self.cfg.MAX_EPISODES:
            # terminate the training
            if self.worker_name == 'a3c_train_worker_0':
                torch.save(self.global_model.state_dict(), self.ckpt_path)
                print('Variables saved')
            break
def training(self, next_observation, shared_model, shared_optimizer, params): self.model.train() self.n_update += 1 self.cx = Variable(self.cx.data) self.hx = Variable(self.hx.data) R = torch.zeros(1, 1) if not self.done: self.state = preprocess(next_observation) with torch.cuda.device(self.gpu_id): obs = Variable(torch.FloatTensor(self.state)).cuda() value, _, _, _, _ = self.model(obs, self.target, self.hx, self.cx, self.eps_len, self.external_memory, self.gpu_id) R = value.data if self.gpu_id >= 0: with torch.cuda.device(self.gpu_id): R = R.cuda() R = Variable(R) self.values.append(R) policy_loss = 0 value_loss = 0 gae = torch.zeros(1, 1) if self.gpu_id >= 0: with torch.cuda.device(self.gpu_id): gae = gae.cuda() for i in reversed(range(len(self.rewards))): R = params.gamma * R + self.rewards[i] advantage = R - self.values[i] value_loss = value_loss + advantage.pow(2) # 0.5 * # Generalized Advantage Estimataion delta_t = params.gamma * self.values[ i + 1].data - self.values[i].data + self.rewards[i] gae = gae * params.gamma * params.tau + delta_t policy_loss = policy_loss - self.log_probs[i] * Variable( gae) - params.entropy_coef * self.entropies[i] self.model.zero_grad() (policy_loss + params.value_loss_coef * value_loss).backward() #retain_graph=True clip_grad_norm_(self.model.parameters(), 1.0) ensure_shared_grads(self.model, shared_model, gpu=self.gpu_id >= 0) shared_optimizer.step() with torch.cuda.device(self.gpu_id): self.model.load_state_dict( shared_model.state_dict()) #model update self.clear_actions()
def train(self, global_t, summary_writer=None):
    t = self.local_t
    if not self.replay_buffer.is_full():
        self.fill_experience()
        return 0

    # time_step = 0
    # sync
    if self.gpu_id >= 0:
        with torch.cuda.device(self.gpu_id):
            self.model.load_state_dict(self.shared_model.state_dict())
    else:
        self.model.load_state_dict(self.shared_model.state_dict())

    loss_a3c, episode_score = self.process_a3c()
    # grab the current hx, cx
    h0, c0 = self.hx.detach(), self.cx.detach()
    loss_pc = self.process_pc(h0=h0, c0=c0)
    h0, c0 = self.hx.detach(), self.cx.detach()
    loss_vr = self.process_vr(h0, c0)
    loss_rp = self.process_rp()
    loss = loss_a3c + loss_pc + loss_vr + loss_rp

    self.model.zero_grad()
    loss.backward()
    clip_grad_norm_(self.model.parameters(), 40.0)
    ensure_shared_grads(self.model, self.shared_model, gpu=self.gpu_id >= 0)
    self.adjust_learning_rate(optimizer=self.optimizer,
                              global_time_step=global_t)
    self.optimizer.step()

    if summary_writer is not None:
        with torch.no_grad():
            losses = list(
                map(lambda x: float(x.detach().cpu().numpy()),
                    [loss_a3c, loss_pc, loss_vr, loss_rp, loss]))
            tags = dict(
                zip(['a3c', 'pc', 'vr', 'rp', 'total_loss'], losses))
            summary_writer.add_scalars('losses', tags, global_step=global_t)
        # episode score
        if episode_score:
            summary_writer.add_scalars('score', {'score': episode_score},
                                       global_step=global_t)

    self._print_log(global_t)
    return self.local_t - t  # offset
def training(self, next_obs, shared_model, shared_optimizer, params): #pdb.set_trace() # self.model.train() self.cx = Variable(self.cx.data) self.hx = Variable(self.hx.data) R = torch.zeros(1, 1) if not self.done: state = preprocessing(next_obs, self.obs_old, self.gpu_id) value, _, _, _ = self.model(state, self.hx, self.cx) R = value.data with torch.cuda.device(self.gpu_id): R = R.cuda() R = Variable(R) self.values.append(R) policy_loss = 0 value_loss = 0 gae = torch.zeros(1, 1) with torch.cuda.device(self.gpu_id): gae = gae.cuda() for i in reversed(range(len(self.rewards))): R = params.gamma * R + self.rewards[i] advantage = R - self.values[i] value_loss = value_loss + advantage.pow(2) # 0.5 * # Generalized Advantage Estimation delta_t = params.gamma * self.values[ i + 1].data - self.values[i].data + self.rewards[i] gae = gae * params.gamma * params.tau + delta_t policy_loss = policy_loss - self.log_probs[i] * Variable( gae) - params.entropy_coef * self.entropies[i] shared_optimizer.zero_grad() loss = policy_loss + params.value_loss_coef * value_loss loss.backward() clip_grad_norm_(self.model.parameters(), 50.0) ensure_shared_grads(self.model, shared_model, gpu=self.gpu_id >= 0) shared_optimizer.step() # self.synchronize(shared_model) with torch.cuda.device(self.gpu_id): self.model.load_state_dict(shared_model.state_dict()) self.clear_all()
def optimize(self, params, optimizer, shared_model, training_mode, device_share): R = torch.zeros(len(self.rewards[0]), 1).to(self.device) if not self.done: # predict value state = self.state value_multi, *others = self.model( Variable(state, requires_grad=True)) for i in range(len(self.rewards[0])): # num_agent R[i][0] = value_multi[i].data self.values.append(Variable(R).to(self.device)) batch_size = len(self.entropies[0][0]) policy_loss = torch.zeros(batch_size, 1).to(self.device) value_loss = torch.zeros(1, 1).to(self.device) entropies = torch.zeros(batch_size, self.dim_action).to(self.device) w_entropies = float(self.args.entropy) R = Variable(R, requires_grad=True).to(self.device) gae = torch.zeros(1, 1).to(self.device) for i in reversed(range(len(self.rewards))): R = self.args.gamma * R + self.rewards[i] advantage = R - self.values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion delta_t = self.rewards[i] + self.args.gamma * self.values[ i + 1].data - self.values[i].data gae = gae * self.args.gamma * self.args.tau + delta_t policy_loss = policy_loss - \ (self.log_probs[i] * Variable(gae)) - \ (w_entropies * self.entropies[i]) entropies += self.entropies[i].sum() self.model.zero_grad() loss = policy_loss.sum() + 0.5 * value_loss.sum() loss.backward(retain_graph=True) torch.nn.utils.clip_grad_norm_(params, 50) ensure_shared_grads(self.model, shared_model, self.device, device_share) optimizer.step() self.clean_buffer(self.done) return policy_loss, value_loss, entropies
def train (rank, args, shared_model, optimizer, env_conf, datasets=None): ptitle('Training Agent: {}'.format(rank)) print ('Start training agent: ', rank) if rank == 0: logger = Logger (args.log_dir) train_step = 0 gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] env_conf ["env_gpu"] = gpu_id torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) if "EM_env" in args.env: raw, lbl, prob, gt_lbl = datasets env = EM_env (raw, lbl, prob, env_conf, 'train', gt_lbl) else: env = Voronoi_env (env_conf) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop (shared_model.parameters (), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam (shared_model.parameters (), lr=args.lr, amsgrad=args.amsgrad) # env.seed (args.seed + rank) if not args.continuous: player = Agent (None, env, args, None) else: player = Agent_continuous (None, env, args, None) player.gpu_id = gpu_id if not args.continuous: player.model = A3Clstm (env.observation_space.shape, env_conf["num_action"], args.hidden_feat) else: player.model = A3Clstm_continuous (env.observation_space.shape, env_conf["num_action"], args.hidden_feat) player.state = player.env.reset () player.state = torch.from_numpy (player.state).float () old_score = player.env.old_score final_score = 0 if gpu_id >= 0: with torch.cuda.device (gpu_id): player.state = player.state.cuda () player.model = player.model.cuda () player.model.train () if rank == 0: eps_reward = 0 pinned_eps_reward = 0 mean_log_prob = 0 # print ("rank: ", rank) while True: if gpu_id >= 0: with torch.cuda.device (gpu_id): player.model.load_state_dict (shared_model.state_dict ()) else: player.model.load_state_dict (shared_model.state_dict ()) if player.done: player.eps_len = 0 if rank == 0: if 0 <= (train_step % args.train_log_period) < args.max_episode_length: print ("train: step", train_step, "\teps_reward", eps_reward, "\timprovement", final_score - old_score) old_score = player.env.old_score pinned_eps_reward = eps_reward eps_reward = 0 mean_log_prob = 0 if gpu_id >= 0: with torch.cuda.device(gpu_id): player.cx = Variable(torch.zeros(1, args.hidden_feat).cuda()) player.hx = Variable(torch.zeros(1, args.hidden_feat).cuda()) else: player.cx = Variable(torch.zeros(1, args.hidden_feat)) player.hx = Variable(torch.zeros(1, args.hidden_feat)) else: player.cx = Variable(player.cx.data) player.hx = Variable(player.hx.data) for step in range(args.num_steps): player.action_train () if rank == 0: # if 0 <= (train_step % args.train_log_period) < args.max_episode_length: # print ("train: step", train_step, "\taction = ", player.action) eps_reward += player.reward # print (eps_reward) mean_log_prob += player.log_probs [-1] / env_conf ["T"] if player.done: break if player.done: # if rank == 0: # print ("----------------------------------------------") final_score = player.env.old_score state = player.env.reset () player.state = torch.from_numpy (state).float () if gpu_id >= 0: with torch.cuda.device (gpu_id): player.state = player.state.cuda () R = torch.zeros (1, 1) if not player.done: if not args.continuous: value, _, _ = player.model((Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) else: value, _, _, _ = player.model((Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) R = value.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) 
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            delta_t = player.values[i + 1].data * args.gamma + player.rewards[i] - \
                player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            # print (player.rewards [i])
            if not args.continuous:
                policy_loss = policy_loss - \
                    player.log_probs[i] * \
                    Variable(gae) - 0.01 * player.entropies[i]
            else:
                policy_loss = policy_loss - \
                    player.log_probs[i].sum () * Variable(gae) - \
                    0.01 * player.entropies[i].sum ()

        player.model.zero_grad ()
        sum_loss = (policy_loss + value_loss)
        sum_loss.backward ()
        ensure_shared_grads (player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step ()
        player.clear_actions ()

        if rank == 0:
            train_step += 1
            if train_step % args.log_period == 0:
                log_info = {
                    # 'train: sum_loss': sum_loss,
                    'train: value_loss': value_loss,
                    'train: policy_loss': policy_loss,
                    'train: advantage': advantage,
                    # 'train: entropy': entropy,
                    'train: eps reward': pinned_eps_reward,
                    # 'train: mean log prob': mean_log_prob
                }
                for tag, value in log_info.items ():
                    logger.scalar_summary (tag, value, train_step)
def trainhoc(rank, args, shared_model, optimizer, env_conf): ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = OC_env(args.env) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) env.seed(args.seed + rank) player = HOCAgent(None, env, args, None) player.gpu_id = gpu_id player.model = HOCModel(player.env.observation_space.shape[0], player.env.action_space, args.options, args.width) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() player.eps_len += 2 threshold = 0 EnvNumSteps = 0 while True: if EnvNumSteps > threshold: threshold += 5000 print("thread:", rank, "steps:", EnvNumSteps) if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) if player.done: ### add in option selection part probo1, logpo1, player.o1 = player.model.getPolicyO1( Variable(player.state)) probo2, logpo2, player.o2 = player.model.getPolicyO2( Variable(player.state), player.o1) else: player.o1 = player.o1 player.o2 = player.o2 for step in range(args.num_steps): EnvNumSteps += 1 player.action_train() if player.done: break if player.done: state = player.env.reset() player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() R = torch.zeros(1, 1) if not player.done: q = player.model(Variable(player.state)) v = q.max(-1)[0] R = v.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(Variable(R)) policy_loss = torch.zeros(1, 1) value_loss = torch.zeros(1, 1) phi_loss = torch.zeros(1, 1) gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) thesize = len(player.rewards) for i in reversed(range(len(player.rewards))): ### update discounted reward before = R R = args.gamma * R + player.rewards[i] ### update value function difference1 = R - player.qs1[i] value_loss = value_loss + 0.5 * difference1.pow(2) difference2 = R - player.qs2[i] value_loss = value_loss + 0.5 * difference2.pow(2) if i + 1 < thesize: difference3 = before - player.values[i + 1] difference4 = before - player.qs1[i + 1] ### update policy # adv1 = R - player.qs1[i] delta2 = R - player.qs2[i] policy_loss = policy_loss - \ player.log_probsa[i] * \ Variable(delta2) - 0.1 * player.entropiesA[i] if i + 1 < thesize: beta1 = player.termprobs1[i + 1].data beta2 = player.termprobs2[i + 1].data policy_loss = policy_loss - \ args.gamma * player.log_probso1[i+1] * \ Variable(beta1 * beta2 * difference3.data) - 0.1 * player.entropieso1[i+1] policy_loss = policy_loss - \ args.gamma * player.log_probso2[i+1] * \ Variable(beta2 * difference4.data) - 0.1 * player.entropieso2[i+1] advantage1 = player.qs1[i + 1].data - player.values[ i + 1].data + args.delib phi_loss = phi_loss + \ args.gamma * player.termprobs1[i+1] * \ Variable(advantage1 * beta2, requires_grad=False) advantage2 = player.qs2[ i + 1].data - (1 - beta1) * player.qs1[i + 1].data - ( beta1 * player.values[i + 1].data) + args.delib phi_loss = phi_loss + \ args.gamma * 
player.termprobs2[i+1] * \ Variable(advantage2, requires_grad=False) player.model.zero_grad() (phi_loss.sum() + policy_loss.sum() + 0.5 * value_loss.sum()).backward() ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions()
def train(rank, args, shared_model, optimizer): ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = create_env(args.env, args) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr) env.seed(args.seed + rank) player = Agent(None, env, args, None) player.gpu_id = gpu_id if args.model == 'MLP': player.model = A3C_MLP(player.env.observation_space.shape[0], player.env.action_space, args.stack_frames) if args.model == 'CONV': player.model = A3C_CONV(args.stack_frames, player.env.action_space) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() while True: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) if player.done: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.cx = Variable(torch.zeros(1, 128).cuda()) player.hx = Variable(torch.zeros(1, 128).cuda()) else: player.cx = Variable(torch.zeros(1, 128)) player.hx = Variable(torch.zeros(1, 128)) else: player.cx = Variable(player.cx.data) player.hx = Variable(player.hx.data) for step in range(args.num_steps): player.action_train() if player.done: break if player.done: player.eps_len = 0 state = player.env.reset() player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() if gpu_id >= 0: with torch.cuda.device(gpu_id): R = torch.zeros(1, 1).cuda() else: R = torch.zeros(1, 1) if not player.done: state = player.state if args.model == 'CONV': state = state.unsqueeze(0) value, _, _, _ = player.model( (Variable(state), (player.hx, player.cx))) R = value.data player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 R = Variable(R) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = torch.zeros(1, 1).cuda() else: gae = torch.zeros(1, 1) for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R - player.values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion # print(player.rewards[i]) delta_t = player.rewards[i] + args.gamma * \ player.values[i + 1].data - player.values[i].data gae = gae * args.gamma * args.tau + delta_t policy_loss = policy_loss - \ (player.log_probs[i].sum() * Variable(gae)) - \ (0.01 * player.entropies[i].sum()) player.model.zero_grad() (policy_loss + 0.5 * value_loss).backward() ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions()
def act(self): self.model.load_state_dict(self.shared_model.state_dict()) self.model.train() log_probs, entropies, rewards, values = [], [], [], [] for _ in range(self.t_max): pout, vout = self.model.pi_and_v(self.state_var) reward = self.env.receive_action(pout.action_indices[0]) if self.clip_reward: reward = np.clip(reward, -1, 1) log_probs.append(pout.sampled_actions_log_probs) entropies.append(pout.entropy) values.append(vout) rewards.append(reward) if self.env.is_terminal: break self.update_state() R = 0 if not self.env.is_terminal: _, vout = self.model.pi_and_v(self.state_var, keep_same_state=True) R = float(vout.data.numpy()) else: self.env.reset() self.model.reset_state() self.update_state() t = len(rewards) pi_loss, v_loss = 0, 0 for i in reversed(range(t)): R = self.gamma*R + rewards[i] v = values[i] advantage = R - float(v.data.numpy()[0, 0]) # Accumulate gradients of policy log_prob = log_probs[i] entropy = entropies[i] # Log probability is increased proportionally to advantage pi_loss -= log_prob * advantage # Entropy is maximized pi_loss -= self.beta * entropy # Accumulate gradients of value function v_loss += (v - R).pow(2).div_(2) if self.pi_loss_coef != 1.0: pi_loss *= self.pi_loss_coef if self.v_loss_coef != 1.0: v_loss *= self.v_loss_coef # Normalize the loss of sequences truncated by terminal states if self.keep_loss_scale_same and t < self.t_max: factor = self.t_max / t pi_loss *= factor v_loss *= factor total_loss = pi_loss + v_loss # Compute gradients using thread-specific model self.optimizer.zero_grad() total_loss.backward() torch.nn.utils.clip_grad_norm(self.model.parameters(), 40) # Copy the gradients to the globally shared model ensure_shared_grads(self.model, self.shared_model) self.optimizer.step() self.model.unchain_backward() return t
def train(rank, args, shared_model, optimizer, env_conf): torch.manual_seed(args.seed + rank) env = atari_env(args.env, env_conf) model = A3Clstm(env.observation_space.shape[0], env.action_space) _ = env.reset() action = env.action_space.sample() _, _, _, info = env.step(action) start_lives = info['ale.lives'] if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr) model.train() env.seed(args.seed + rank) state = env.reset() state = torch.from_numpy(state).float() done = True episode_length = 0 current_life = start_lives while True: episode_length += 1 # Sync with the shared model model.load_state_dict(shared_model.state_dict()) if done: cx = Variable(torch.zeros(1, 512)) hx = Variable(torch.zeros(1, 512)) else: cx = Variable(cx.data) hx = Variable(hx.data) values = [] log_probs = [] rewards = [] entropies = [] for step in range(args.num_steps): value, logit, (hx, cx) = model( (Variable(state.unsqueeze(0)), (hx, cx))) prob = F.softmax(logit) log_prob = F.log_softmax(logit) entropy = -(log_prob * prob).sum(1) entropies.append(entropy) action = prob.multinomial().data log_prob = log_prob.gather(1, Variable(action)) state, reward, done, info = env.step(action.numpy()) done = done or episode_length >= args.max_episode_length if args.count_lives: if current_life > info['ale.lives']: done = True else: current_life = info['ale.lives'] reward = max(min(reward, 1), -1) if done: episode_length = 0 current_life = start_lives state = env.reset() state = torch.from_numpy(state).float() values.append(value) log_probs.append(log_prob) rewards.append(reward) if done: break R = torch.zeros(1, 1) if not done: value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx))) R = value.data values.append(Variable(R)) policy_loss = 0 value_loss = 0 R = Variable(R) gae = torch.zeros(1, 1) for i in reversed(range(len(rewards))): R = args.gamma * R + rewards[i] advantage = R - values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion delta_t = rewards[i] + args.gamma * \ values[i + 1].data - values[i].data gae = gae * args.gamma * args.tau + delta_t policy_loss = policy_loss - \ log_probs[i] * Variable(gae) - 0.01 * entropies[i] optimizer.zero_grad() (policy_loss + 0.5 * value_loss).backward() torch.nn.utils.clip_grad_norm(model.parameters(), 40) ensure_shared_grads(model, shared_model) optimizer.step()
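# The return/GAE recursion that appears inline in most of these loops,
# factored out as a standalone sketch for clarity (not part of any of the
# repositories quoted here; values must hold one more entry than rewards,
# namely the bootstrap value appended before the loop):
import torch

def a3c_losses(rewards, values, log_probs, entropies,
               gamma=0.99, tau=1.0, entropy_coef=0.01):
    policy_loss, value_loss = 0, 0
    R = values[-1].data
    gae = torch.zeros(1, 1)
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]                    # n-step return
        advantage = R - values[i]
        value_loss = value_loss + 0.5 * advantage.pow(2)
        # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t);
        # gae accumulates (gamma * tau)^k * delta_{t+k}
        delta_t = rewards[i] + gamma * values[i + 1].data - values[i].data
        gae = gae * gamma * tau + delta_t
        policy_loss = policy_loss - log_probs[i] * gae - entropy_coef * entropies[i]
    return policy_loss, value_loss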
def train(shared_model, optimizer, rank, global_steps, args): setproctitle('{}:train[{}]'.format(args.name, rank)) torch.manual_seed(args.seed + rank) torch.cuda.manual_seed(args.seed + rank) env = create_env(args.game_type, args.env_name, 'train:{}'.format(rank), args.remotes[rank]) env._max_episode_steps = args.max_episode_length env.seed(args.seed + rank) model = copy.deepcopy(shared_model) gpu_id = args.gpu_ids[rank] with torch.cuda.device(gpu_id): model = model.cuda() if gpu_id >= 0 else model model.train() optimizer = optimizer or optim.Adam(shared_model.parameters(), lr=args.lr) done = True try: while True: # Sync with the shared model with torch.cuda.device(gpu_id): model.load_state_dict(shared_model.state_dict()) if done: with torch.cuda.device(gpu_id): state = torch.from_numpy(env.reset()).float() state = state.cuda() if gpu_id >= 0 else state model.reset() values, log_probs, rewards, entropies = [], [], [], [] for step in range(args.n_steps): with global_steps.get_lock(): global_steps.value += 1 value, logit = model(Variable(state.unsqueeze(0))) prob = F.softmax(logit) log_prob = F.log_softmax(logit) entropy = -(log_prob * prob).sum(1) action = prob.multinomial().data log_prob = log_prob.gather(1, Variable(action)) raw_state, reward, done, _ = env.step(action.cpu().numpy()) reward = max(min(reward, args.max_reward), args.min_reward) values.append(value) log_probs.append(log_prob) rewards.append(reward) entropies.append(entropy) if done: break state = state.copy_(torch.from_numpy(raw_state).float()) R = state.new().resize_((1, 1)).zero_() if not done: value, _ = model(Variable(state.unsqueeze(0), volatile=True), keep_same_state=True) R = value.data values.append(Variable(R)) policy_loss, value_loss = 0, 0 R = Variable(R) gae = state.new().resize_((1, 1)).zero_() for i in reversed(range(len(rewards))): R = args.gamma * R + rewards[i] advantage = R - values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion delta_t = rewards[i] + args.gamma * values[ i + 1].data - values[i].data gae = gae * args.gamma * args.tau + delta_t policy_loss = policy_loss - log_probs[i] * Variable( gae) - 0.01 * entropies[i] model.zero_grad() (policy_loss + 0.5 * value_loss).backward() torch.nn.utils.clip_grad_norm(model.parameters(), 40) ensure_shared_grads(model, shared_model, gpu=gpu_id >= 0) optimizer.step() model.detach() if global_steps.value >= args.max_global_steps: break except Exception as e: raise finally: print('Trainer [{}] finished !'.format(rank))
def train_rep(args, shared_model, env_conf):
    batch_size = 16
    train_times = args.rep_train_time
    trace = []
    td_class = [(0, 1), (1, 2), (2, 3), (3, 5), (5, 7), (7, 9)]
    loss_fn = nn.CrossEntropyLoss()
    optimizer_r = Adam(shared_model.r_net.parameters(), lr=args.rl_r)
    optimizer_c = Adam(shared_model.c_net.parameters(), lr=args.rl_r)
    ptitle('Train rep')
    gpu_id = args.gpu_ids[-1]
    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = atari_env(args.env, env_conf, args)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
            # player.model.r_net = player.model.r_net.cuda()
            # player.model.c_net = player.model.c_net.cuda()
    flag = True
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.train()
            flag = False
        player.action_test()
        trace.append(player.state)
        if len(trace) > args.trace_length:
            # train the representation for a few hundred iterations
            for _ in range(train_times):
                range_c = np.random.randint(0, len(td_class))
                TD = np.random.randint(td_class[range_c][0],
                                       td_class[range_c][1])
                begin = np.random.randint(0, len(trace) - TD - batch_size)
                former = torch.stack(trace[begin:begin + batch_size], dim=0)
                latter = torch.stack(trace[begin + TD:begin + TD + batch_size],
                                     dim=0)
                target = torch.zeros(batch_size, dtype=torch.long) + range_c
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        former = former.cuda()
                        latter = latter.cuda()
                        target = target.cuda()
                rep_f, rep_l = player.model.r_net(former), player.model.r_net(
                    latter)
                output = player.model.c_net(rep_f, rep_l, False)
                loss = loss_fn(output, target)
                optimizer_r.zero_grad()
                optimizer_c.zero_grad()
                loss.backward()
                ensure_shared_grads(player.model.r_net, shared_model.r_net,
                                    gpu=gpu_id >= 0)
                ensure_shared_grads(player.model.c_net, shared_model.c_net,
                                    gpu=gpu_id >= 0)
                optimizer_r.step()
                optimizer_c.step()
            trace = []
        if player.done and not player.info:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            flag = True
            state = player.env.reset()
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
def train(rank, args, shared_model, optimizer, env_conf, iters, checkpoint_path): iters = dill.loads(iters) if args.enable_gavel_iterator and rank == 0: iters._init_logger() ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = atari_env(args.env, env_conf, args) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) env.seed(args.seed + rank) player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() player.eps_len += 2 elapsed_time = 0 start_time = time.time() for i in iters: if i % 100 == 0: print('GPU %d finished step %d' % (rank, i), flush=True) if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) if player.done: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.cx = Variable(torch.zeros(1, 512).cuda()) player.hx = Variable(torch.zeros(1, 512).cuda()) else: player.cx = Variable(torch.zeros(1, 512)) player.hx = Variable(torch.zeros(1, 512)) else: player.cx = Variable(player.cx.data) player.hx = Variable(player.hx.data) for step in range(args.num_steps): player.action_train() if player.done: break if player.done: state = player.env.reset() player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() R = torch.zeros(1, 1) if not player.done: value, _, _ = player.model( (Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) R = value.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R - player.values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion delta_t = player.rewards[i] + args.gamma * \ player.values[i + 1].data - player.values[i].data gae = gae * args.gamma * args.tau + delta_t policy_loss = policy_loss - \ player.log_probs[i] * \ Variable(gae) - 0.01 * player.entropies[i] player.model.zero_grad() (policy_loss + 0.5 * value_loss).backward() ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions() elapsed_time += time.time() - start_time start_time = time.time() if (args.throughput_estimation_interval is not None and i % args.throughput_estimation_interval == 0 and rank == 0): print('[THROUGHPUT_ESTIMATION]\t%s\t%d' % (time.time(), i)) if (args.max_duration is not None and elapsed_time >= args.max_duration): break if args.enable_gavel_iterator and rank == 0: state = shared_model.state_dict() iters.save_checkpoint(state, checkpoint_path) iters.complete()
def trainocpg(rank, args, shared_model, optimizer, env_conf): ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = OC_env(args.env) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) env.seed(args.seed + rank) player = OCPGAgent(None, env, args, None) player.gpu_id = gpu_id player.model = OCPGModel(player.env.observation_space.shape[0], player.env.action_space, args.options, args.width) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() player.eps_len += 2 threshold = 0 EnvNumSteps = 0 reward_mean = 0. while True: if EnvNumSteps > threshold: threshold += 5000 print("thread:", rank, "steps:", EnvNumSteps) if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) if player.done: ### add in option selection part q, logito = player.model(Variable(player.state)) probo = F.softmax(logito, dim=1) player.otensor = probo.multinomial(1).data player.o = player.otensor.numpy()[0][0] else: player.o = player.o for step in range(args.num_steps): EnvNumSteps += 1 player.action_train() if player.done: break if player.done: state = player.env.reset() player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() R = torch.zeros(1, 1) # if not player.done: q, logito = player.model(Variable(player.state)) v = q.max(-1)[0] R = v.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(Variable(R)) policy_loss = torch.zeros(1, 1) value_loss = torch.zeros(1, 1) phi_loss = torch.zeros(1, 1) gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) thesize = len(player.rewards) reward_sum = sum(player.rewards) reward_mean = reward_mean + (reward_sum - thesize * reward_mean) / EnvNumSteps JPi = Variable(torch.tensor(reward_mean)) for i in reversed(range(len(player.rewards))): before = R R = args.gamma * R + player.rewards[i] - JPi difference = R - player.qs[i] if i + 1 < thesize: difference2 = before - player.values[i + 1] else: NextQ, NextLogito = player.model(Variable(player.state)) NextTerm = player.model.getTermination(Variable(player.state), player.o) NextProbso = F.softmax(NextLogito, dim=1) ### select new option otensor = NextProbso.multinomial(1).data NextLog_probso = F.log_softmax(NextLogito, dim=1) NextValue = NextQ.max(-1)[0] NextQ = NextQ[0][otensor.numpy()[0][0]] NextEntropyso = -(NextLog_probso * NextProbso).sum(1) NextLog_probso = NextLog_probso.gather(1, Variable(otensor)) difference2 = before - NextValue value_loss = value_loss + 0.5 * difference.pow(2) policy_loss = policy_loss - player.log_probs[i] * Variable( difference.data) - 0.1 * player.entropies[i] if i + 1 < thesize: beta = player.termprobs[i + 1].data policy_loss = policy_loss - args.gamma * beta * player.log_probso[ i + 1] * Variable( difference2.data) - 0.1 * player.entropieso[i + 1] ###!!!!! 
termination update advantage = player.qs[i + 1].data - player.values[ i + 1].data + args.delib phi_loss = phi_loss + args.gamma * player.termprobs[ i + 1] * Variable(advantage, requires_grad=False) else: beta = NextTerm.data policy_loss = policy_loss - args.gamma * beta * NextLog_probso * Variable( difference2.data) - 0.1 * NextEntropyso ###!!!!! termination update advantage = NextQ.data - NextValue.data + args.delib phi_loss = phi_loss + args.gamma * NextTerm * Variable( advantage, requires_grad=False) player.model.zero_grad() (phi_loss.sum() + policy_loss.sum() + 0.5 * value_loss.sum()).backward() ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions() if str(rank) == "1": fullname = args.save_model_dir + args.env + str(rank) + ".torch" tmpname = args.save_model_dir + args.env + str(rank) + ".tmp" torch.save(optimizer.state_dict(), tmpname) #optimizer.state_dict() os.rename(tmpname, fullname)
def train(rank, reward_type, args, shared_model, optimizer, env_conf): log = {} setup_logger('{}_log'.format(args.env), r'{0}{1}_log'.format(args.log_dir, args.env)) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed + rank) env = atari_env(args.env, env_conf) env.seed(args.seed + rank) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None, reward_type) player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() player.model.train() for i in itertools.count(): if i % 10 == 0: print("reward type {0}, iter {1}".format(reward_type, i)) player.model.load_state_dict(shared_model.state_dict()) for step in range(args.num_steps): player.action_train() reward_sum += player.reward if args.count_lives: player.check_state() if player.done: break if player.done: num_tests += 1 player.current_life = 0 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}" .format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean)) player.eps_len = 0 player.current_life = 0 state = player.env.reset() player.state = torch.from_numpy(state).float() R = torch.zeros(1, 1) if not player.done: value, _, _ = player.model( (Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) R = value.data player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 R = Variable(R) gae = torch.zeros(1, 1) for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R - player.values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion delta_t = player.rewards[i] + args.gamma * \ player.values[i + 1].data - player.values[i].data gae = gae * args.gamma * args.tau + delta_t policy_loss = policy_loss - \ player.log_probs[i] * \ Variable(gae) - 0.01 * player.entropies[i] optimizer.zero_grad() (policy_loss + 0.5 * value_loss).backward() torch.nn.utils.clip_grad_norm(player.model.parameters(), 40) ensure_shared_grads(player.model, shared_model) optimizer.step() player.clear_actions()
def train(rank, args, shared_model, optimizer, env_conf, shared_counter, targ_shared): ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] device = torch.device('cuda:{}'.format(gpu_id) if gpu_id >= 0 else 'cpu') torch.manual_seed(args.seed + rank) torch.cuda.manual_seed(args.seed + rank) env = atari_env(args.env, env_conf, args) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) env.seed(args.seed + rank) player = Agent(None, env, args, None, gpu_id=gpu_id) player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.model.apply(weights_init) player.state = player.env.reset() player.state = torch.from_numpy(player.state).to(torch.float32) player.state = player.state.to(device) player.model = player.model.to(device) #player.targ_model = copy.deepcopy(player.model) player.model.train() #player.targ_model.eval() player.eps_len += 2 while True: player.model.load_state_dict(shared_model.state_dict()) #player.targ_model.load_state_dict(targ_shared.state_dict()) if player.done: player.cx = torch.zeros(1, 512).to(device) player.hx = torch.zeros(1, 512).to(device) #player.targ_cx = copy.deepcopy(player.cx).detach() #player.targ_hx = copy.deepcopy(player.hx).detach() else: player.cx = player.cx.detach() player.hx = player.hx.detach() for step in range(args.num_steps): player.action_train() if player.done: break if player.done: state = player.env.reset() player.state = torch.from_numpy(state).to(torch.float32) player.state = player.state.to(device) #alpha = player.model.log_alpha.exp().detach() alpha = .01 #alpha = 0 x_R = torch.zeros(1, 1) if not player.done: with torch.no_grad(): action, value, logit, q_value, _ = player.model( (player.state.unsqueeze(0), (player.hx, player.cx))) x_R = q_value[1].detach() - alpha * F.log_softmax( logit, -1).gather(-1, action) x_R = x_R.to(device) policy_loss = 0 adv_gae_loss = 0 for i in reversed(range(len(player.rewards))): x_R = args.gamma * x_R + player.rewards[i] adv_gae_loss = adv_gae_loss + (player.tra_adv_gae[i][1] - x_R.detach()).pow(2) * .5 #policy_loss = policy_loss - player.log_probs[i] * player.tra_adv_gae[i][0].detach() + alpha * player.log_probs[i] * player.log_probs[i].detach() policy_loss = policy_loss - (F.softmax( player.values[i], -1) * player.tra_adv_gae[i][0].detach()).sum( -1) - alpha * player.entropies[i].unsqueeze(-1) #policy_loss = policy_loss - player.log_probs[i] * (x_R - (F.softmax(player.values[i], -1) * # player.tra_adv_gae[i][0]).sum(-1) - alpha * player.entropies[i]).detach() + alpha * player.log_probs[i] * player.log_probs[i].detach() #prob = F.softmax(player.values[i], -1) #ent_alpha = alpha * player.entropies[i].unsqueeze(-1) #advs = (player.tra_adv_gae[i][0] - # ((player.tra_adv_gae[i][0] * prob).sum(-1, True) + # ent_alpha)).detach() #policy_loss = policy_loss - (prob * advs).sum(-1) - ent_alpha x_R = x_R - alpha * player.log_probs[i].detach() player.model.zero_grad() (policy_loss + .5 * adv_gae_loss).backward(retain_graph=False) ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions() with shared_counter.get_lock(): shared_counter.value += len(player.rewards) if shared_counter.value > args.interact_steps: break
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = Environment()  # create the environment
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    # env.seed(args.seed + rank)
    player = Agent(None, env, args, None)  # create the agent
    player.gpu_id = gpu_id
    num_actions = env.get_num_actions()
    player.model = A3Clstm(
        Config.STACKED_FRAMES,  # A3C model
        num_actions)
    player.state, available = player.env.reset()  # reset the environment
    player.state = torch.from_numpy(player.state).float()
    player.available = torch.from_numpy(available).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
            player.available = player.available.cuda()
    player.model.train()  # training mode
    player.eps_len += 1
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())  # sync with the shared network
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))  # episode finished: reinitialize the LSTM state
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):  # T-max = 20
            player.action_train()
            if player.done:
                break

        if player.done:
            state, available = player.env.reset()
            player.state = torch.from_numpy(state).float()
            player.available = torch.from_numpy(available).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
                    player.available = player.available.cuda()

        R = torch.zeros(1, 1)  # if done: R_t-max = 0
        if not player.done:
            value, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)),
                 (player.hx, player.cx)))
            R = value.data  # R_t-max = V(s)
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
def train_rollout(self, total_step):
    storage = Storage(self.episode_C['rollout_length'])
    state = self.env._copy_state(*self.state)
    step_times = []
    # Sync.
    self.gnn.load_state_dict(self.shared_gnn.state_dict())
    for rollout_step in range(self.episode_C['rollout_length']):
        start_step_time = time.time()
        prediction = self.env.propagate(self.gnn, [state])
        action = prediction['a'].cpu().numpy()[0]
        next_state, reward, done, achieved_goal = self.env.step(action, self.ep_step, state)
        self.ep_step += 1
        if done:
            # Sync local model with shared model at start of each ep
            self.gnn.load_state_dict(self.shared_gnn.state_dict())
            self.ep_step = 0
        storage.add(prediction)
        storage.add({'r': tensor(reward, self.device).unsqueeze(-1).unsqueeze(-1),
                     'm': tensor(1 - done, self.device).unsqueeze(-1).unsqueeze(-1),
                     's': state})
        state = self.env._copy_state(*next_state)
        total_step += 1
        end_step_time = time.time()
        step_times.append(end_step_time - start_step_time)
    self.state = self.env._copy_state(*state)
    prediction = self.env.propagate(self.gnn, [state])
    storage.add(prediction)
    storage.placeholder()

    advantages = tensor(np.zeros((1, 1)), self.device)
    returns = prediction['v'].detach()
    for i in reversed(range(self.episode_C['rollout_length'])):
        # Disc. return
        returns = storage.r[i] + self.agent_C['discount'] * storage.m[i] * returns
        # GAE
        td_error = storage.r[i] + self.agent_C['discount'] * storage.m[i] * storage.v[i + 1] - storage.v[i]
        advantages = advantages * self.agent_C['gae_tau'] * self.agent_C['discount'] * storage.m[i] + td_error
        storage.adv[i] = advantages.detach()
        storage.ret[i] = returns.detach()
        # print(returns.shape, td_error.shape, advantages.shape, storage.adv[-1].shape, storage.ret[-1].shape)

    actions, log_probs_old, returns, advantages = storage.cat(['a', 'log_pi_a', 'ret', 'adv'])
    states = [storage.s[i] for i in range(storage.size)]
    actions = actions.detach()
    log_probs_old = log_probs_old.detach()
    advantages = (advantages - advantages.mean()) / advantages.std()

    # Train
    self.gnn.train()
    batch_times = []
    train_pred_times = []
    for _ in range(self.agent_C['optimization_epochs']):
        # Sync. at start of each epoch
        self.gnn.load_state_dict(self.shared_gnn.state_dict())
        sampler = random_sample(np.arange(len(states)), self.agent_C['minibatch_size'])
        for batch_indices in sampler:
            start_batch_time = time.time()
            batch_indices_tensor = tensor(batch_indices, self.device).long()
            # Important note: these are tensors but don't carry a grad
            sampled_states = [states[i] for i in batch_indices]
            sampled_actions = actions[batch_indices_tensor]
            sampled_log_probs_old = log_probs_old[batch_indices_tensor]
            sampled_returns = returns[batch_indices_tensor]
            sampled_advantages = advantages[batch_indices_tensor]

            start_pred_time = time.time()
            prediction = self.env.propagate(self.gnn, sampled_states, sampled_actions)
            end_pred_time = time.time()
            train_pred_times.append(end_pred_time - start_pred_time)

            # Calc. loss
            ratio = (prediction['log_pi_a'] - sampled_log_probs_old).exp()
            obj = ratio * sampled_advantages
            obj_clipped = ratio.clamp(1.0 - self.agent_C['ppo_ratio_clip'],
                                      1.0 + self.agent_C['ppo_ratio_clip']) * sampled_advantages
            # policy loss and value loss are scalars
            policy_loss = -torch.min(obj, obj_clipped).mean() - self.agent_C['entropy_weight'] * prediction['ent'].mean()
            value_loss = self.agent_C['value_loss_coef'] * (sampled_returns - prediction['v']).pow(2).mean()

            self.opt.zero_grad()
            (policy_loss + value_loss).backward()
            if self.agent_C['clip_grads']:
                nn.utils.clip_grad_norm_(self.gnn.parameters(), self.agent_C['gradient_clip'])
            ensure_shared_grads(self.gnn, self.shared_gnn)
            self.opt.step()
            end_batch_time = time.time()
            batch_times.append(end_batch_time - start_batch_time)
    self.gnn.eval()
    return total_step, np.array(step_times).mean(), np.array(batch_times).mean(), np.array(train_pred_times).mean()
def trainac(rank, args, shared_model, optimizer, env_conf): ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = OC_env(args.env) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) env.seed(args.seed + rank) player = ACAgent(None, env, args, None) player.gpu_id = gpu_id player.model = ACModel(player.env.observation_space.shape[0], player.env.action_space, args.options, args.width) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() player.eps_len += 2 threshold = 0 EnvNumSteps = 0 reward_mean = 0. while True: if EnvNumSteps > threshold: threshold += 5000 print("thread:", rank, "steps:", EnvNumSteps) if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) for step in range(args.num_steps): EnvNumSteps += 1 player.action_train() if player.done: break if player.done: state = player.env.reset() player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() R = torch.zeros(1, 1) if not player.done: q, logit = player.model(Variable(player.state)) v = q.max(-1)[0] R = v.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(Variable(R)) policy_loss = torch.zeros(1, 1) value_loss = torch.zeros(1, 1) gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) thesize = len(player.rewards) reward_sum = sum(player.rewards) reward_mean = reward_mean + (reward_sum - thesize * reward_mean) / EnvNumSteps for i in reversed(range(len(player.rewards))): before = R R = args.gamma * R + player.rewards[i] difference = R - player.qs[i] advantage = R - player.values[i] value_loss = value_loss + 0.5 * difference.pow(2) policy_loss = policy_loss - player.log_probs[i] * Variable( advantage.data) - 0.1 * player.entropies[i] player.model.zero_grad() (policy_loss.sum() + 0.5 * value_loss.sum()).backward() ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions() if str(rank) == "1": fullname = args.save_model_dir + args.env + str(rank) + ".torch" tmpname = args.save_model_dir + args.env + str(rank) + ".tmp" torch.save(optimizer.state_dict(), tmpname) #optimizer.state_dict() os.rename(tmpname, fullname)
def train(rank, args, shared_model, optimizer, env_conf, num_tau_samples=32, num_tau_prime_samples=32, kappa=1.0, num_quantiles=32): ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = atari_env(args.env, env_conf, args) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam( shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) env.seed(args.seed + rank) player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() player.eps_len += 2 while True: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) if player.done: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.cx = Variable(torch.zeros(1, 512).cuda()) player.hx = Variable(torch.zeros(1, 512).cuda()) else: player.cx = Variable(torch.zeros(1, 512)) player.hx = Variable(torch.zeros(1, 512)) else: player.cx = Variable(player.cx.data) player.hx = Variable(player.hx.data) for step in range(args.num_steps): player.action_train() if player.done: break if player.done: state = player.env.reset() player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() R = torch.zeros(1,num_tau_prime_samples) if not player.done: logit, _, _ = player.model((Variable( player.state.unsqueeze(0)), (player.hx, player.cx))) q_vals = torch.mean(logit,0) _, action = torch.max(q_vals,0) logit, _, _ = player.model((Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) R = logit[:,action] if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() #R = R.detach() R = Variable(R) value_loss = 0 for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R.repeat(num_tau_samples,1) - player.logits_array[i].repeat(1, num_tau_prime_samples) #print("Ad: ",advantage) loss = (torch.abs(advantage) <= kappa).float() * 0.5 * advantage ** 2 #print("loss: ",loss.sum(0).sum(0), loss) loss += (torch.abs(advantage) > kappa).float() * kappa * (torch.abs(advantage) - 0.5 * kappa) #print("loss: ",loss.sum(0).sum(0), loss) step_loss = torch.abs(player.quantiles_array[i].cuda() - (advantage.detach()<0).float()) * loss/kappa value_loss += step_loss.sum(0).mean(0) player.model.zero_grad() value_loss.backward() ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions()
def train(rank, args, shared_model, optimizer, optimizer_r, env_conf, lock, counter):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
                    player.hx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
            else:
                player.cx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
                player.hx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
        else:
            player.cx = [
                Variable(player.cx[0].data),
                Variable(player.cx[1].data)
            ]
            player.hx = [
                Variable(player.hx[0].data),
                Variable(player.hx[1].data)  # was player.cx[1].data, which mixed cell and hidden state
            ]
        # Check whether updates to r_net have leaked into this copy of the model:
        # ps = list(player.model.r_net.named_parameters())
        # n, v = ps[6]
        # print(v.sum())
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break
        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)),
                 (player.hx[0], player.cx[0]),
                 (player.hx[1], player.cx[1])))
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]
        with lock:
            counter.value += 1
        # Update r_net with its own optimizer and loss weighting.
        player.model.r_net.zero_grad()
        (args.actor_weight * policy_loss +
         (1 - args.actor_weight) * value_loss).backward(retain_graph=True)
        ensure_shared_grads(player.model.r_net, shared_model.r_net, gpu=gpu_id >= 0)
        optimizer_r.step()
        # Update the full model; clear r_net grads so they are not applied twice.
        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        player.model.r_net.zero_grad()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
def train(rank, args, shared_model, optimizer, env_conf):
    torch.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.model = A3Clstm(
        player.env.observation_space.shape[0], player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()
    while True:
        player.model.load_state_dict(shared_model.state_dict())
        for step in range(args.num_steps):
            player.action_train()
            if args.count_lives:
                player.check_state()
            if player.done:
                break
        if player.done:
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]
        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.clear_actions()
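# Every worker in this file copies its local gradients onto the shared (hogwild-style)
# model via `ensure_shared_grads` before stepping the shared optimizer, but the helper
# itself is not defined here. The sketch below is a minimal, assumed implementation of
# that common A3C pattern; the exact signature and CPU/GPU handling of the real helper
# in these codebases may differ.
def ensure_shared_grads(model, shared_model, gpu=False):
    """Copy each local gradient into the matching shared parameter's .grad slot."""
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            # On CPU the shared grads only need to be linked once; later calls reuse them.
            return
        if not gpu:
            shared_param._grad = param.grad
        else:
            # When the worker runs on GPU, move its gradients back to the CPU copy.
            shared_param._grad = param.grad.cpu()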
def train_func(rank, args, shared_model, optimizer, env_conf, datasets=None,
               shared_dict=None):
    if args.deploy:
        return
    ptitle('Train {0}'.format(rank))
    print('Start training agent: ', rank)
    if rank == 0:
        logger = Logger(args.log_dir[:-1] + '_losses/')
        train_step = 0
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    env_conf["env_gpu"] = gpu_id
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    raw_list, gt_lbl_list = datasets
    env = EM_env(raw_list, env_conf, type="train",
                 gt_lbl_list=gt_lbl_list, seed=args.seed + rank)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = get_model(args, args.model, env.observation_space.shape,
                             args.features, atrous_rates=args.atr_rate,
                             num_actions=2, split=args.data_channel,
                             gpu_id=gpu_id, multi=args.multi)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    if rank == 0:
        eps_reward = 0
        pinned_eps_reward = 0
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            player.eps_len = 0
            if rank == 0:
                if train_step % args.train_log_period == 0 and train_step > 0:
                    print("train: step", train_step, "\teps_reward", eps_reward)
                if train_step > 0:
                    pinned_eps_reward = player.env.sum_reward.mean()
                    eps_reward = 0
            if args.lstm_feats:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        player.cx, player.hx = player.model.lstm.init_hidden(
                            batch_size=1, use_cuda=True)
                else:
                    player.cx, player.hx = player.model.lstm.init_hidden(
                        batch_size=1, use_cuda=False)
        elif args.lstm_feats:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)
        for step in range(args.num_steps):
            if rank < args.lbl_agents:
                player.action_train(use_lbl=True)
            else:
                player.action_train()
            if rank == 0:
                eps_reward = player.env.sum_reward.mean()
            if player.done:
                break
        if player.done:
            state = player.env.reset(player.model, gpu_id)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        if "3D" in args.data:
            R = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1],
                            env_conf["size"][2])
        else:
            R = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1])
        if args.lowres:
            R = torch.zeros(1, 1, env_conf["size"][0] // 2,
                            env_conf["size"][1] // 2)
        if not player.done:
            if args.lstm_feats:
                value, _, _ = player.model(
                    (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            else:
                value, _ = player.model(Variable(player.state.unsqueeze(0)))
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        if "3D" in args.data:
            gae = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1],
                              env_conf["size"][2])
        else:
            gae = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1])
        if args.rew_drop:
            keep_map = torch.tensor(player.env.keep_map)
        if args.lowres:
            gae = torch.zeros(1, 1, env_conf["size"][0] // 2,
                              env_conf["size"][1] // 2)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
                if args.rew_drop:
                    keep_map = keep_map.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    reward_i = torch.tensor(player.rewards[i]).cuda()
            else:
                reward_i = torch.tensor(player.rewards[i])
            R = args.gamma * R + reward_i
            if args.rew_drop:
                advantage = R - player.values[i]
                value_loss = value_loss + (0.5 * advantage * advantage * keep_map).mean()
                delta_t = player.values[i + 1].data * args.gamma + reward_i - \
                    player.values[i].data
                gae = gae * args.gamma * args.tau + delta_t
            else:
                advantage = R - player.values[i]
                value_loss = value_loss + (0.5 * advantage * advantage).mean()
                delta_t = player.values[i + 1].data * args.gamma + reward_i - \
                    player.values[i].data
                gae = gae * args.gamma * args.tau + delta_t
            if args.noisy:
                policy_loss = policy_loss - \
                    (player.log_probs[i] * Variable(gae)).mean()
            else:
                if args.rew_drop:
                    policy_loss = policy_loss - \
                        (player.log_probs[i] * Variable(gae) * keep_map).mean() - \
                        (args.entropy_alpha * player.entropies[i] * keep_map).mean()
                else:
                    policy_loss = policy_loss - \
                        (player.log_probs[i] * Variable(gae)).mean() - \
                        (args.entropy_alpha * player.entropies[i]).mean()
        player.model.zero_grad()
        sum_loss = policy_loss + value_loss
        sum_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
        if args.wctrl == "s2m":
            player.env.config["spl_w"] = shared_dict["spl_w"]
            player.env.config["mer_w"] = shared_dict["mer_w"]
        if rank == 0:
            train_step += 1
            if train_step % args.log_period == 0 and train_step > 0:
                log_info = {
                    'train: value_loss': value_loss,
                    'train: policy_loss': policy_loss,
                    'train: eps reward': pinned_eps_reward,
                }
                if "EX" in args.model:
                    log_info["cell_prob_loss"] = cell_prob_loss
                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, train_step)
def train(rank, args, shared_model, optimizer, env_conf):
    start_time = time.time()
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    if 'micropolis' in args.env.lower():
        env = micropolis_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if 'micropolis' in args.env.lower():
        modelInit = getattr(model, args.design_head)
        player.model = modelInit(player.env.observation_space.shape[0],
                                 player.env.action_space,
                                 player.env.env.env.MAP_X)
        player.lstm_sizes = player.model.getMemorySizes()
    else:
        player.model = A3Clstm(player.env.observation_space.shape[0],
                               player.env.action_space)
    lstm_size = 512
    if 'micropolis' in args.env.lower():
        if 'arcade' not in args.env.lower():
            lstm_size = (1, 16, env.env.env.MAP_X, env.env.env.MAP_Y)
    player.lstm_size = lstm_size
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    log_counter = 0
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        num_lstm_layers = len(player.lstm_sizes)
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = [
                        Variable(torch.zeros(player.lstm_sizes[i]).cuda())
                        for i in range(num_lstm_layers)
                    ]
                    player.hx = [
                        Variable(torch.zeros(player.lstm_sizes[i]).cuda())
                        for i in range(num_lstm_layers)
                    ]
            else:
                player.cx = [
                    Variable(torch.zeros(player.lstm_sizes[i]))  # was the undefined name lstm_sizes[i]
                    for i in range(num_lstm_layers)
                ]
                player.hx = [
                    Variable(torch.zeros(player.lstm_sizes[i]))
                    for i in range(num_lstm_layers)
                ]
        else:
            player.cx = [
                Variable(player.cx[i].data) for i in range(num_lstm_layers)
            ]
            player.hx = [
                Variable(player.hx[i].data) for i in range(num_lstm_layers)
            ]
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break
        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if args.randomize_exploration:
                player.certainty = np.random.uniform(0.5, 1.5)
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        R = torch.zeros(1, 1)
        if not player.done:
            values, logit, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            if values.size()[1] == 1:
                value = values
            else:
                prob = torch.nn.functional.softmax(logit, dim=1)
                action = prob.multinomial(1).data
                value = values[0][action]
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = torch.zeros(1, 1).cuda()
                R = Variable(R).cuda()
        else:
            gae = torch.zeros(1, 1)  # gae was left uninitialized on the CPU path
            R = Variable(R)
        player.values.append(R)
        policy_loss = 0
        value_loss = 0
        for i in reversed(range(len(player.rewards))):
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.rewards[i] = torch.Tensor([player.rewards[i]]).cuda()
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    gae = Variable(gae.cuda())
            else:
                gae = Variable(gae)
            policy_loss = policy_loss - \
                player.log_probs[i] * Variable(gae) - 0.01 * player.entropies[i]
        optimizer.zero_grad()
        a3c = args.lmbda * (policy_loss + 0.5 * value_loss)
        a3c.backward()
        torch.nn.utils.clip_grad_norm_(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    print("prank:", rank, "os.pid:", os.getpid())
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = AllowBacktracking(
        make_local_env(env_conf['game'], env_conf['level'], stack=False,
                       scale_rew=False))
    print("Got a local env; obs space:", env.observation_space)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    print("player.state.shape:", player.state.shape)
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break
        if player.done:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]
        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(player.model.parameters(), 100.0)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
def train(rank, args, shared_model, optimizer, env_conf, emb, bi_grams, instructions):
    # Changes the process name
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    # Define special vectors
    eos_vector = emb.get_vector("<eos>")
    oov_vector = emb.get_vector("<oov>")
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    # Create agent
    player = Agent(None, env, args, None, emb)
    player.gpu_id = gpu_id
    # Create DNN model for the agent
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space, emb)
    # Set up the environment and move everything to the GPU
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    # Set the model to training mode; it changes nothing here but is good practice.
    player.model.train()
    # Start iteration
    player.eps_len += 2
    _counter = 0
    while True:
        # Load parameter values from the shared model
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        # Reset the LSTM state when an episode ends
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, args.lstm_size).cuda())
                    player.hx = Variable(torch.zeros(1, args.lstm_size).cuda())
            else:
                player.cx = Variable(torch.zeros(1, args.lstm_size))
                player.hx = Variable(torch.zeros(1, args.lstm_size))
        # Otherwise carry the hidden state over, detached from the old graph
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)
        # Make a step and record observations; repeat until num_steps is reached or the game is over
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break
        # If the episode finished before args.num_steps was reached, reset the environment
        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        # If the episode is not finished after args.num_steps,
        # bootstrap with the value estimate of the current state
        R = torch.zeros(1, 1)
        if not player.done:
            _, value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        # Append the value for the final time step
        player.values.append(Variable(R))
        # Initialise the loss accumulators
        policy_loss = 0
        value_loss = 0
        language_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        # Accumulate the losses
        for i in reversed(range(len(player.rewards))):
            if args.use_language:
                # Language loss: the action taken at this time step
                a = np.argmax(player.action_logits[i].detach().cpu().numpy())
                # Logits produced for each word position at this time step
                produced_logits = player.produced_logits[i]
                # Target instructions corresponding to that action
                action_instructions = instructions[a]
                # Sample a few instructions from the set
                for _ in range(10):
                    idx = random.randrange(0, len(action_instructions))
                    instruction = action_instructions[idx]
                    target_words = instruction.split()
                    for pos, target_word in enumerate(target_words):
                        target_class = torch.tensor(emb.get_index(target_word)).cuda()
                        produced_logit = produced_logits[pos]
                        # cross_entropy combines log-softmax and NLL;
                        # the produced logit is a distribution over words, the target a class index
                        language_loss += torch.nn.functional.cross_entropy(
                            produced_logit, target_class.unsqueeze(0))
                        if target_word == '<eos>':
                            break
            # Calculate the A3C losses
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]
        # Reset gradient accumulators
        player.model.zero_grad()
        # Calculate gradients and update
        if args.use_language:
            (policy_loss + 0.5 * value_loss + 0.1 * 0.01 * language_loss).backward()
        else:
            (policy_loss + 0.5 * value_loss).backward()
        if args.use_language and _counter % 10 == 0:
            print("****************")
            print("language loss", language_loss)
        _counter += 1
        # Copy gradients over to the shared model
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        # Clear agent observations
        player.clear_actions()
def train(rank, args, shared_model, optimizer, env_conf):
    torch.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    env.seed(args.seed + rank)
    state = env.reset()
    player = Agent(model, env, args, state)
    player.state = torch.from_numpy(state).float()
    player.model.train()
    epoch = 0
    while True:
        player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            player.cx = Variable(torch.zeros(1, 512))
            player.hx = Variable(torch.zeros(1, 512))
            if player.starter:
                player = player_start(player, train=True)
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)
        for step in range(args.num_steps):
            player = player_act(player, train=True)
            if player.done:
                break
            if player.current_life > player.info['ale.lives']:
                player.flag = True
                player.current_life = player.info['ale.lives']
            else:
                player.current_life = player.info['ale.lives']
                player.flag = False
            if args.count_lives:
                if player.flag:
                    player.done = True
                    break
            if player.starter and player.flag:
                player = player_start(player, train=True)
                if player.done:
                    break
        if player.done:
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            player.flag = False
        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss += 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * Variable(gae) - 0.01 * player.entropies[i]
        optimizer.zero_grad()
        (policy_loss + value_loss).backward()
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.values = []
        player.log_probs = []
        player.rewards = []
        player.entropies = []
def train_worker(args, shared_model, total_steps, optimizer, lock):
    env = make_env(args)
    args = args.train
    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model.train()
    state = env.reset()
    state = torch.FloatTensor(state)
    while True:
        model.load_state_dict(shared_model.state_dict())
        model.detach_hidden()
        values = []
        log_probs = []
        rewards = []
        entropies = []
        for step in range(args.update_agent_frequency):
            value, logit = model(state.unsqueeze(0))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)
            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)
            state, reward, done, _ = env.step(action.numpy())
            with total_steps.get_lock():
                total_steps.value += 1
            if done:
                state = env.reset()
                model.reset_hidden()
            state = torch.FloatTensor(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            if done:
                break
        R = torch.zeros(1, 1)
        if not done:
            value, _ = model(state.unsqueeze(0))
            R = value.detach()
        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - log_probs[i] * gae.detach() - \
                args.entropy_weight * entropies[i]
        optimizer.zero_grad()
        (policy_loss + args.value_weight * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        with lock:
            ensure_shared_grads(model, shared_model)
            optimizer.step()
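# Every worker above recomputes the same pattern inline: an n-step bootstrapped return
# plus Generalized Advantage Estimation, gae_t = sum_l (gamma * tau)^l * delta_{t+l}
# with delta_t = r_t + gamma * V(s_{t+1}) - V(s_t). The helper below is only an
# illustrative sketch of that recursion on plain Python floats; the function name and
# argument names mirror the variables used in the loops above and are not part of any
# of these codebases.
def compute_returns_and_gae(rewards, values, gamma, tau):
    """Return (returns, advantages); `values` must hold len(rewards) + 1 entries,
    the last one being the bootstrap value of the final state."""
    R = values[-1]
    gae = 0.0
    returns, advantages = [], []
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]                                # n-step return
        delta_t = rewards[i] + gamma * values[i + 1] - values[i]  # TD residual
        gae = gae * gamma * tau + delta_t                         # GAE recursion
        returns.append(R)
        advantages.append(gae)
    returns.reverse()
    advantages.reverse()
    return returns, advantages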