def test(args, shared_model, env_conf): log = {} setup_logger('{}_log'.format(args.env), r'{0}{1}_{2}workers_log'.format(args.log_dir, args.env, args.workers)) log['{}_log'.format(args.env)] = logging.getLogger( '{}_log'.format(args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) env = atari_env(args.env, env_conf) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None) player.model = A3Clstm( player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() player.model.eval() while True: if player.done: player.model.load_state_dict(shared_model.state_dict()) player.action_test() reward_sum += player.reward if player.done: num_tests += 1 player.current_life = 0 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}". format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean)) if reward_sum > args.save_score_level: player.model.load_state_dict(shared_model.state_dict()) state_to_save = player.model.state_dict() torch.save(state_to_save, '{0}{1}.dat'.format( args.save_model_dir, args.env)) reward_sum = 0 player.eps_len = 0 state = player.env.reset() time.sleep(60) player.state = torch.from_numpy(state).float()
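# The test worker above relies on a setup_logger(name, log_file) helper that is
# not shown in this section. A minimal sketch of such a helper, assuming it
# simply wires a named logger to a file (the project's actual utility may
# differ, e.g. by also attaching a stream handler):
import logging

def setup_logger(logger_name, log_file, level=logging.INFO):
    logger = logging.getLogger(logger_name)
    formatter = logging.Formatter('%(asctime)s : %(message)s')
    file_handler = logging.FileHandler(log_file, mode='w')
    file_handler.setFormatter(formatter)
    logger.setLevel(level)
    logger.addHandler(file_handler)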
def test(self, iteration, show='none', save_max=False): env = create_env(self.args) player = Agent(None, env, self.args, None) player.gpu_id = self.gpu_id if self.args.model == 'MLP': player.model = A3C_MLP( player.env.observation_space.shape[0], player.env.action_space, self.args.stack_frames) if self.args.model == 'CONV': player.model = A3C_CONV(self.args.stack_frames, player.env.action_space) # load the input model if self.gpu_id >= 0: with torch.cuda.device(self.gpu_id): player.model.load_state_dict(self.shared_model.state_dict()) else: player.model.load_state_dict(self.shared_model.state_dict()) player.state = player.env.reset(self.args) player.state = torch.from_numpy(player.state).float() if self.gpu_id >= 0: with torch.cuda.device(self.gpu_id): player.model = player.model.cuda() player.state = player.state.cuda() player.model.eval() while True: player.action_test() if self.args.show != 'none' or show != 'none': player.env.render() self.reward_sum += player.reward if player.done: self.num_tests += 1 self.reward_total_sum += self.reward_sum reward_mean = self.reward_total_sum / self.num_tests self.reward_sum = 0 player.eps_len = 0 state = player.env.reset(self.args) player.state = torch.from_numpy(state).float() if self.gpu_id >= 0: with torch.cuda.device(self.gpu_id): player.state = player.state.cuda() if self.args.show != 'none' or show != 'none': player.env.close() break return self.reward_total_sum
def test(args, shared_model, env_conf):
    # print('IN TEST')
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    setup_logger('{}_map_log'.format(args.env),
                 r'{0}{1}_map_log'.format(args.log_dir, args.env))
    log['{}_map_log'.format(args.env)] = logging.getLogger(
        '{}_map_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))
    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    if 'micropolis' in args.env.lower():
        import gym_micropolis
        env = micropolis_env(args.env, env_conf, args)
    else:
        # print('using atari env for test')
        env = atari_env(args.env, env_conf, args)
    reward_sum = 0
    entropy_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if 'micropolis' in args.env.lower():
        modelInit = getattr(model, args.design_head)
        player.model = modelInit(player.env.observation_space.shape[0],
                                 player.env.action_space,
                                 player.env.env.env.MAP_X)
        player.lstm_sizes = player.model.getMemorySizes()
        if not 'arcade' in args.env.lower():
            player.lstm_size = (1, 16, player.env.env.env.MAP_X,
                                env.env.env.MAP_Y)
    else:
        player.model = A3Clstm(player.env.observation_space.shape[0],
                               player.env.action_space)
    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    flag = True
    max_score = 0
    i = 0
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False
        player.action_test()
        reward_sum += player.reward
        entropy_sum += player.entropy.data.item()
        if player.done and not player.info:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1:1.5e}, entropy {4:1.5e} episode length {2}, reward mean {3:1.5e}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean, entropy_sum))
            import numpy as np
            np.set_printoptions(threshold=400)
            log['{}_map_log'.format(args.env)].info('\n{}'.format(
                np.array2string(
                    np.add(
                        player.env.env.env.micro.map.zoneMap[-1],
                        np.full((player.env.env.env.MAP_X,
                                 player.env.env.env.MAP_Y),
                                2))).replace('\n ', '').replace(
                                    '][', ']\n[').replace('[[', '[').replace(
                                        ']]', ']')))
            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}best_{1}.dat'.format(args.save_model_dir,
                                                     args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}best_{1}.dat'.format(args.save_model_dir,
                                                 args.env))
            if i % 10 == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}latest_{1}.dat'.format(args.save_model_dir,
                                                       args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}latest_{1}.dat'.format(args.save_model_dir,
                                                   args.env))
            reward_sum = 0
            entropy_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            i += 1
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
def train(rank, args, shared_model, optimizer, env_conf, shared_counter, targ_shared): ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] device = torch.device('cuda:{}'.format(gpu_id) if gpu_id >= 0 else 'cpu') torch.manual_seed(args.seed + rank) torch.cuda.manual_seed(args.seed + rank) env = atari_env(args.env, env_conf, args) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) env.seed(args.seed + rank) player = Agent(None, env, args, None, gpu_id=gpu_id) player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.model.apply(weights_init) player.state = player.env.reset() player.state = torch.from_numpy(player.state).to(torch.float32) player.state = player.state.to(device) player.model = player.model.to(device) #player.targ_model = copy.deepcopy(player.model) player.model.train() #player.targ_model.eval() player.eps_len += 2 while True: player.model.load_state_dict(shared_model.state_dict()) #player.targ_model.load_state_dict(targ_shared.state_dict()) if player.done: player.cx = torch.zeros(1, 512).to(device) player.hx = torch.zeros(1, 512).to(device) #player.targ_cx = copy.deepcopy(player.cx).detach() #player.targ_hx = copy.deepcopy(player.hx).detach() else: player.cx = player.cx.detach() player.hx = player.hx.detach() for step in range(args.num_steps): player.action_train() if player.done: break if player.done: state = player.env.reset() player.state = torch.from_numpy(state).to(torch.float32) player.state = player.state.to(device) #alpha = player.model.log_alpha.exp().detach() alpha = .01 #alpha = 0 x_R = torch.zeros(1, 1) if not player.done: with torch.no_grad(): action, value, logit, q_value, _ = player.model( (player.state.unsqueeze(0), (player.hx, player.cx))) x_R = q_value[1].detach() - alpha * F.log_softmax( logit, -1).gather(-1, action) x_R = x_R.to(device) policy_loss = 0 adv_gae_loss = 0 for i in reversed(range(len(player.rewards))): x_R = args.gamma * x_R + player.rewards[i] adv_gae_loss = adv_gae_loss + (player.tra_adv_gae[i][1] - x_R.detach()).pow(2) * .5 #policy_loss = policy_loss - player.log_probs[i] * player.tra_adv_gae[i][0].detach() + alpha * player.log_probs[i] * player.log_probs[i].detach() policy_loss = policy_loss - (F.softmax( player.values[i], -1) * player.tra_adv_gae[i][0].detach()).sum( -1) - alpha * player.entropies[i].unsqueeze(-1) #policy_loss = policy_loss - player.log_probs[i] * (x_R - (F.softmax(player.values[i], -1) * # player.tra_adv_gae[i][0]).sum(-1) - alpha * player.entropies[i]).detach() + alpha * player.log_probs[i] * player.log_probs[i].detach() #prob = F.softmax(player.values[i], -1) #ent_alpha = alpha * player.entropies[i].unsqueeze(-1) #advs = (player.tra_adv_gae[i][0] - # ((player.tra_adv_gae[i][0] * prob).sum(-1, True) + # ent_alpha)).detach() #policy_loss = policy_loss - (prob * advs).sum(-1) - ent_alpha x_R = x_R - alpha * player.log_probs[i].detach() player.model.zero_grad() (policy_loss + .5 * adv_gae_loss).backward(retain_graph=False) ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions() with shared_counter.get_lock(): shared_counter.value += len(player.rewards) if shared_counter.value > args.interact_steps: break
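# The worker above calls player.model.apply(weights_init) before training, but
# weights_init is defined elsewhere. A minimal sketch of such an initializer,
# assuming a simple Xavier-style scheme (the project's actual initializer is an
# assumption here and may use a different rule):
import torch.nn as nn

def weights_init(m):
    # Initialize conv and linear layers; leave other modules at their defaults.
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)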
def train(rank, args, shared_model, optimizer, train_modes, n_iters, env=None): n_iter = 0 writer = SummaryWriter(os.path.join(args.log_dir, 'Agent:{}'.format(rank))) ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) training_mode = args.train_mode env_name = args.env train_modes.append(training_mode) n_iters.append(n_iter) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) device = torch.device('cuda:' + str(gpu_id)) if len(args.gpu_ids) > 1: device_share = torch.device('cpu') else: device_share = torch.device('cuda:' + str(args.gpu_ids[-1])) else: device = device_share = torch.device('cpu') if env is None: env = create_env(env_name, args) if args.train_mode == 0: params = shared_model.player0.parameters() elif args.train_mode == 1: params = shared_model.player1.parameters() else: params = shared_model.parameters() if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(params, lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(params, lr=args.lr) env.seed(args.seed) player = Agent(None, env, args, None, device) player.w_entropy_target = args.entropy_target player.gpu_id = gpu_id # prepare model player.model = build_model(player.env.observation_space, player.env.action_space, args, device) player.model = player.model.to(device) player.model.train() player.reset() reward_sum = torch.zeros(player.num_agents).to(device) reward_sum_org = np.zeros(player.num_agents) ave_reward = np.zeros(2) ave_reward_longterm = np.zeros(2) count_eps = 0 while True: # sys to the shared model player.model.load_state_dict(shared_model.state_dict()) if player.done: player.reset() reward_sum = torch.zeros(player.num_agents).to(device) reward_sum_org = np.zeros(player.num_agents) count_eps += 1 player.update_rnn_hiden() t0 = time.time() for i in range(args.num_steps): player.action_train() reward_sum += player.reward reward_sum_org += player.reward_org if player.done: for j, r_i in enumerate(reward_sum): writer.add_scalar('train/reward_' + str(j), r_i, player.n_steps) break fps = i / (time.time() - t0) # cfg training mode # 0: tracker 1: target -1:joint all training_mode = train_modes[rank] policy_loss, value_loss, entropies, pred_loss = player.optimize( params, optimizer, shared_model, training_mode, device_share) for i in range(min(player.num_agents, 3)): writer.add_scalar('train/policy_loss_' + str(i), policy_loss[i].mean(), player.n_steps) writer.add_scalar('train/value_loss_' + str(i), value_loss[i], player.n_steps) writer.add_scalar('train/entropies' + str(i), entropies[i].mean(), player.n_steps) writer.add_scalar('train/pred_R_loss', pred_loss, player.n_steps) writer.add_scalar('train/ave_reward', ave_reward[0] - ave_reward_longterm[0], player.n_steps) writer.add_scalar('train/mode', training_mode, player.n_steps) writer.add_scalar('train/fps', fps, player.n_steps) n_iter += 1 n_iters[rank] = n_iter if train_modes[rank] == -100: env.close() break
def train(rank, args, shared_model, optimizer, env_conf):
    torch.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()
    while True:
        # Sync the worker model with the shared model before each rollout.
        player.model.load_state_dict(shared_model.state_dict())
        for step in range(args.num_steps):
            player.action_train()
            if args.count_lives:
                player.check_state()
            if player.done:
                break
        if player.done:
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                                        (player.hx, player.cx)))
            R = value.data
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]
        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.clear_actions()
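# Every training worker in this file calls ensure_shared_grads(model,
# shared_model) after backward() and before optimizer.step(). A minimal sketch
# of what such a helper usually does in A3C implementations (assumption: the
# project's own version may differ, e.g. in how it handles GPU workers): it
# points the shared model's gradient buffers at the local worker's freshly
# computed gradients so the shared optimizer can apply them.
def ensure_shared_grads(model, shared_model, gpu=False):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            # Another worker already supplied gradients for this step.
            return
        elif not gpu:
            shared_param._grad = param.grad
        else:
            # GPU workers must move their gradients back to the CPU model.
            shared_param._grad = param.grad.cpu()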
saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir, args.env), map_location=lambda storage, loc: storage) log = {} setup_logger('{}_mon_log'.format(args.env), r'{0}{1}_mon_log'.format(args.log_dir, args.env)) log['{}_mon_log'.format(args.env)] = logging.getLogger('{}_mon_log'.format( args.env)) env = atari_env("{}".format(args.env), env_conf) model = A3Clstm(env.observation_space.shape[0], env.action_space) num_tests = 0 reward_total_sum = 0 player = Agent(model, env, args, state=None) player.env = gym.wrappers.Monitor(player.env, "{}_monitor".format(args.env), force=True) player.model.eval() for i_episode in range(args.num_episodes): state = player.env.reset() player.state = torch.from_numpy(state).float() player.eps_len = 0 reward_sum = 0 while True: if args.render: if i_episode % args.render_freq == 0: player.env.render() if player.starter and player.flag: player = player_start(player)
def train(rank, args, shared_model, optimizer, optimizer_r, env_conf, lock,
          counter):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
                    player.hx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
            else:
                player.cx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
                player.hx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
        else:
            player.cx = [
                Variable(player.cx[0].data),
                Variable(player.cx[1].data)
            ]
            player.hx = [
                Variable(player.hx[0].data),
                Variable(player.hx[1].data)
            ]
        # Check whether updates to r_net have propagated to this worker:
        # ps = list(player.model.r_net.named_parameters())
        # n, v = ps[6]
        # print(v.sum())
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break
        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)),
                 (player.hx[0], player.cx[0]), (player.hx[1], player.cx[1])))
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]
        with lock:
            counter.value += 1
        # r_net update
        player.model.r_net.zero_grad()
        (args.actor_weight * policy_loss +
         (1 - args.actor_weight) * value_loss).backward(retain_graph=True)
        ensure_shared_grads(player.model.r_net,
                            shared_model.r_net,
                            gpu=gpu_id >= 0)
        optimizer_r.step()
        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        player.model.r_net.zero_grad()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
def test(args, shared_model, env_conf, lock, counter): ptitle('Test Agent') gpu_id = args.gpu_ids[-1] log = {} setup_logger( '{}_log'.format(args.env), r'{0}{1}-{2}_log'.format(args.log_dir, args.env, args.log_target)) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) if gpu_id >= 0: torch.cuda.manual_seed(args.seed) env = atari_env(args.env, env_conf, args) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model = player.model.cuda() player.state = player.state.cuda() flag = True max_score = 0 while True: if flag: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) player.model.eval() flag = False player.action_test() reward_sum += player.reward if player.done and not player.info: state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() elif player.info: flag = True num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests with lock: counter.value += 1 log['{}_log'.format(args.env)].info( "UpdateStep {0} Time {1}, episode reward {2}, episode length {3}, reward mean {4:.4f}" .format( counter.value, time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean)) if args.save_max and reward_sum >= max_score: max_score = reward_sum if gpu_id >= 0: with torch.cuda.device(gpu_id): state_to_save = player.model.state_dict() torch.save( state_to_save, '{0}{1}_{2}.dat'.format(args.save_model_dir, args.env, args.log_target)) else: state_to_save = player.model.state_dict() torch.save( state_to_save, '{0}{1}_{2}.dat'.format(args.save_model_dir, args.env, args.log_target)) reward_sum = 0 player.eps_len = 0 state = player.env.reset() player.eps_len += 2 time.sleep(10) player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda()
def test(rank, args, shared_model):
    writer = SummaryWriter('8_27_test')
    model_buffer = Model_Buffer(args)
    test_episodes = args.test_episodes
    ptitle('Test Agent')
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    print("logfile check", r'{0} {1}_log'.format(args.log_dir, args.env))
    print("logs in test", args.log_dir)
    # Keep the logger in a dict keyed by environment name.
    log['{}_log'.format(args.env)] = logging.getLogger(
        '{}_log'.format(args.env))
    d_args = vars(args)  # vars() returns the object's attributes as a dict.
    for k in d_args.keys():
        # Record the run arguments.
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))
    # for i in range(100):
    #     log['{}_log'.format(args.env)].info('{0}'.format(i))
    # print('we prefix seed = -1 when testing')
    # args.seed = -1
    torch.manual_seed(args.seed)
    env = create_env(args.env, args.seed)
    # env = gym.make(args.env)
    # env.seed(args.seed)
    start_time = time.time()
    num_tests = 0  # number of test episodes played so far
    player = Agent(None, env, args, None, rank)
    player.model = A3C_MLP(player.env.observation_space,
                           player.env.action_space,
                           args.stack_frames)  # set up the model
    player.state = player.env.reset()  # set up the state
    player.state = torch.from_numpy(player.state).float()
    player.done = True
    player.model.eval()  # switch to eval mode
    is_model_empty = True
    is_testing = False
    while True:
        model_buffer.put(shared_model)
        # A full evaluation round is finished; re-initialize.
        if player.done and np.mod(num_tests,
                                  test_episodes) == 0 and not is_testing:
            reward_episode = 0
            success_rate = 0
            load_model = model_buffer.get()  # fetch the shared model
            model_queue_size = model_buffer.qsize()
            if load_model:
                is_testing = True
                is_model_empty = False
                training_steps = load_model[1]
                training_episodes = load_model[2]
                # Load the shared parameters (load_model[0]) into the player's model.
                player.model.load_state_dict(load_model[0])
            else:
                is_model_empty = True  # no model available yet
                time.sleep(10)
        if not is_model_empty:
            player.action_test()
            # log['{}_log'.format(args.env)].info("test steps {}".format(1))
            reward_episode += player.reward
            if 'is_success' in player.info.keys():
                # The episode ended because the goal was reached.
                success_rate += 1
            if player.done:
                # done when the goal is reached, the agent crashes, or it strays too far.
                # print("crash detected")
                # eps_len_temp = player.eps_len
                num_tests += 1  # one more test episode finished
                player.eps_len = 0  # reset the step count for this episode
                state = player.env.reset()
                player.state = torch.from_numpy(state).float()
                if np.mod(num_tests, test_episodes) == 0:
                    # A full evaluation round is finished; aggregate statistics.
                    is_testing = False
                    reward_episode = reward_episode / test_episodes
                    writer.add_scalar('success_num/Test', success_rate,
                                      training_steps)
                    success_rate = success_rate / test_episodes
                    log['{}_log'.format(args.env)].info(
                        "Time {0}, training episodes {1}, training steps {2}, reward episode {3}, success_rate {4}, "
                        "model cached {5}".format(
                            time.strftime(
                                "%Hh %Mm %Ss",
                                time.gmtime(time.time() - start_time)),
                            training_episodes, training_steps, reward_episode,
                            success_rate, model_queue_size))
                    writer.add_scalar('success_rate/Test', success_rate,
                                      training_steps)
                    # save model:
                    state_to_save = player.model.state_dict()
                    # torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, args.env))
                    # torch.save(state_to_save, '{0}{1}_pre.dat'.format(args.save_model_dir, args.env))
                    torch.save(state_to_save,
                               '{0}{1}.dat'.format(args.log_dir, args.env))
                    torch.save(
                        state_to_save,
                        '{0}{1}_pre.dat'.format(args.log_dir, args.env))
                    if training_steps > args.training_steps:
                        break
def test(args, shared_model): ptitle('Test Agent') gpu_id = args.gpu_ids[-1] log = {} setup_logger('{}_log'.format(args.env), r'{0}{1}_log'.format(args.log_dir, args.env)) log['{}_log'.format(args.env)] = logging.getLogger( '{}_log'.format(args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) if gpu_id >= 0: torch.cuda.manual_seed(args.seed) env = create_env(args.env, args) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None) player.gpu_id = gpu_id if args.model == 'MLP': player.model = A3C_MLP( player.env.observation_space.shape[0], player.env.action_space, args.stack_frames) if args.model == 'CONV': player.model = A3C_CONV(args.stack_frames, player.env.action_space) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model = player.model.cuda() player.state = player.state.cuda() player.model.eval() while True: if player.done: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) player.action_test() reward_sum += player.reward if player.done: num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}". format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean)) if reward_sum > args.save_score_level: player.model.load_state_dict(shared_model.state_dict()) state_to_save = player.model.state_dict() torch.save(state_to_save, '{0}{1}.dat'.format( args.save_model_dir, args.env)) reward_sum = 0 player.eps_len = 0 state = player.env.reset() time.sleep(60) player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda()
def train_func(rank, args, shared_model, optimizer, env_conf, datasets):
    if args.deploy:
        return
    ptitle('Train {0}'.format(rank))
    print('Start training agent: ', rank)
    if rank == 0:
        logger = Logger(args.log_dir[:-1] + '_losses/')
        train_step = 0
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    env_conf["env_gpu"] = gpu_id
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = Debug_env(datasets, env_conf, seed=args.seed + rank)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    nChan = 3
    if args.is3D:
        nChan = 4
    if args.alpha_only:
        nChan = 1
    if not args.is3D:
        player.model = get_model(args,
                                 "ENet",
                                 input_shape=env_conf["obs_shape"],
                                 num_actions=args.num_actions * nChan)
    elif not args.obs3D:
        player.model = get_model(args,
                                 "ENet",
                                 input_shape=env_conf["obs_shape"],
                                 num_actions=args.num_actions * nChan)
    elif args.obs3D:
        player.model = get_model(args,
                                 "Net3D",
                                 input_shape=env_conf["obs_shape"],
                                 num_actions=args.num_actions * nChan)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    if rank == 0:
        eps_reward = 0
        pinned_eps_reward = 0
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            player.eps_len = 0
            if rank == 0:
                if train_step % args.train_log_period == 0 and train_step > 0:
                    print("train: step", train_step, "\teps_reward",
                          eps_reward)
                if train_step > 0:
                    pinned_eps_reward = player.env.sum_rewards.mean()
                    eps_reward = 0
        for step in range(args.num_steps):
            player.action_train()
            if rank == 0:
                eps_reward = player.env.sum_rewards.mean()
            if player.done:
                break
        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        if not args.alpha_only:
            if not args.is3D:
                R = torch.zeros(1, 1, args.num_actions * 3)
            else:
                R = torch.zeros(1, 1, args.num_actions * 4)
        else:
            R = torch.zeros(1, 1, args.num_actions)
        if not player.done:
            value, _ = player.model(Variable(player.state.unsqueeze(0)))
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        if not args.alpha_only:
            if not args.is3D:
                gae = torch.zeros(1, 1, args.num_actions * 3)
            else:
                gae = torch.zeros(1, 1, args.num_actions * 4)
        else:
            gae = torch.zeros(1, 1, args.num_actions)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    reward_i = torch.tensor(player.rewards[i]).cuda()
            else:
                reward_i = torch.tensor(player.rewards[i])
            R = args.gamma * R + reward_i
            advantage = R - player.values[i]
            value_loss = value_loss + (0.5 * advantage * advantage).mean()
            delta_t = player.values[
                i + 1].data * args.gamma + reward_i - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                (player.log_probs[i] * Variable(gae)).mean() - \
                (args.entropy_alpha * player.entropies[i]).mean()
        player.model.zero_grad()
        sum_loss = (policy_loss + value_loss)
        curtime = time.time()
        sum_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        curtime = time.time()
        optimizer.step()
        player.clear_actions()
        if rank == 0:
            train_step += 1
            if train_step % args.log_period * 10 == 0 and train_step > 0:
                log_info = {
                    'train: value_loss': value_loss,
                    'train: policy_loss': policy_loss,
                    'train: eps reward': pinned_eps_reward,
                }
                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, train_step)
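# train_func above logs scalars through a Logger object with a
# scalar_summary(tag, value, step) method; other workers in this file also call
# image_summary. A hypothetical minimal Logger with that interface, written as
# a thin wrapper over TensorBoard (assumption: the project's own Logger may be
# implemented differently):
from torch.utils.tensorboard import SummaryWriter

class Logger:
    def __init__(self, log_dir):
        self.writer = SummaryWriter(log_dir)

    def scalar_summary(self, tag, value, step):
        # Write a single scalar for this training step.
        self.writer.add_scalar(tag, value, step)

    def image_summary(self, tag, images, step):
        # images: array of shape (N, H, W, C), e.g. stacked env.render() frames.
        self.writer.add_images(tag, images, step, dataformats='NHWC')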
def test(args, shared_model, optimizer, train_modes, n_iters): ptitle('Test Agent') n_iter = 0 writer = SummaryWriter(os.path.join(args.log_dir, 'Test')) gpu_id = args.gpu_ids[-1] log = {} setup_logger('{}_log'.format(args.env), r'{0}/logger'.format(args.log_dir)) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) if gpu_id >= 0: torch.cuda.manual_seed(args.seed) device = torch.device('cuda:' + str(gpu_id)) else: device = torch.device('cpu') env = create_env(args.env, args) env.seed(args.seed) start_time = time.time() count_eps = 0 player = Agent(None, env, args, None, device) player.gpu_id = gpu_id player.model = build_model(player.env.observation_space, player.env.action_space, args, device).to(device) player.model.eval() max_score = -100 while True: AG = 0 reward_sum = np.zeros(player.num_agents) reward_sum_list = [] len_sum = 0 for i_episode in range(args.test_eps): player.model.load_state_dict(shared_model.state_dict()) player.reset() reward_sum_ep = np.zeros(player.num_agents) rotation_sum_ep = 0 fps_counter = 0 t0 = time.time() count_eps += 1 fps_all = [] while True: player.action_test() fps_counter += 1 reward_sum_ep += player.reward rotation_sum_ep += player.rotation if player.done: AG += reward_sum_ep[0] / rotation_sum_ep * player.num_agents reward_sum += reward_sum_ep reward_sum_list.append(reward_sum_ep[0]) len_sum += player.eps_len fps = fps_counter / (time.time() - t0) n_iter = 0 for n in n_iters: n_iter += n for i, r_i in enumerate(reward_sum_ep): writer.add_scalar('test/reward' + str(i), r_i, n_iter) fps_all.append(fps) writer.add_scalar('test/fps', fps, n_iter) writer.add_scalar('test/eps_len', player.eps_len, n_iter) break # player.max_length: ave_AG = AG / args.test_eps ave_reward_sum = reward_sum / args.test_eps len_mean = len_sum / args.test_eps reward_step = reward_sum / len_sum mean_reward = np.mean(reward_sum_list) std_reward = np.std(reward_sum_list) log['{}_log'.format(args.env)].info( "Time {0}, ave eps reward {1}, ave eps length {2}, reward step {3}, FPS {4}, " "mean reward {5}, std reward {6}, AG {7}".format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), np.around(ave_reward_sum, decimals=2), np.around(len_mean, decimals=2), np.around(reward_step, decimals=2), np.around(np.mean(fps_all), decimals=2), mean_reward, std_reward, np.around(ave_AG, decimals=2))) # save model if ave_reward_sum[0] >= max_score: print('save best!') max_score = ave_reward_sum[0] model_dir = os.path.join(args.log_dir, 'best.pth') else: model_dir = os.path.join(args.log_dir, 'new.pth'.format(args.env)) state_to_save = { "model": player.model.state_dict(), "optimizer": optimizer.state_dict() } torch.save(state_to_save, model_dir) time.sleep(args.sleep_time) if n_iter > args.max_step: env.close() for id in range(0, args.workers): train_modes[id] = -100 break
def test(args, shared_model, env_conf, datasets=None, hasLbl=True):
    if hasLbl:
        ptitle('Valid agent')
    else:
        ptitle("Test agent")
    gpu_id = args.gpu_ids[-1]
    env_conf["env_gpu"] = gpu_id
    log = {}
    logger = Logger(args.log_dir)
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    if hasLbl:
        for k in d_args.keys():
            log['{}_log'.format(args.env)].info('{0}: {1}'.format(
                k, d_args[k]))
    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    if "EM_env" in args.env:
        raw_list, gt_lbl_list = datasets
        env = EM_env(raw_list, env_conf, type="train",
                     gt_lbl_list=gt_lbl_list)
    else:
        env = Voronoi_env(env_conf)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if args.model == "UNet":
        player.model = UNet(env.observation_space.shape[0], args.features, 2)
    elif args.model == "FusionNetLstm":
        player.model = FusionNetLstm(env.observation_space.shape,
                                     args.features, 2, args.hidden_feat)
    elif args.model == "FusionNet":
        player.model = FusionNet(env.observation_space.shape[0],
                                 args.features, 2)
    elif args.model == "UNetLstm":
        player.model = UNetLstm(env.observation_space.shape, args.features, 2,
                                args.hidden_feat)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    player.model.eval()
    flag = True
    create_dir(args.save_model_dir)
    recent_episode_scores = []
    renderlist = []
    renderlist.append(player.env.render())
    max_score = 0
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False
        player.action_test()
        reward_sum += player.reward.mean()
        renderlist.append(player.env.render())
        if player.done:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            if hasLbl:
                log['{}_log'.format(args.env)].info(
                    "VALID: Time {0}, episode reward {1}, num tests {4}, episode length {2}, reward mean {3:.4f}"
                    .format(
                        time.strftime("%Hh %Mm %Ss",
                                      time.gmtime(time.time() - start_time)),
                        reward_sum, player.eps_len, reward_mean, num_tests))
            recent_episode_scores += [reward_sum]
            if len(recent_episode_scores) > 200:
                recent_episode_scores.pop(0)
            if args.save_max and np.mean(recent_episode_scores) >= max_score:
                max_score = np.mean(recent_episode_scores)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save, '{0}{1}.dat'.format(
                                args.save_model_dir,
                                'best_model_' + args.env))
            if num_tests % args.save_period == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save, '{0}{1}.dat'.format(
                                args.save_model_dir,
                                args.env + '_' + str(num_tests)))
            if num_tests % args.log_period == 0:
                if hasLbl:
                    print(
                        "----------------------VALID SET--------------------------"
                    )
                    print("Log test #:", num_tests)
                    print("rewards: ", player.reward.mean())
                    print("sum rewards: ", reward_sum)
                    print("------------------------------------------------")
                log_img = np.concatenate(renderlist, 0)
                if hasLbl:
                    log_info = {"valid_sample": log_img}
                else:
                    log_info = {"test_sample": log_img}
                for tag, img in log_info.items():
                    img = img[None]
                    logger.image_summary(tag, img, num_tests)
                if hasLbl:
                    log_info = {'mean_valid_reward': reward_mean}
                    for tag, value in log_info.items():
                        logger.scalar_summary(tag, value, num_tests)
            renderlist = []
            reward_sum = 0
            player.eps_len = 0
            player.clear_actions()
            state = player.env.reset()
            renderlist.append(player.env.render())
            time.sleep(15)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
def train(rank, args, shared_model, optimizer, env_conf):
    start_time = time.time()
    ptitle('Training Agent: {}'.format(rank))
    #log = {}
    #setup_logger('{}_train_log'.format(args.env), r'{0}{1}_train_log'.format(
    #    args.log_dir, args.env))
    #log['{}_train_log'.format(args.env)] = logging.getLogger(
    #    '{}_train_log'.format(args.env))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    if 'micropolis' in args.env.lower():
        env = micropolis_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if 'micropolis' in args.env.lower():
        modelInit = getattr(model, args.design_head)
        player.model = modelInit(player.env.observation_space.shape[0],
                                 player.env.action_space,
                                 player.env.env.env.MAP_X)
        player.lstm_sizes = player.model.getMemorySizes()
    else:
        player.model = A3Clstm(player.env.observation_space.shape[0],
                               player.env.action_space)
    lstm_size = 512
    if 'micropolis' in args.env.lower():
        if 'arcade' not in args.env.lower():
            lstm_size = (1, 16, env.env.env.MAP_X, env.env.env.MAP_Y)
    player.lstm_size = lstm_size
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    log_counter = 0
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        num_lstm_layers = len(player.lstm_sizes)
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = [
                        Variable(torch.zeros(player.lstm_sizes[i]).cuda())
                        for i in range(num_lstm_layers)
                    ]
                    player.hx = [
                        Variable(torch.zeros(player.lstm_sizes[i]).cuda())
                        for i in range(num_lstm_layers)
                    ]
            else:
                player.cx = [
                    Variable(torch.zeros(player.lstm_sizes[i]))
                    for i in range(num_lstm_layers)
                ]
                player.hx = [
                    Variable(torch.zeros(player.lstm_sizes[i]))
                    for i in range(num_lstm_layers)
                ]
        else:
            player.cx = [
                Variable(player.cx[i].data) for i in range(num_lstm_layers)
            ]
            player.hx = [
                Variable(player.hx[i].data) for i in range(num_lstm_layers)
            ]
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break
        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if args.randomize_exploration:
                player.certainty = np.random.uniform(0.5, 1.5)
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        R = torch.zeros(1, 1)
        if not player.done:
            values, logit, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            if values.size()[1] == 1:
                value = values
            else:
                prob = torch.nn.functional.softmax(logit, dim=1)
                action = prob.multinomial(1).data
                value = values[0][action]
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = torch.zeros(1, 1).cuda()
                R = Variable(R).cuda()
        else:
            gae = torch.zeros(1, 1)
            R = Variable(R)
        player.values.append(R)
        policy_loss = 0
        value_loss = 0
        for i in reversed(range(len(player.rewards))):
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.rewards[i] = torch.Tensor([player.rewards[i]
                                                      ]).cuda()
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    gae = Variable(gae.cuda())
            else:
                gae = Variable(gae)
            policy_loss = policy_loss - \
                player.log_probs[i] * Variable(gae) - 0.01 * player.entropies[i]
        #if log_counter % 10 == 0:
        #    log['{}_train_log'.format(args.env)].info(
        #        "Time {0}, reward {1}, policy loss {2}, value loss {3}, entropy {4}".
        #        format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
        #               '{:9.2e}'.format(float(sum(player.rewards) / len(player.rewards))),
        #               '{:9.2e}'.format(float(policy_loss.data.item())),
        #               '{:9.2e}'.format(float(value_loss.data.item())),
        #               '{:10.8e}'.format(float(sum(player.entropies)))))
        #log_counter += 1
        optimizer.zero_grad()
        a3c = args.lmbda * (policy_loss + 0.5 * value_loss)
        a3c.backward()
        torch.nn.utils.clip_grad_norm_(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
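# The backward loops above implement n-step returns plus Generalized Advantage
# Estimation (GAE): delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and
# gae_t = delta_t + gamma * tau * gae_{t+1}. A stand-alone illustration with
# plain floats (generic names, not part of the repository code):
def compute_gae(rewards, values, gamma, tau):
    """values has one more entry than rewards (the bootstrap value R)."""
    gae = 0.0
    advantages = []
    for t in reversed(range(len(rewards))):
        delta_t = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta_t + gamma * tau * gae
        advantages.insert(0, gae)
    return advantages

# Example: three rewards with a bootstrap value of 0.5 at the end.
# compute_gae([1.0, 0.0, 1.0], [0.2, 0.3, 0.4, 0.5], gamma=0.99, tau=1.0)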
def train(rank, args, input_model=None, max_iter=100000, step_test=-1,
          log=False):
    if rank >= 0:
        torch.manual_seed(args.seed + rank)
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = create_env(args)
    env.seed(args.seed + rank)
    if log:
        log = setup_logger("{0}_{1}_log".format(args.scale_legs, rank),
                           "logs/{0}_{1}_log".format(args.scale_legs, rank))
    # player initialization
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if args.model == 'MLP':
        player.model = A3C_MLP(player.env.observation_space.shape[0],
                               player.env.action_space, args.stack_frames)
    if args.model == 'CONV':
        player.model = A3C_CONV(args.stack_frames, player.env.action_space)
    # load the input model to the player
    if input_model is not None:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(input_model.state_dict())
        else:
            player.model.load_state_dict(input_model.state_dict())
    # initialize the player optimizer; fall back to SGD if no known optimizer
    # name is given
    optimizer = None
    if args.optimizer == 'RMSprop':
        optimizer = optim.RMSprop(player.model.dictForOptimizer(), lr=args.lr)
    elif args.optimizer == 'Adam':
        optimizer = optim.Adam(player.model.dictForOptimizer(), lr=args.lr)
    else:
        optimizer = optim.SGD(player.model.dictForOptimizer(), lr=args.lr)
    # reset the environment and initialize the player state
    player.state = player.env.reset(args)
    player.state = torch.from_numpy(player.state).float()
    # If on GPU, do as GPU
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    last_iter = 0
    mean_buf = Buffer(5)
    # Start looping over episodes
    for iteration in range(max_iter):
        last_iter += iteration
        # reset cx and hx if the environment is over
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 128).cuda())
                    player.hx = Variable(torch.zeros(1, 128).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 128))
                player.hx = Variable(torch.zeros(1, 128))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)
        # Roll out actions and collect reward for one episode
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break
        if player.done:
            player.eps_len = 0
            # reset state
            state = player.env.reset(args)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = torch.zeros(1, 1).cuda()
        else:
            R = torch.zeros(1, 1)
        if not player.done:
            state = player.state
            if args.model == 'CONV':
                state = state.unsqueeze(0)
            value, _, _, _ = player.model(
                (Variable(state), (player.hx, player.cx)))
            R = value.data
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = torch.zeros(1, 1).cuda()
        else:
            gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                (player.log_probs[i].sum() * Variable(gae)) - \
                (0.01 * player.entropies[i].sum())
        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        optimizer.step()
        player.clear_actions()
        if step_test > 0 and iteration % step_test == 0:
            tester = Tester(args, player.model)
            score = tester.test(last_iter)
            mean_buf.push(score)
            recent_mean = sum(mean_buf.bf) / mean_buf.current_size
            text = "Iteration {0}, episode reward {1}, recent reward mean {2}".format(
                iteration, score, recent_mean)
            log.info(text)
    tester = Tester(args, player.model)
    fitness = tester.test(last_iter)
    return fitness
log = {} setup_logger('{}_mon_log'.format(args.env), r'{0}{1}_mon_log'.format( args.log_dir, args.env)) log['{}_mon_log'.format(args.env)] = logging.getLogger('{}_mon_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_mon_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) env = atari_env("{}".format(args.env), env_conf, args) num_tests = 0 start_time = time.time() reward_total_sum = 0 player = Agent(None, env, args, None) player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.gpu_id = gpu_id if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model = player.model.cuda() if args.new_gym_eval: player.env = gym.wrappers.Monitor( player.env, "{}_monitor".format(args.env), force=True) if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(saved_state) else: player.model.load_state_dict(saved_state)
def test(args, shared_model, env_conf, shared_counter): ptitle('Test Agent') gpu_id = args.gpu_ids[-1] device = torch.device('cuda:{}'.format(gpu_id) if gpu_id >= 0 else 'cpu') log = {} setup_logger( '{}_log'.format(args.env), os.path.join(args.log_dir, '{}-{}_log'.format(args.env, args.exp_name))) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) env = atari_env(args.env, env_conf, args) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None, gpu_id=gpu_id) player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.model.apply(weights_init) player.state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(player.state).to(torch.float32) player.model = player.model.to(device) player.state = player.state.to(device) flag = True max_score = 0 while True: if flag: player.model.load_state_dict(shared_model.state_dict()) player.model.eval() flag = False player.action_test() reward_sum += player.reward if player.done and not player.info: state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(state).to(torch.float32) player.state = player.state.to(device) elif player.info: flag = True num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}, alpha {4:.4f}" .format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean, player.model.log_alpha.exp().detach().item())) if args.save_max and reward_sum >= max_score: max_score = reward_sum torch.save( player.model.state_dict(), os.path.join(args.save_model_dir, '{}-{}.dat'.format(args.env, args.exp_name))) with shared_counter.get_lock(): shared_counter.value += player.eps_len if shared_counter.value > args.interact_steps: break reward_sum = 0 player.eps_len = 0 state = player.env.reset() player.eps_len += 2 time.sleep(10) player.state = torch.from_numpy(state).to(torch.float32) player.state = player.state.to(device)
def train(rank, args, shared_model, optimizer, train_modes, n_iters, env=None): n_steps = 0 n_iter = 0 writer = SummaryWriter(os.path.join(args.log_dir, 'Agent:{}'.format(rank))) ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) training_mode = args.train_mode env_name = args.env train_modes.append(training_mode) n_iters.append(n_iter) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) device = torch.device('cuda:' + str(gpu_id)) else: device = torch.device('cpu') if env == None: env = create_env(env_name) params = shared_model.parameters() if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(params, lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(params, lr=args.lr) env.seed(args.seed + rank) player = Agent(None, env, args, None, device) player.gpu_id = gpu_id player.env.reset() # prepare model player.model = build_model(action_space=player.env.action_space, pose_space=player.reset_cam_pose(), args=args,) player.model = player.model.to(device) player.model.train() player.reset() reward_sum = torch.zeros(player.num_agents).to(device) count_eps = 0 print('Start training...') while True: # sys to the shared model player.model.load_state_dict(shared_model.state_dict()) if player.done: player.reset() reward_sum = torch.zeros(player.num_agents).to(device) count_eps += 1 player.update_rnn_hidden() fps_counter = 0 t0 = time.time() for i in range(args.num_steps): player.action_train() reward_sum += player.reward fps_counter += 1 n_steps += 1 if player.done: for i, r_i in enumerate(reward_sum): # add for Pose Only if i not in player.env.random_ids: continue # writer.add_scalar('train/reward_' + str(i), r_i, n_steps) break fps = fps_counter / (time.time() - t0) policy_loss, value_loss, entropies, pred_loss, values0 = player.optimize(params, optimizer, shared_model, gpu_id) writer.add_scalar('train/policy_loss_sum', policy_loss.sum(), n_steps) writer.add_scalar('train/value_loss_sum', value_loss.sum(), n_steps) writer.add_scalar('train/entropies_sum', entropies.sum(), n_steps) writer.add_scalar('train/values0', values0.sum(), n_steps) writer.add_scalar('train/pred_R_loss', pred_loss, n_steps) writer.add_scalar('train/fps', fps, n_steps) # writer.add_scalar('train/lr', lr[0], n_iter) n_iter += 1 n_iters[rank] = n_iter if train_modes[rank] == -100: env.close() break
def train_func(rank, args, shared_model, optimizer, env_conf, datasets=None,
               shared_dict=None):
    if args.deploy:
        return
    ptitle('Train {0}'.format(rank))
    print('Start training agent: ', rank)
    if rank == 0:
        logger = Logger(args.log_dir[:-1] + '_losses/')
        train_step = 0
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    env_conf["env_gpu"] = gpu_id
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    raw_list, gt_lbl_list = datasets
    env = EM_env(raw_list,
                 env_conf,
                 type="train",
                 gt_lbl_list=gt_lbl_list,
                 seed=args.seed + rank)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = get_model(args,
                             args.model,
                             env.observation_space.shape,
                             args.features,
                             atrous_rates=args.atr_rate,
                             num_actions=2,
                             split=args.data_channel,
                             gpu_id=gpu_id,
                             multi=args.multi)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    if rank == 0:
        eps_reward = 0
        pinned_eps_reward = 0
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            player.eps_len = 0
            if rank == 0:
                if train_step % args.train_log_period == 0 and train_step > 0:
                    print("train: step", train_step, "\teps_reward",
                          eps_reward)
                if train_step > 0:
                    pinned_eps_reward = player.env.sum_reward.mean()
                    eps_reward = 0
            if args.lstm_feats:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        player.cx, player.hx = player.model.lstm.init_hidden(
                            batch_size=1, use_cuda=True)
                else:
                    player.cx, player.hx = player.model.lstm.init_hidden(
                        batch_size=1, use_cuda=False)
        elif args.lstm_feats:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)
        for step in range(args.num_steps):
            if rank < args.lbl_agents:
                player.action_train(use_lbl=True)
            else:
                player.action_train()
            if rank == 0:
                eps_reward = player.env.sum_reward.mean()
            if player.done:
                break
        if player.done:
            state = player.env.reset(player.model, gpu_id)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        if "3D" in args.data:
            R = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1],
                            env_conf["size"][2])
        else:
            R = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1])
        if args.lowres:
            R = torch.zeros(1, 1, env_conf["size"][0] // 2,
                            env_conf["size"][1] // 2)
        if not player.done:
            if args.lstm_feats:
                value, _, _ = player.model(
                    (Variable(player.state.unsqueeze(0)),
                     (player.hx, player.cx)))
            else:
                value, _ = player.model(Variable(player.state.unsqueeze(0)))
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        if "3D" in args.data:
            gae = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1],
                              env_conf["size"][2])
        else:
            gae = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1])
        if args.rew_drop:
            keep_map = torch.tensor(player.env.keep_map)
        if args.lowres:
            gae = torch.zeros(1, 1, env_conf["size"][0] // 2,
                              env_conf["size"][1] // 2)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
                if args.rew_drop:
                    keep_map = keep_map.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    reward_i = torch.tensor(player.rewards[i]).cuda()
            else:
                reward_i = torch.tensor(player.rewards[i])
            R = args.gamma * R + reward_i
            if args.rew_drop:
                advantage = R - player.values[i]
                value_loss = value_loss + (0.5 * advantage * advantage *
                                           keep_map).mean()
                delta_t = player.values[
                    i + 1].data * args.gamma + reward_i - player.values[i].data
                gae = gae * args.gamma * args.tau + delta_t
            else:
                advantage = R - player.values[i]
                value_loss = value_loss + (0.5 * advantage * advantage).mean()
                delta_t = player.values[
                    i + 1].data * args.gamma + reward_i - player.values[i].data
                gae = gae * args.gamma * args.tau + delta_t
            if args.noisy:
                policy_loss = policy_loss - \
                    (player.log_probs[i] * Variable(gae)).mean()
            else:
                if args.rew_drop:
                    policy_loss = policy_loss - \
                        (player.log_probs[i] * Variable(gae) * keep_map).mean() - \
                        (args.entropy_alpha * player.entropies[i] * keep_map).mean()
                else:
                    policy_loss = policy_loss - \
                        (player.log_probs[i] * Variable(gae)).mean() - \
                        (args.entropy_alpha * player.entropies[i]).mean()
        player.model.zero_grad()
        sum_loss = (policy_loss + value_loss)
        curtime = time.time()
        # print ("backward curtime:", curtime)
        sum_loss.backward()
        # print ("backward done", time.time () - curtime)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        curtime = time.time()
        # print ("optim curtime:", curtime)
        optimizer.step()
        # print ("optim done", time.time () - curtime)
        player.clear_actions()
        if args.wctrl == "s2m":
            player.env.config["spl_w"] = shared_dict["spl_w"]
            player.env.config["mer_w"] = shared_dict["mer_w"]
        if rank == 0:
            train_step += 1
            if train_step % args.log_period == 0 and train_step > 0:
                log_info = {
                    'train: value_loss': value_loss,
                    'train: policy_loss': policy_loss,
                    'train: eps reward': pinned_eps_reward,
                }
                if "EX" in args.model:
                    log_info["cell_prob_loss"] = cell_prob_loss
                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, train_step)
def train(rank, args, shared_model, optimizer): ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = create_env(args.env, args) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr) env.seed(args.seed + rank) player = Agent(None, env, args, None) player.gpu_id = gpu_id if args.model == 'MLP': player.model = A3C_MLP(player.env.observation_space.shape[0], player.env.action_space, args.stack_frames) if args.model == 'CONV': player.model = A3C_CONV(args.stack_frames, player.env.action_space) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() while True: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) if player.done: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.cx = Variable(torch.zeros(1, 128).cuda()) player.hx = Variable(torch.zeros(1, 128).cuda()) else: player.cx = Variable(torch.zeros(1, 128)) player.hx = Variable(torch.zeros(1, 128)) else: player.cx = Variable(player.cx.data) player.hx = Variable(player.hx.data) for step in range(args.num_steps): player.action_train() if player.done: break if player.done: player.eps_len = 0 state = player.env.reset() player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() if gpu_id >= 0: with torch.cuda.device(gpu_id): R = torch.zeros(1, 1).cuda() else: R = torch.zeros(1, 1) if not player.done: state = player.state if args.model == 'CONV': state = state.unsqueeze(0) value, _, _, _ = player.model( (Variable(state), (player.hx, player.cx))) R = value.data player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 R = Variable(R) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = torch.zeros(1, 1).cuda() else: gae = torch.zeros(1, 1) for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R - player.values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion # print(player.rewards[i]) delta_t = player.rewards[i] + args.gamma * \ player.values[i + 1].data - player.values[i].data gae = gae * args.gamma * args.tau + delta_t policy_loss = policy_loss - \ (player.log_probs[i].sum() * Variable(gae)) - \ (0.01 * player.entropies[i].sum()) player.model.zero_grad() (policy_loss + 0.5 * value_loss).backward() ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions()
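# The backward loop above combines a discounted n-step return with Generalized
# Advantage Estimation. A minimal standalone sketch of the same recursion on
# plain Python floats; gae_example and its arguments are illustrative names and
# numbers, not taken from the code above.
def gae_example(rewards, values, bootstrap, gamma=0.99, tau=1.0):
    """values holds V(s_t) for each step; bootstrap is V(s_T) (0 if terminal)."""
    values = values + [bootstrap]
    R, gae = bootstrap, 0.0
    returns, advantages = [], []
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]                       # discounted return
        delta = rewards[i] + gamma * values[i + 1] - values[i]
        gae = gae * gamma * tau + delta                  # GAE recursion
        returns.append(R)
        advantages.append(gae)
    return list(reversed(returns)), list(reversed(advantages))

# e.g. gae_example([1.0, 0.0, 1.0], [0.5, 0.4, 0.6], bootstrap=0.3)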
def train(rank, args, shared_model, optimizer, env_conf): ptitle('Training Agent: {}'.format(rank)) print("prank:", rank, "os.pid:", os.getpid()) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = AllowBacktracking( make_local_env(env_conf['game'], env_conf['level'], stack=False, scale_rew=False)) print("Got a local env; obs space:", env.observation_space) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) env.seed(args.seed + rank) player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() print("player.state.shape:", player.state.shape) player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() player.eps_len += 2 while True: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) if player.done: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.cx = Variable(torch.zeros(1, 512).cuda()) player.hx = Variable(torch.zeros(1, 512).cuda()) else: player.cx = Variable(torch.zeros(1, 512)) player.hx = Variable(torch.zeros(1, 512)) else: player.cx = Variable(player.cx.data) player.hx = Variable(player.hx.data) for step in range(args.num_steps): player.action_train() if player.done: break if player.done: # if player.info['ale.lives'] == 0 or player.max_length: # player.eps_len = 0 state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() R = torch.zeros(1, 1) if not player.done: value, _, _ = player.model( (Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) R = value.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R - player.values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion delta_t = player.rewards[i] + args.gamma * \ player.values[i + 1].data - player.values[i].data gae = gae * args.gamma * args.tau + delta_t policy_loss = policy_loss - \ player.log_probs[i] * \ Variable(gae) - 0.01 * player.entropies[i] player.model.zero_grad() (policy_loss + 0.5 * value_loss).backward() torch.nn.utils.clip_grad_norm(player.model.parameters(), 100.0) ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions()
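# Note on the gradient clipping above: torch.nn.utils.clip_grad_norm was
# deprecated in later PyTorch releases in favour of the in-place variant; on a
# current PyTorch the equivalent call would be written as:
#
#     torch.nn.utils.clip_grad_norm_(player.model.parameters(), 100.0)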
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = Environment()  # create the environment
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    # env.seed(args.seed + rank)
    player = Agent(None, env, args, None)  # create the agent
    player.gpu_id = gpu_id
    num_actions = env.get_num_actions()
    player.model = A3Clstm(Config.STACKED_FRAMES,  # the A3C model
                           num_actions)
    player.state, available = player.env.reset()  # reset to the initial state
    player.state = torch.from_numpy(player.state).float()
    player.available = torch.from_numpy(available).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
            player.available = player.available.cuda()
    player.model.train()  # training mode
    player.eps_len += 1
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            # sync weights from the shared network
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            # episode finished: reinitialize the LSTM state
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)
        for step in range(args.num_steps):  # T-max = 20
            player.action_train()
            if player.done:
                break
        if player.done:
            state, available = player.env.reset()
            player.state = torch.from_numpy(state).float()
            player.available = torch.from_numpy(available).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
                    player.available = player.available.cuda()
        R = torch.zeros(1, 1)  # if done: R_t-max = 0
        if not player.done:
            value, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data  # R_t-max = V(s)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]
        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
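# This variant carries an `available` action mask returned by env.reset(), but
# the masking itself happens inside action_train()/the model, which is not shown
# here. The sketch below is only a hypothetical illustration of how such a mask
# is commonly applied before sampling; select_action and its arguments are
# assumed names, not part of this codebase.
import torch
import torch.nn.functional as F

def select_action(logit, available):
    # Push logits of unavailable actions to -inf so softmax assigns them ~0 mass.
    masked_logit = logit.masked_fill(available == 0, float('-inf'))
    prob = F.softmax(masked_logit, dim=1)
    log_prob = F.log_softmax(masked_logit, dim=1)
    action = prob.multinomial(num_samples=1)
    return action, log_prob.gather(1, action)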
def train(rank, args, shared_model, optimizer, env_conf, num_tau_samples=32, num_tau_prime_samples=32, kappa=1.0, num_quantiles=32): ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = atari_env(args.env, env_conf, args) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam( shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) env.seed(args.seed + rank) player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() player.eps_len += 2 while True: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) if player.done: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.cx = Variable(torch.zeros(1, 512).cuda()) player.hx = Variable(torch.zeros(1, 512).cuda()) else: player.cx = Variable(torch.zeros(1, 512)) player.hx = Variable(torch.zeros(1, 512)) else: player.cx = Variable(player.cx.data) player.hx = Variable(player.hx.data) for step in range(args.num_steps): player.action_train() if player.done: break if player.done: state = player.env.reset() player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() R = torch.zeros(1,num_tau_prime_samples) if not player.done: logit, _, _ = player.model((Variable( player.state.unsqueeze(0)), (player.hx, player.cx))) q_vals = torch.mean(logit,0) _, action = torch.max(q_vals,0) logit, _, _ = player.model((Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) R = logit[:,action] if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() #R = R.detach() R = Variable(R) value_loss = 0 for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R.repeat(num_tau_samples,1) - player.logits_array[i].repeat(1, num_tau_prime_samples) #print("Ad: ",advantage) loss = (torch.abs(advantage) <= kappa).float() * 0.5 * advantage ** 2 #print("loss: ",loss.sum(0).sum(0), loss) loss += (torch.abs(advantage) > kappa).float() * kappa * (torch.abs(advantage) - 0.5 * kappa) #print("loss: ",loss.sum(0).sum(0), loss) step_loss = torch.abs(player.quantiles_array[i].cuda() - (advantage.detach()<0).float()) * loss/kappa value_loss += step_loss.sum(0).mean(0) player.model.zero_grad() value_loss.backward() ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions()
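# The value_loss above is a quantile-regression (IQN-style) Huber loss over
# pairs of predicted and target quantile estimates. A standalone sketch of the
# same quantity for a single transition; the function name and argument shapes
# are illustrative, mirroring the repeat()/sum(0)/mean() pattern used above.
import torch

def quantile_huber_loss(target_quantiles, pred_quantiles, taus, kappa=1.0):
    # Pairwise TD errors: one row per predicted quantile, one column per target.
    td = target_quantiles.unsqueeze(0) - pred_quantiles.unsqueeze(1)
    abs_td = td.abs()
    huber = torch.where(abs_td <= kappa,
                        0.5 * td ** 2,
                        kappa * (abs_td - 0.5 * kappa))
    # Asymmetric weighting: quantile tau_i is penalised differently for over-
    # and under-estimation, which is what makes the regression quantile-valued.
    weight = torch.abs(taus.unsqueeze(1) - (td.detach() < 0).float())
    return (weight * huber / kappa).sum(0).mean()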
def test_func(args, shared_model, env_conf, datasets=None, tests=None, shared_dict=None): ptitle('Valid agent') if args.valid_gpu < 0: gpu_id = args.gpu_ids[-1] else: gpu_id = args.valid_gpu env_conf["env_gpu"] = gpu_id if not args.deploy: log = {} logger = Logger(args.log_dir) create_dir(args.log_dir + "models/") create_dir(args.log_dir + "tifs/") create_dir(args.log_dir + "tifs_test/") os.system("cp *.py " + args.log_dir) os.system("cp *.sh " + args.log_dir) os.system("cp models/*.py " + args.log_dir + "models/") setup_logger('{}_log'.format(args.env), r'{0}{1}_log'.format(args.log_dir, args.env)) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) env_conf_log = env_conf if tests is not None: if args.testlbl: test_env = EM_env(tests[0], env_conf, type="test", gt_lbl_list=tests[1]) else: test_env = EM_env(tests[0], env_conf, type="test") if not args.deploy: for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format( k, d_args[k])) for k in env_conf_log.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format( k, env_conf_log[k])) torch.manual_seed(args.seed) if gpu_id >= 0: torch.cuda.manual_seed(args.seed) raw_list, gt_lbl_list = datasets env = EM_env(raw_list, env_conf, type="train", gt_lbl_list=gt_lbl_list) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = get_model(args, args.model, env_conf["observation_shape"], args.features, atrous_rates=args.atr_rate, num_actions=2, split=args.data_channel, gpu_id=gpu_id, multi=args.multi) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model = player.model.cuda() player.state = player.state.cuda() player.model.eval() flag = True if not args.deploy: create_dir(args.save_model_dir) recent_episode_scores = ScalaTracker(100) recent_FgBgDice = ScalaTracker(100) recent_bestDice = ScalaTracker(100) recent_diffFG = ScalaTracker(100) recent_MUCov = ScalaTracker(100) recent_MWCov = ScalaTracker(100) recent_AvgFP = ScalaTracker(100) recent_AvgFN = ScalaTracker(100) recent_rand_i = ScalaTracker(100) renderlist = [] renderlist.append(player.env.render()) max_score = 0 # ----------------------------------------- Deploy / Inference ----------------------------------------- if args.deploy: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) # inference (args, None, player.model, tests [0], test_env, gpu_id, player.env.rng, len (tests [0])) if len(tests) == 4: inference(args, None, player.model, tests[0], test_env, gpu_id, player.env.rng, len(tests[0]), tests[3]) else: inference(args, None, player.model, tests[0], test_env, gpu_id, player.env.rng, len(tests[0])) return # ----------------------------------------- End Deploy / Inference ----------------------------------------- merge_ratios = [] split_ratios = [] if args.wctrl == "s2m": schedule = args.wctrl_schedule delta = (shared_dict['spl_w'] - shared_dict['mer_w']) / (2 * len(schedule)) mer_w_delta = delta mer_w_var = shared_dict['mer_w'] mer_w_scheduler = Scheduler(mer_w_var, schedule, mer_w_delta) split_delta = -delta / len(args.out_radius) split_var = shared_dict['spl_w'] / len(args.out_radius) spl_w_scheduler = Scheduler(split_var, schedule, split_delta) while True: if flag: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: 
player.model.load_state_dict(shared_model.state_dict()) player.model.eval() flag = False player.action_test() reward_sum += player.reward.mean() renderlist.append(player.env.render()) if player.done: flag = True num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "VALID: Time {0}, episode reward {1}, num tests {4}, episode length {2}, reward mean {3:.4f}" .format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean, num_tests)) recent_episode_scores.push(reward_sum) if args.save_max and recent_episode_scores.mean() >= max_score: max_score = recent_episode_scores.mean() if gpu_id >= 0: with torch.cuda.device(gpu_id): state_to_save = {} state_to_save = player.model.state_dict() torch.save( state_to_save, '{0}{1}.dat'.format(args.save_model_dir, 'best_model_' + args.env)) if num_tests % args.save_period == 0: if gpu_id >= 0: with torch.cuda.device(gpu_id): state_to_save = player.model.state_dict() torch.save( state_to_save, '{0}{1}.dat'.format(args.save_model_dir, str(num_tests))) if num_tests % args.log_period == 0: if tests is not None and not args.DEBUG: inference(args, logger, player.model, tests[0], test_env, gpu_id, player.env.rng, num_tests) if (np.max(env.lbl) != 0 and np.max(env.gt_lbl) != 0): bestDice, FgBgDice, diffFG, MWCov, MUCov, AvgFP, AvgFN, rand_i = evaluate( args, player.env) recent_FgBgDice.push(FgBgDice) recent_diffFG.push(abs(diffFG)) recent_bestDice.push(bestDice) recent_MWCov.push(MWCov) recent_MUCov.push(MUCov) recent_AvgFP.push(AvgFP) recent_AvgFN.push(AvgFN) recent_rand_i.push(rand_i) log_info = { "bestDice": recent_bestDice.mean(), "FgBgDice": recent_FgBgDice.mean(), "diffFG": recent_diffFG.mean(), "MWCov": recent_MWCov.mean(), "MUCov": recent_MUCov.mean(), "AvgFP": recent_AvgFP.mean(), "AvgFN": recent_AvgFN.mean(), "rand_i": recent_rand_i.mean() } for tag, value in log_info.items(): logger.scalar_summary(tag, value, num_tests) else: bestDice, FgBgDice, diffFG = 0, 0, 0 MWCov, MUCov, AvgFP, AvgFN = 0, 0, 0, 0 rand_i = 0 print( "----------------------VALID SET--------------------------" ) print(args.env) print("bestDice:", bestDice, "FgBgDice:", FgBgDice, "diffFG:", diffFG, "MWCov:", MWCov, "MUCov:", MUCov, "AvgFP:", AvgFP, "AvgFN:", AvgFN, "rand_i:", rand_i) # print ("mean bestDice") print("Log test #:", num_tests) print("rewards: ", player.reward.mean()) print("sum rewards: ", reward_sum) print("#gt_values:", len(np.unique(player.env.gt_lbl))) print("values:") values = player.env.unique() print(np.concatenate([values[0][None], values[1][None]], 0)) print("------------------------------------------------") log_img = np.concatenate(renderlist[::-1], 0) if not "3D" in args.data: for i in range(3): player.probs.insert(0, np.zeros_like(player.probs[0])) while (len(player.probs) - 3 < args.max_episode_length): player.probs.append(np.zeros_like(player.probs[0])) probslist = [ np.repeat(np.expand_dims(prob, -1), 3, -1) for prob in player.probs ] probslist = np.concatenate(probslist, 1) probslist = (probslist * 256).astype(np.uint8, copy=False) # log_img = renderlist [-1] print(probslist.shape, log_img.shape) log_img = np.concatenate([probslist, log_img], 0) log_info = {"valid_sample": log_img} print(log_img.shape) io.imsave( args.log_dir + "tifs/" + str(num_tests) + "_sample.tif", log_img.astype(np.uint8)) io.imsave( args.log_dir + "tifs/" + str(num_tests) + "_pred.tif", player.env.lbl.astype(np.uint8)) io.imsave(args.log_dir + "tifs/" + 
str(num_tests) + "_gt.tif", player.env.gt_lbl.astype(np.int32)) if args.seg_scale: log_info["scaler"] = player.env.scaler for tag, img in log_info.items(): img = img[None] logger.image_summary(tag, img, num_tests) if not args.deploy: log_info = { 'mean_valid_reward': reward_mean, '100_mean_reward': recent_episode_scores.mean(), 'split_ratio': player.env.split_ratio_sum.sum() / np.count_nonzero(player.env.gt_lbl), 'merge_ratio': player.env.merge_ratio_sum.sum() / np.count_nonzero(player.env.gt_lbl), } if args.wctrl == 's2m': log_info.update({ 'mer_w': mer_w_scheduler.value(), 'spl_w': spl_w_scheduler.value() * len(args.out_radius), }) merge_ratios.append(player.env.merge_ratio_sum.sum() / np.count_nonzero(player.env.gt_lbl)) split_ratios.append(player.env.split_ratio_sum.sum() / np.count_nonzero(player.env.gt_lbl)) print("split ratio: ", np.max(player.env.split_ratio_sum), np.min(player.env.split_ratio_sum)) print("merge ratio: ", np.max(player.env.merge_ratio_sum), np.min(player.env.merge_ratio_sum)) print("merge ratio: ", merge_ratios) print("split ratio: ", split_ratios) for tag, value in log_info.items(): logger.scalar_summary(tag, value, num_tests) renderlist = [] reward_sum = 0 player.eps_len = 0 if args.wctrl == "s2m": shared_dict["spl_w"] = spl_w_scheduler.next() shared_dict["mer_w"] = mer_w_scheduler.next() player.env.config["spl_w"] = shared_dict["spl_w"] player.env.config["mer_w"] = shared_dict["mer_w"] player.clear_actions() state = player.env.reset(player.model, gpu_id) renderlist.append(player.env.render()) time.sleep(15) player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda()
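# The split/merge weight control above depends on a Scheduler object exposing
# value() and next(), constructed from an initial value, a list of milestones
# and a per-milestone delta; its implementation is not part of this file. A
# hypothetical minimal version consistent with that usage (the real class may
# differ):
class Scheduler:
    def __init__(self, init_value, schedule, delta):
        self._value = init_value
        self.schedule = list(schedule)  # validation steps at which to adjust
        self.delta = delta
        self.step = 0

    def value(self):
        return self._value

    def next(self):
        self.step += 1
        if self.step in self.schedule:
            self._value += self.delta
        return self._value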
def train(args, envs, observation_space, action_space):
    gpu_id = 0
    # Each worker gets its own environment and model, both running on CUDA.
    player = Agent(envs, args)
    player.model = A3Clstm(observation_space, action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    with torch.cuda.device(gpu_id):
        player.model = player.model.cuda()
        player.state = player.state.cuda()
        player.cx = torch.zeros(args.workers, 512).cuda()
        player.hx = torch.zeros(args.workers, 512).cuda()
    optimizer = torch.optim.Adam(
        player.model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    # Switch to training mode.
    player.model.train()
    while True:
        # Roll out num_steps steps, or stop early when the game is over.
        for step in range(args.num_steps):
            # During training, each step's intermediate results are appended to lists.
            player.env.get_images()
            player.action_train()
            if player.dones[-1][0]:
                break
        if not player.dones[-1][0]:
            value, _, _ = player.model((player.state, (player.hx, player.cx)))
            R = value.detach()
        else:
            R = torch.zeros(args.workers, 1)
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        player.values.append(R)
        for j in range(args.num_ppo_train):
            policy_loss = 0
            value_loss = 0
            gae = 0
            for i in reversed(range(len(player.rewards))):
                value, logit, _ = player.model(
                    (player.states[i], (player.hxs[i], player.cxs[i])))
                prob = F.softmax(logit, dim=1)
                log_prob = F.log_softmax(logit, dim=1)
                entropy = -(log_prob * prob).sum(1)
                log_probs_current = log_prob.gather(1, player.actions[i])
                R = args.gamma * R + player.rewards[i]
                advantage = R - value
                value_loss = value_loss + 0.5 * advantage.pow(2)
                # Generalized Advantage Estimation
                delta_t = player.rewards[i] + args.gamma * player.values[
                    i + 1].detach() - player.values[i].detach()
                gae = gae * args.gamma * args.tau + delta_t
                ratio = torch.exp(log_probs_current - player.log_probs[i])
                surr1 = ratio
                surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                    1.0 + args.clip_param)
                policy_loss = policy_loss - torch.min(
                    surr1, surr2) * gae * -0.01 * entropy
            optimizer.zero_grad()
            (policy_loss + 0.5 * value_loss).mean().backward()
            optimizer.step()
        # Reset the LSTM memory when the game is over.
        if player.dones[-1][0]:
            with torch.cuda.device(gpu_id):
                player.cx = torch.zeros(args.workers, 512).cuda()
                player.hx = torch.zeros(args.workers, 512).cuda()
        else:
            player.cx = player.cx.detach()
            player.hx = player.hx.detach()
        player.clear_actions()

    # Scratch notes on the advantage/entropy computation:
    # advantage[0:n]: value-estimate differences from steps 0,1,2,...,n to n+1,
    # taken in reverse order n, n-1, n-2, ..., 3, 2, 1:
    #   r[n]     + Value(N+1) - Value(N)
    #   r[n:n-1] + Value(N+1) - Value(N-1)
    #   ...
    #   r[n:2]   + Value(N+1) - Value(2)
    #   r[n:1]   + Value(N+1) - Value(1)
    # R = args.gamma * R + player.rewards[i]
    # advantage = R - player.values[i]
    # value_loss = value_loss + 0.5 * advantage.pow(2)
    # value_loss = 0.5 * advantage.pow(2)
    # advantage = args.gamma * R + player.rewards[i] - player.values[i]
    # entropy = -(log_prob * prob).sum(1)
    # self.entropies.append(entropy)
    # Sample the action and its log-prob from prob.
    # Compute each step's action probabilities and entropies (entropies) and the
    # entropy sum; the sum is the entropy over all action probabilities per step.
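# For reference, the conventional PPO clipped-surrogate objective with a
# separate entropy bonus, written as a standalone helper. Names and the 0.01
# entropy coefficient are illustrative (mirroring the constants above); this is
# a sketch of the textbook form, not a drop-in replacement for the loop above.
import torch

def ppo_policy_loss(log_prob_new, log_prob_old, advantage, entropy,
                    clip_param=0.2, entropy_coef=0.01):
    ratio = torch.exp(log_prob_new - log_prob_old)
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage
    # Maximise the clipped surrogate and the entropy, i.e. minimise the negation.
    return -(torch.min(surr1, surr2).mean() + entropy_coef * entropy.mean())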
def train (rank, args, shared_model, optimizer, env_conf, datasets=None): ptitle('Training Agent: {}'.format(rank)) print ('Start training agent: ', rank) if rank == 0: logger = Logger (args.log_dir) train_step = 0 gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] env_conf ["env_gpu"] = gpu_id torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) if "EM_env" in args.env: raw, lbl, prob, gt_lbl = datasets env = EM_env (raw, lbl, prob, env_conf, 'train', gt_lbl) else: env = Voronoi_env (env_conf) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop (shared_model.parameters (), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam (shared_model.parameters (), lr=args.lr, amsgrad=args.amsgrad) # env.seed (args.seed + rank) if not args.continuous: player = Agent (None, env, args, None) else: player = Agent_continuous (None, env, args, None) player.gpu_id = gpu_id if not args.continuous: player.model = A3Clstm (env.observation_space.shape, env_conf["num_action"], args.hidden_feat) else: player.model = A3Clstm_continuous (env.observation_space.shape, env_conf["num_action"], args.hidden_feat) player.state = player.env.reset () player.state = torch.from_numpy (player.state).float () old_score = player.env.old_score final_score = 0 if gpu_id >= 0: with torch.cuda.device (gpu_id): player.state = player.state.cuda () player.model = player.model.cuda () player.model.train () if rank == 0: eps_reward = 0 pinned_eps_reward = 0 mean_log_prob = 0 # print ("rank: ", rank) while True: if gpu_id >= 0: with torch.cuda.device (gpu_id): player.model.load_state_dict (shared_model.state_dict ()) else: player.model.load_state_dict (shared_model.state_dict ()) if player.done: player.eps_len = 0 if rank == 0: if 0 <= (train_step % args.train_log_period) < args.max_episode_length: print ("train: step", train_step, "\teps_reward", eps_reward, "\timprovement", final_score - old_score) old_score = player.env.old_score pinned_eps_reward = eps_reward eps_reward = 0 mean_log_prob = 0 if gpu_id >= 0: with torch.cuda.device(gpu_id): player.cx = Variable(torch.zeros(1, args.hidden_feat).cuda()) player.hx = Variable(torch.zeros(1, args.hidden_feat).cuda()) else: player.cx = Variable(torch.zeros(1, args.hidden_feat)) player.hx = Variable(torch.zeros(1, args.hidden_feat)) else: player.cx = Variable(player.cx.data) player.hx = Variable(player.hx.data) for step in range(args.num_steps): player.action_train () if rank == 0: # if 0 <= (train_step % args.train_log_period) < args.max_episode_length: # print ("train: step", train_step, "\taction = ", player.action) eps_reward += player.reward # print (eps_reward) mean_log_prob += player.log_probs [-1] / env_conf ["T"] if player.done: break if player.done: # if rank == 0: # print ("----------------------------------------------") final_score = player.env.old_score state = player.env.reset () player.state = torch.from_numpy (state).float () if gpu_id >= 0: with torch.cuda.device (gpu_id): player.state = player.state.cuda () R = torch.zeros (1, 1) if not player.done: if not args.continuous: value, _, _ = player.model((Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) else: value, _, _, _ = player.model((Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) R = value.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) 
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            delta_t = player.values[i + 1].data * args.gamma + player.rewards[i] - \
                player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            # print(player.rewards[i])
            if not args.continuous:
                policy_loss = policy_loss - \
                    player.log_probs[i] * \
                    Variable(gae) - 0.01 * player.entropies[i]
            else:
                policy_loss = policy_loss - \
                    player.log_probs[i].sum() * Variable(gae) - \
                    0.01 * player.entropies[i].sum()

        player.model.zero_grad()
        sum_loss = (policy_loss + value_loss)
        sum_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        if rank == 0:
            train_step += 1
            if train_step % args.log_period == 0:
                log_info = {
                    # 'train: sum_loss': sum_loss,
                    'train: value_loss': value_loss,
                    'train: policy_loss': policy_loss,
                    'train: advantage': advantage,
                    # 'train: entropy': entropy,
                    'train: eps reward': pinned_eps_reward,
                    # 'train: mean log prob': mean_log_prob
                }
                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, train_step)
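# In each of these loops the return is seeded with R = 0 when the episode ended
# inside the rollout and with R = V(s_T) otherwise, i.e. a bootstrapped n-step
# return. A tiny worked example with gamma = 0.9 (numbers are illustrative):
#
#     rewards = [1.0, 0.0, 2.0], bootstrap V(s_T) = 0.5
#     R_2 = 2.0 + 0.9 * 0.5   = 2.45
#     R_1 = 0.0 + 0.9 * 2.45  = 2.205
#     R_0 = 1.0 + 0.9 * 2.205 = 2.9845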
def test(args, shared_model, env_conf): ptitle('Test Agent') gpu_id = args.gpu_ids[-1] log = {} setup_logger('{}_log'.format(args.env), r'{0}{1}_log'.format(args.log_dir, args.env)) log['{}_log'.format(args.env)] = logging.getLogger( '{}_log'.format(args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) if gpu_id >= 0: torch.cuda.manual_seed(args.seed) print("test proc:") env = AllowBacktracking(make_local_env(env_conf['game'], env_conf['level'], stack=False, scale_rew=False)) print("test got env:", env.observation_space) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = A3Clstm( player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model = player.model.cuda() player.state = player.state.cuda() flag = True max_score = 0 while True: if flag: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) player.model.eval() flag = False player.action_test() reward_sum += player.reward """ if player.done and player.info['ale.lives'] > 0 and not player.max_length: state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() """ if player.done or player.max_length: flag = True num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}". format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean)) if args.save_max and reward_sum >= max_score: max_score = reward_sum if gpu_id >= 0: with torch.cuda.device(gpu_id): state_to_save = player.model.state_dict() torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, args.env)) else: state_to_save = player.model.state_dict() torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, args.env)) reward_sum = 0 player.eps_len = 0 state = player.env.reset() player.eps_len += 2 time.sleep(10) player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda()
def train(rank, args, shared_model, optimizer, env_conf): torch.manual_seed(args.seed + rank) env = atari_env(args.env, env_conf) model = A3Clstm(env.observation_space.shape[0], env.action_space) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr) env.seed(args.seed + rank) state = env.reset() player = Agent(model, env, args, state) player.state = torch.from_numpy(state).float() player.model.train() epoch = 0 while True: player.model.load_state_dict(shared_model.state_dict()) if player.done: player.cx = Variable(torch.zeros(1, 512)) player.hx = Variable(torch.zeros(1, 512)) if player.starter: player = player_start(player, train=True) else: player.cx = Variable(player.cx.data) player.hx = Variable(player.hx.data) for step in range(args.num_steps): player = player_act(player, train=True) if player.done: break if player.current_life > player.info['ale.lives']: player.flag = True player.current_life = player.info['ale.lives'] else: player.current_life = player.info['ale.lives'] player.flag = False if args.count_lives: if player.flag: player.done = True break if player.starter and player.flag: player = player_start(player, train=True) if player.done: break if player.done: player.eps_len = 0 player.current_life = 0 state = player.env.reset() player.state = torch.from_numpy(state).float() player.flag = False R = torch.zeros(1, 1) if not player.done: value, _, _ = player.model( (Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) R = value.data player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 R = Variable(R) gae = torch.zeros(1, 1) for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R - player.values[i] value_loss += 0.5 * advantage.pow(2) # Generalized Advantage Estimataion delta_t = player.rewards[i] + args.gamma * player.values[i + 1].data - player.values[i].data gae = gae * args.gamma * args.tau + delta_t policy_loss = policy_loss - player.log_probs[i] * Variable(gae) - 0.01 * player.entropies[i] optimizer.zero_grad() (policy_loss + value_loss).backward() ensure_shared_grads(player.model, shared_model) optimizer.step() player.values = [] player.log_probs = [] player.rewards = [] player.entropies = []
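# The 0.01 * player.entropies[i] term above is the usual entropy bonus that
# discourages the categorical policy from collapsing prematurely. A minimal
# sketch of how that entropy is typically computed from the policy logits
# (policy_entropy is an illustrative name; the actual computation lives in the
# agent's action_train()):
import torch.nn.functional as F

def policy_entropy(logit):
    prob = F.softmax(logit, dim=1)
    log_prob = F.log_softmax(logit, dim=1)
    return -(log_prob * prob).sum(1)  # H(pi) = -sum_a pi(a|s) * log pi(a|s)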
def test(args, shared_model, env_conf, datasets): ptitle('Test agent') gpu_id = args.gpu_ids[-1] log = {} logger = Logger(args.log_dir) setup_logger('{}_log'.format(args.env), r'{0}{1}_log'.format(args.log_dir, args.env)) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) if gpu_id >= 0: torch.cuda.manual_seed(args.seed) raw, gt_lbl = datasets env = EM_env(raw, gt_lbl, env_conf) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None) player.gpu_id = gpu_id # player.model = A3Clstm (env.observation_space.shape, env_conf["num_action"], args.hidden_feat) player.model = SimpleCNN(env.observation_space.shape, env_conf["num_action"]) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model = player.model.cuda() player.state = player.state.cuda() flag = True create_dir(args.save_model_dir) recent_episode_scores = [] renderlist = [] renderlist.append(player.env.render()) max_score = 0 while True: if flag: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) player.model.eval() flag = False player.action_test() reward_sum += player.reward renderlist.append(player.env.render()) if player.done: flag = True if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, episode reward {1}, num tests {4}, episode length {2}, reward mean {3:.4f}" .format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean, num_tests)) recent_episode_scores += [reward_sum] if len(recent_episode_scores) > 200: recent_episode_scores.pop(0) if args.save_max and np.mean(recent_episode_scores) >= max_score: max_score = np.mean(recent_episode_scores) if gpu_id >= 0: with torch.cuda.device(gpu_id): state_to_save = player.model.state_dict() torch.save( state_to_save, '{0}{1}.dat'.format(args.save_model_dir, 'best_model_' + args.env)) if num_tests % args.save_period == 0: if gpu_id >= 0: with torch.cuda.device(gpu_id): state_to_save = player.model.state_dict() torch.save( state_to_save, '{0}{1}.dat'.format( args.save_model_dir, args.env + '_' + str(num_tests))) if num_tests % args.log_period == 0: print("------------------------------------------------") print("Log test #:", num_tests) print("Prob: ") for i in range(player.env.agent_out_shape[1]): for j in range(player.env.agent_out_shape[2]): print("{:.3f}\t".format(player.prob_cpu[0, i, j]), end='') print() print("Actions :", player.actions) print("Actions transformed: ") print(player.actions_explained) print("rewards: ", player.rewards) print("sum rewards: ", reward_sum) print("------------------------------------------------") log_img = np.concatenate(renderlist, 0) log_info = {"test: traning_sample": log_img} for tag, img in log_info.items(): img = img[None] logger.image_summary(tag, img, num_tests) log_info = {'test: mean_reward': reward_mean} for tag, value in log_info.items(): logger.scalar_summary(tag, value, num_tests) renderlist = [] reward_sum = 0 player.eps_len = 0 time.sleep(30) player.clear_actions() state = player.env.reset() 
player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda()