def __init__(self, env, args, gpu_id):
    self.results_filename = "./results"
    self.env = env
    self.models = [
        A3Clstm(self.env.observation_space.shape[0], self.env.action_space),
        A3Clstm(self.env.observation_space.shape[0], self.env.action_space)
    ]
    self.state = None
    self.hx = None
    self.cx = None
    self.eps_len = 0
    self.args = args
    self.values = []
    self.log_probs = []
    self.rewards = []
    self.entropies = []
    self.done = True
    self.info = None
    self.reward = 0
    self.gpu_id = gpu_id
    self.episodic_reward = 0
    self.life_counter = 5
    self.model_sequence = []
    self.curr_model_id = 0
    self.first_time_changeover = True
    self.fire_action_next = True
    if self.gpu_id >= 0:
        with torch.cuda.device(self.gpu_id):
            self.models[0] = self.models[0].cuda()
            self.models[1] = self.models[1].cuda()
    with open(self.results_filename, 'w'):
        pass  # create (or truncate) the results file
def test(args, shared_model, env_conf):
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    env = atari_env(args.env, env_conf)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.eval()
    for t in itertools.count():
        if player.done:
            player.model.load_state_dict(shared_model.state_dict())

        player.action_test(t)
        reward_sum += player.reward

        if player.done:
            num_tests += 1
            player.current_life = 0
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))
            if reward_sum > args.save_score_level:
                player.model.load_state_dict(shared_model.state_dict())
                state_to_save = player.model.state_dict()
                torch.save(state_to_save,
                           '{0}{1}.dat'.format(args.save_model_dir, args.env))
            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            time.sleep(60)
            player.state = torch.from_numpy(state).float()
args = parser.parse_args()
torch.manual_seed(args.seed)
if args.gpu_ids == -1:
    args.gpu_ids = [-1]
else:
    torch.cuda.manual_seed(args.seed)
    mp.set_start_method('spawn')  # multiprocessing start method for CUDA
setup_json = read_config(args.env_config)
env_conf = setup_json["Default"]
for i in setup_json.keys():
    if i in args.env:
        env_conf = setup_json[i]
env = atari_env(args.env, env_conf, args)
shared_model = A3Clstm(env.observation_space.shape[0], env.action_space)  # main A3C model
if args.load:  # if --load is set, load the saved .dat file
    saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir,
                                                 args.env),
                             map_location=lambda storage, loc: storage)
    shared_model.load_state_dict(saved_state)
shared_model.share_memory()

if args.shared_optimizer:
    if args.optimizer == 'RMSprop':
        optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
    if args.optimizer == 'Adam':
        optimizer = SharedAdam(shared_model.parameters(), lr=args.lr,
                               amsgrad=args.amsgrad)
for i in setup_json.keys():
    if i in args.env:
        env_conf = setup_json[i]
torch.set_default_tensor_type('torch.FloatTensor')
saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir, args.env),
                         map_location=lambda storage, loc: storage)

log = {}
setup_logger('{}_mon_log'.format(args.env),
             r'{0}{1}_mon_log'.format(args.log_dir, args.env))
log['{}_mon_log'.format(args.env)] = logging.getLogger('{}_mon_log'.format(
    args.env))

env = atari_env("{}".format(args.env), env_conf)
model = A3Clstm(env.observation_space.shape[0], env.action_space)

num_tests = 0
reward_total_sum = 0
player = Agent(model, env, args, state=None)
player.env = gym.wrappers.Monitor(player.env,
                                  "{}_monitor".format(args.env),
                                  force=True)
player.model.eval()
for i_episode in range(args.num_episodes):
    state = player.env.reset()
    player.state = torch.from_numpy(state).float()
    player.eps_len = 0
    reward_sum = 0
    while True:
        if args.render:
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    print("prank:", rank, "os.pid:", os.getpid())
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = AllowBacktracking(
        make_local_env(env_conf['game'], env_conf['level'], stack=False,
                       scale_rew=False))
    print("Got a local env; obs space:", env.observation_space)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    print("player.state.shape:", player.state.shape)
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            # if player.info['ale.lives'] == 0 or player.max_length:
            #     player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 100.0)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
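# The train loops in this file all call ensure_shared_grads to copy the
# worker's gradients onto the shared model before optimizer.step(). A minimal
# sketch, assuming the common A3C Hogwild convention (the real helper lives
# elsewhere in the repo; the early-return guard and .cpu() transfer are
# assumptions based on how gpu=gpu_id >= 0 is passed at the call sites):
def ensure_shared_grads_sketch(model, shared_model, gpu=False):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            # CPU Hogwild: the grads already alias shared storage.
            return
        if not gpu:
            shared_param._grad = param.grad
        else:
            # GPU worker: move grads onto the CPU-resident shared model.
            shared_param._grad = param.grad.cpu()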
def train(rank, args, shared_model, optimizer, optimizer_r, env_conf, lock,
          counter):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
                    player.hx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
            else:
                player.cx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
                player.hx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
        else:
            player.cx = [
                Variable(player.cx[0].data),
                Variable(player.cx[1].data)
            ]
            player.hx = [
                Variable(player.hx[0].data),
                Variable(player.hx[1].data)
            ]

        # Check whether r_net updates propagate to this point:
        # ps = list(player.model.r_net.named_parameters())
        # n, v = ps[6]
        # print(v.sum())

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)),
                 (player.hx[0], player.cx[0]),
                 (player.hx[1], player.cx[1])))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        with lock:
            counter.value += 1

        # r_net: update the representation network on a weighted
        # actor/critic loss, keeping the graph for the main backward below.
        player.model.r_net.zero_grad()
        (args.actor_weight * policy_loss +
         (1 - args.actor_weight) * value_loss).backward(retain_graph=True)
        ensure_shared_grads(player.model.r_net, shared_model.r_net,
                            gpu=gpu_id >= 0)
        optimizer_r.step()

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        player.model.r_net.zero_grad()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
def train(rank, args, shared_model, optimizer, env_conf):
    torch.manual_seed(args.seed + rank)
    env = atari_env(args.env_name, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)
    _ = env.reset()
    action = env.action_space.sample()
    _, _, _, info = env.step(action)
    start_lives = info['ale.lives']
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    model.train()
    env.seed(args.seed + rank)
    state = env.reset()
    state = torch.from_numpy(state).float()
    done = True
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 512))
            hx = Variable(torch.zeros(1, 512))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []
        for step in range(args.num_steps):
            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)
            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))
            state, reward, done, info = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            if args.count_lives:
                if start_lives > info['ale.lives']:
                    done = True
            reward = max(min(reward, 1), -1)
            if done:
                episode_length = 0
                state = env.reset()
            state = torch.from_numpy(state).float()
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
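# The return/advantage bookkeeping that every train variant here repeats,
# pulled out as a self-contained sketch (written against modern PyTorch,
# without Variable). Inputs are hypothetical rollout buffers; `values` holds
# one more entry than `rewards` because the bootstrap value R is appended
# last, exactly as in the loops above.
import torch

def a3c_losses_sketch(rewards, values, log_probs, entropies,
                      gamma=0.99, tau=1.00, entropy_beta=0.01):
    policy_loss, value_loss = 0, 0
    R = values[-1].detach()  # bootstrap from the appended value
    gae = torch.zeros(1, 1)
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]  # n-step return
        advantage = R - values[i]
        value_loss = value_loss + 0.5 * advantage.pow(2)
        # Generalized Advantage Estimation: discounted sum of TD errors.
        delta_t = rewards[i] + gamma * values[i + 1].detach() \
            - values[i].detach()
        gae = gae * gamma * tau + delta_t
        policy_loss = policy_loss - log_probs[i] * gae.detach() \
            - entropy_beta * entropies[i]
    return policy_loss, value_loss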
def train(args, envs, observation_space, action_space):
    gpu_id = 0
    # Each worker runs its own environment and model on the GPU.
    player = Agent(envs, args)
    player.model = A3Clstm(observation_space, action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    with torch.cuda.device(gpu_id):
        player.model = player.model.cuda()
        player.state = player.state.cuda()
        player.cx = torch.zeros(args.workers, 512).cuda()
        player.hx = torch.zeros(args.workers, 512).cuda()
    optimizer = torch.optim.Adam(player.model.parameters(), lr=args.lr,
                                 amsgrad=args.amsgrad)
    # Switch to training mode.
    player.model.train()
    while True:
        # Roll out num_steps steps, or stop early on game over; per-step
        # data is appended to the rollout lists.
        for step in range(args.num_steps):
            player.env.get_images()
            player.action_train()
            if player.dones[-1][0]:
                break

        if not player.dones[-1][0]:
            value, _, _ = player.model((player.state, (player.hx, player.cx)))
            R = value.detach()
        else:
            R = torch.zeros(args.workers, 1)
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        player.values.append(R)

        for j in range(args.num_ppo_train):
            policy_loss = 0
            value_loss = 0
            gae = 0
            for i in reversed(range(len(player.rewards))):
                value, logit, _ = player.model(
                    (player.states[i], (player.hxs[i], player.cxs[i])))
                prob = F.softmax(logit, dim=1)
                log_prob = F.log_softmax(logit, dim=1)
                entropy = -(log_prob * prob).sum(1)
                log_probs_current = log_prob.gather(1, player.actions[i])

                R = args.gamma * R + player.rewards[i]
                advantage = R - value
                value_loss = value_loss + 0.5 * advantage.pow(2)

                # Generalized Advantage Estimation
                delta_t = player.rewards[i] + args.gamma * player.values[
                    i + 1].detach() - player.values[i].detach()
                gae = gae * args.gamma * args.tau + delta_t

                ratio = torch.exp(log_probs_current - player.log_probs[i])
                surr1 = ratio
                surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                    1.0 + args.clip_param)
                # Clipped surrogate plus an entropy bonus.
                policy_loss = policy_loss - torch.min(
                    surr1, surr2) * gae - 0.01 * entropy.unsqueeze(1)
            optimizer.zero_grad()
            (policy_loss + 0.5 * value_loss).mean().backward()
            optimizer.step()

        # Reset the LSTM memory on game over.
        if player.dones[-1][0]:
            with torch.cuda.device(gpu_id):
                player.cx = torch.zeros(args.workers, 512).cuda()
                player.hx = torch.zeros(args.workers, 512).cuda()
        else:
            player.cx = player.cx.detach()
            player.hx = player.hx.detach()
        player.clear_actions()

    # Scratch notes (translated): advantage[0:n] is the value difference
    # from steps 0, 1, 2, ..., n to n+1; walking i = n, n-1, ..., 1 gives
    #   r[n] + Value(N+1) - Value(N)
    #   r[n:n-1] + Value(N+1) - Value(N-1)
    #   ...
    #   r[n:1] + Value(N+1) - Value(1)
    # i.e. R = args.gamma * R + player.rewards[i],
    #      advantage = R - player.values[i],
    #      value_loss = value_loss + 0.5 * advantage.pow(2).
    # entropy = -(log_prob * prob).sum(1) accumulates, per step, the entropy
    # over all action probabilities; actions and their log-probs are sampled
    # from prob.
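# The inner loop above recomputes log-probs under the current policy and
# clips the importance ratio, i.e. PPO's clipped surrogate. For reference,
# the textbook form as a sketch on hypothetical per-step tensors; note the
# standard objective multiplies the advantage inside the min, whereas the
# loop above takes the min over the bare ratios first.
import torch

def ppo_policy_loss_sketch(new_log_prob, old_log_prob, advantage,
                           clip_param=0.2):
    ratio = torch.exp(new_log_prob - old_log_prob)  # pi_new / pi_old
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1.0 - clip_param,
                        1.0 + clip_param) * advantage
    return -torch.min(surr1, surr2).mean()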
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = Environment()  # create the environment
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)
    # env.seed(args.seed + rank)
    player = Agent(None, env, args, None)  # create the agent
    player.gpu_id = gpu_id
    num_actions = env.get_num_actions()
    player.model = A3Clstm(Config.STACKED_FRAMES,  # the A3C model
                           num_actions)
    player.state, available = player.env.reset()  # initialize the environment
    player.state = torch.from_numpy(player.state).float()
    player.available = torch.from_numpy(available).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
            player.available = player.available.cuda()
    player.model.train()  # training mode
    player.eps_len += 1
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())  # sync with the shared network
        if player.done:
            # A rollout finished: reset the LSTM state.
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):  # t_max = 20
            player.action_train()
            if player.done:
                break

        if player.done:
            state, available = player.env.reset()
            player.state = torch.from_numpy(state).float()
            player.available = torch.from_numpy(available).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
                    player.available = player.available.cuda()

        R = torch.zeros(1, 1)  # if done: R_tmax = 0
        if not player.done:
            value, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data  # otherwise R_tmax = V(s)

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
# Implemented multiprocessing using locks but was not beneficial. Hogwild
# training was far superior
if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    setup_json = read_config(args.env_config)
    obs_shape = setup_json["Spaces"]["observation_channels"]
    action_space = gym.spaces.Discrete(setup_json["Spaces"]["action_shape"])
    shared_model = A3Clstm(obs_shape, action_space)
    if args.load:
        saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir,
                                                     args.env),
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args.shared_optimizer:
        if args.optimizer == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)
        optimizer.share_memory()
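# SharedRMSprop / SharedAdam keep optimizer statistics in shared memory so
# every Hogwild worker updates one copy. A minimal sketch of the idea,
# assuming the usual construction (field names mirror torch.optim.Adam;
# the repo's actual classes may differ in detail, e.g. by also overriding
# step()):
import torch
import torch.optim as optim

class SharedAdamSketch(optim.Adam):
    def __init__(self, params, lr=1e-3, amsgrad=False):
        super(SharedAdamSketch, self).__init__(params, lr=lr, amsgrad=amsgrad)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                # Pre-create the moment buffers so they can be shared.
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)
                if amsgrad:
                    state['max_exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()
                if 'max_exp_avg_sq' in state:
                    state['max_exp_avg_sq'].share_memory_()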
def test(args, shared_model, env_conf):
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger(
        '{}_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    env = atari_env(args.env, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state).float()
    reward_sum = 0
    done = True
    start_time = time.time()
    episode_length = 0
    num_tests = 0
    reward_total_sum = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 512), volatile=True)
            hx = Variable(torch.zeros(1, 512), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        if done:
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, episode_length, reward_mean))
            if reward_sum > args.save_score_level:
                model.load_state_dict(shared_model.state_dict())
                state_to_save = model.state_dict()
                torch.save(state_to_save,
                           '{0}{1}.dat'.format(args.save_model_dir, args.env))
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state).float()
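# Across these snippets the train loops sample actions from the softmax
# (explore) while the test loops take the argmax (exploit). Sketch of both
# branches on a hypothetical logit tensor of shape (1, num_actions), written
# against modern PyTorch:
import torch
import torch.nn.functional as F

def select_action_sketch(logit, training):
    prob = F.softmax(logit, dim=1)
    if training:
        return prob.multinomial(num_samples=1)  # sample, as in action_train
    return prob.max(1, keepdim=True)[1]         # greedy, as in action_test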
def train(rank, args, shared_model, optimizer, env_conf, iters,
          checkpoint_path):
    iters = dill.loads(iters)
    if args.enable_gavel_iterator and rank == 0:
        iters._init_logger()
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    elapsed_time = 0
    start_time = time.time()
    for i in iters:
        if i % 100 == 0:
            print('GPU %d finished step %d' % (rank, i), flush=True)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        # Use a separate index here so the outer Gavel iteration counter `i`
        # is not clobbered before the throughput checks below.
        for j in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[j]
            advantage = R - player.values[j]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[j] + args.gamma * \
                player.values[j + 1].data - player.values[j].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[j] * \
                Variable(gae) - 0.01 * player.entropies[j]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        elapsed_time += time.time() - start_time
        start_time = time.time()
        if (args.throughput_estimation_interval is not None and
                i % args.throughput_estimation_interval == 0 and rank == 0):
            print('[THROUGHPUT_ESTIMATION]\t%s\t%d' % (time.time(), i))
        if (args.max_duration is not None and
                elapsed_time >= args.max_duration):
            break

    if args.enable_gavel_iterator and rank == 0:
        state = shared_model.state_dict()
        iters.save_checkpoint(state, checkpoint_path)
    iters.complete()
if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]
    env = atari_env(args.env, env_conf, args)
    # Global network copy that the workers sync to and from.
    shared_model = A3Clstm(env.observation_space.shape[0], env.action_space,
                           args.terminal_prediction, args.reward_prediction)
    if args.load:
        saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir,
                                                     args.env),
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args.shared_optimizer:
        if args.optimizer == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)
def train(rank, args, shared_model, optimizer):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    writer = SummaryWriter(log_dir=args.log_dir + 'tb_train')
    log = {}
    setup_logger('{}_train_log'.format(rank),
                 r'{0}{1}_train_log'.format(args.log_dir, rank))
    log['{}_train_log'.format(rank)] = logging.getLogger(
        '{}_train_log'.format(rank))
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(env_id=rank, args=args, type='train')
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[2],
                           player.env.action_space.n)
    player.state = player.env.reset()
    player.state = normalize_rgb_obs(player.state)
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    num_trains = 0
    if not os.path.exists(args.log_dir + "images/"):
        os.makedirs(args.log_dir + "images/")
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            num_trains += 1
            log['{}_train_log'.format(rank)].info('entropy:{0}'.format(
                player.entropy.data[0]))
            writer.add_scalar("data/entropy_" + str(rank),
                              player.entropy.data[0], num_trains)
            writer.add_image('FCN_' + str(rank), player.fcn, num_trains)
            writer.add_image('Depth_GroundTruth_' + str(rank), player.depth,
                             num_trains)
            writer.add_image('RGB_' + str(rank), player.env.get_rgb(),
                             num_trains)
            save_image(
                player.fcn.data, args.log_dir + "images/" + str(rank) + "_" +
                str(num_trains) + "_fcn.png")
            # print("player.fcn.data:", player.fcn.data)
            save_image(
                player.depth.data, args.log_dir + "images/" + str(rank) +
                "_" + str(num_trains) + "_depth.png")
            cv2.imwrite(
                args.log_dir + "images/" + str(rank) + "_" +
                str(num_trains) + "_rgb.png", player.env.get_rgb())
            # print("player.depth.data:", player.depth.data)
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            state = normalize_rgb_obs(state)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            with torch.cuda.device(gpu_id):
                value, _, _, _ = player.model(
                    (Variable(player.state.unsqueeze(0)),
                     (player.hx, player.cx),
                     Variable(torch.from_numpy(player.env.target).type(
                         torch.FloatTensor).cuda())))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = args.gamma * player.values[
                i + 1].data + player.rewards[i] - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            # policy_loss = policy_loss - \
            #     player.log_probs[i] * Variable(gae) - \
            #     0.01 * player.entropies[i] + player.fcn_losses[i]  # FCN
            policy_loss = policy_loss \
                - 1e-5 * (player.log_probs[i] * Variable(gae)) \
                - 1e-5 * (0.01 * player.entropies[i]) \
                + player.fcn_losses[i] * DEPTH_LOSS_DISCOUNT  # FCN
            # policy_loss = policy_loss + player.fcn_losses[i]  # FCN

        writer.add_scalar("data/value_loss_" + str(rank), value_loss,
                          num_trains)
        writer.add_scalar("data/policy_loss_" + str(rank), policy_loss,
                          num_trains)
        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 40.0)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
def test(args, shared_model, env_conf):
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    env = atari_env(args.env, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)
    state = env.reset()
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(model, env, args, state)
    player.state = torch.from_numpy(state).float()
    player.model.eval()
    while True:
        if player.starter and player.flag:
            player = player_start(player)
        else:
            player.flag = False
        if player.done and not player.flag:
            player.model.load_state_dict(shared_model.state_dict())
            player.cx = Variable(torch.zeros(1, 512), volatile=True)
            player.hx = Variable(torch.zeros(1, 512), volatile=True)
            player.flag = False
        elif not player.flag:
            player.cx = Variable(player.cx.data, volatile=True)
            player.hx = Variable(player.hx.data, volatile=True)
            player.flag = False

        if not player.flag:
            player, reward = player_act(player, train=False)
            reward_sum += reward

        if not player.done:
            if player.current_life > player.info['ale.lives']:
                player.flag = True
                player.current_life = player.info['ale.lives']
            else:
                player.current_life = player.info['ale.lives']
                player.flag = False

        if player.done:
            num_tests += 1
            player.current_life = 0
            player.flag = True
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))
            if reward_sum > args.save_score_level:
                player.model.load_state_dict(shared_model.state_dict())
                state_to_save = player.model.state_dict()
                torch.save(state_to_save,
                           '{0}{1}.dat'.format(args.save_model_dir, args.env))
            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            time.sleep(60)
            player.state = torch.from_numpy(state).float()
def test(args, shared_model, env_conf, shared_counter):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    device = torch.device('cuda:{}'.format(gpu_id) if gpu_id >= 0 else 'cpu')
    log = {}
    setup_logger(
        '{}_log'.format(args.env),
        os.path.join(args.log_dir, '{}-{}_log'.format(args.env,
                                                      args.exp_name)))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    env = atari_env(args.env, env_conf, args)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None, gpu_id=gpu_id)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.model.apply(weights_init)
    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).to(torch.float32)
    player.model = player.model.to(device)
    player.state = player.state.to(device)
    flag = True
    max_score = 0
    while True:
        if flag:
            player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward

        if player.done and not player.info:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).to(torch.float32)
            player.state = player.state.to(device)
        elif player.info:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}, alpha {4:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean,
                    player.model.log_alpha.exp().detach().item()))
            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                torch.save(
                    player.model.state_dict(),
                    os.path.join(args.save_model_dir,
                                 '{}-{}.dat'.format(args.env, args.exp_name)))
            with shared_counter.get_lock():
                shared_counter.value += player.eps_len
                if shared_counter.value > args.interact_steps:
                    break
            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            time.sleep(10)
            player.state = torch.from_numpy(state).to(torch.float32)
            player.state = player.state.to(device)
log['{}_mon_log'.format(args.env)] = logging.getLogger('{}_mon_log'.format(
    args.env))
d_args = vars(args)
for k in d_args.keys():
    log['{}_mon_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

env = Environment(True)  # atari_env(True)
# env = atari_env("{}".format(args.env), env_conf, args)
num_tests = 0
start_time = time.time()
reward_total_sum = 0
player = Agent(None, env, args, None)
num_actions = env.get_num_actions()
player.model = A3Clstm(Config.STACKED_FRAMES, num_actions)
player.gpu_id = gpu_id
if gpu_id >= 0:
    with torch.cuda.device(gpu_id):
        player.model = player.model.cuda()
if gpu_id >= 0:
    with torch.cuda.device(gpu_id):
        player.model.load_state_dict(saved_state)
else:
    player.model.load_state_dict(saved_state)

player.model.eval()  # switch the model to evaluation mode
for i_episode in range(args.num_episodes):
    player.state, _ = player.env.reset()
def train_rep(args, shared_model, env_conf):
    batch_size = 16
    train_times = args.rep_train_time
    trace = []
    td_class = [(0, 1), (1, 2), (2, 3), (3, 5), (5, 7), (7, 9)]
    loss_fn = nn.CrossEntropyLoss()
    optimizer_r = Adam(shared_model.r_net.parameters(), lr=args.rl_r)
    optimizer_c = Adam(shared_model.c_net.parameters(), lr=args.rl_r)
    ptitle('Train rep')
    gpu_id = args.gpu_ids[-1]
    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = atari_env(args.env, env_conf, args)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
            # player.model.r_net = player.model.r_net.cuda()
            # player.model.c_net = player.model.c_net.cuda()
    flag = True
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.train()
            flag = False

        player.action_test()
        trace.append(player.state)
        if len(trace) > args.trace_length:
            # Run a number of representation updates on the collected trace.
            for _ in range(train_times):
                range_c = np.random.randint(0, len(td_class))
                TD = np.random.randint(td_class[range_c][0],
                                       td_class[range_c][1])
                begin = np.random.randint(0, len(trace) - TD - batch_size)
                former = torch.stack(trace[begin:begin + batch_size], dim=0)
                latter = torch.stack(
                    trace[begin + TD:begin + TD + batch_size], dim=0)
                target = torch.zeros(batch_size, dtype=torch.long) + range_c
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        former = former.cuda()
                        latter = latter.cuda()
                        target = target.cuda()
                rep_f, rep_l = player.model.r_net(former), player.model.r_net(
                    latter)
                output = player.model.c_net(rep_f, rep_l, False)
                loss = loss_fn(output, target)
                optimizer_r.zero_grad()
                optimizer_c.zero_grad()
                loss.backward()
                ensure_shared_grads(player.model.r_net, shared_model.r_net,
                                    gpu=gpu_id >= 0)
                ensure_shared_grads(player.model.c_net, shared_model.c_net,
                                    gpu=gpu_id >= 0)
                optimizer_r.step()
                optimizer_c.step()
            trace = []

        if player.done and not player.info:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            flag = True
            state = player.env.reset()
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
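# What the sampling in train_rep does, in isolation: pick a temporal-distance
# bucket, cut an aligned pair of frame batches that far apart out of the
# trace, and label the pair with the bucket index for the classifier. Sketch
# with a hypothetical `trace` list of stacked-frame tensors standing in for
# the agent's observation history:
import numpy as np
import torch

td_class = [(0, 1), (1, 2), (2, 3), (3, 5), (5, 7), (7, 9)]

def sample_pair_batch_sketch(trace, batch_size=16):
    range_c = np.random.randint(0, len(td_class))          # distance bucket
    TD = np.random.randint(td_class[range_c][0], td_class[range_c][1])
    begin = np.random.randint(0, len(trace) - TD - batch_size)
    former = torch.stack(trace[begin:begin + batch_size], dim=0)
    latter = torch.stack(trace[begin + TD:begin + TD + batch_size], dim=0)
    target = torch.full((batch_size,), range_c, dtype=torch.long)
    return former, latter, target  # classifier predicts the bucket index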
# Based on
# https://github.com/pytorch/examples/tree/master/mnist_hogwild
# Training settings
# Implemented multiprocessing using locks but was not beneficial. Hogwild
# training was far superior
if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    shared_model = A3Clstm(OBSERVATION_SPACE_SHAPE[3], ACTION_SIZE)
    if args.load:
        saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir,
                                                     args.env),
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args.shared_optimizer:
        if args.optimizer == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)
        optimizer.share_memory()
    else:
        optimizer = None
def train(rank, args, shared_model, optimizer, env_conf, shared_counter,
          targ_shared):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    device = torch.device('cuda:{}'.format(gpu_id) if gpu_id >= 0 else 'cpu')
    torch.manual_seed(args.seed + rank)
    torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None, gpu_id=gpu_id)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.model.apply(weights_init)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).to(torch.float32)
    player.state = player.state.to(device)
    player.model = player.model.to(device)
    # player.targ_model = copy.deepcopy(player.model)
    player.model.train()
    # player.targ_model.eval()
    player.eps_len += 2
    while True:
        player.model.load_state_dict(shared_model.state_dict())
        # player.targ_model.load_state_dict(targ_shared.state_dict())
        if player.done:
            player.cx = torch.zeros(1, 512).to(device)
            player.hx = torch.zeros(1, 512).to(device)
            # player.targ_cx = copy.deepcopy(player.cx).detach()
            # player.targ_hx = copy.deepcopy(player.hx).detach()
        else:
            player.cx = player.cx.detach()
            player.hx = player.hx.detach()

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).to(torch.float32)
            player.state = player.state.to(device)

        # alpha = player.model.log_alpha.exp().detach()
        alpha = .01
        # alpha = 0
        x_R = torch.zeros(1, 1)
        if not player.done:
            with torch.no_grad():
                action, value, logit, q_value, _ = player.model(
                    (player.state.unsqueeze(0), (player.hx, player.cx)))
                x_R = q_value[1].detach() - alpha * F.log_softmax(
                    logit, -1).gather(-1, action)
        x_R = x_R.to(device)

        policy_loss = 0
        adv_gae_loss = 0
        for i in reversed(range(len(player.rewards))):
            x_R = args.gamma * x_R + player.rewards[i]
            adv_gae_loss = adv_gae_loss + (player.tra_adv_gae[i][1] -
                                           x_R.detach()).pow(2) * .5
            # policy_loss = policy_loss - player.log_probs[i] * \
            #     player.tra_adv_gae[i][0].detach() + \
            #     alpha * player.log_probs[i] * player.log_probs[i].detach()
            policy_loss = policy_loss - (F.softmax(
                player.values[i], -1) *
                player.tra_adv_gae[i][0].detach()).sum(
                    -1) - alpha * player.entropies[i].unsqueeze(-1)
            # policy_loss = policy_loss - player.log_probs[i] * \
            #     (x_R - (F.softmax(player.values[i], -1) *
            #             player.tra_adv_gae[i][0]).sum(-1) -
            #      alpha * player.entropies[i]).detach() + \
            #     alpha * player.log_probs[i] * player.log_probs[i].detach()
            # prob = F.softmax(player.values[i], -1)
            # ent_alpha = alpha * player.entropies[i].unsqueeze(-1)
            # advs = (player.tra_adv_gae[i][0] -
            #         ((player.tra_adv_gae[i][0] * prob).sum(-1, True) +
            #          ent_alpha)).detach()
            # policy_loss = policy_loss - (prob * advs).sum(-1) - ent_alpha
            x_R = x_R - alpha * player.log_probs[i].detach()

        player.model.zero_grad()
        (policy_loss + .5 * adv_gae_loss).backward(retain_graph=False)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
        with shared_counter.get_lock():
            shared_counter.value += len(player.rewards)
            if shared_counter.value > args.interact_steps:
                break
def test(args, shared_model, env_conf):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger(
        '{}_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    print("test proc:")
    env = AllowBacktracking(
        make_local_env(env_conf['game'], env_conf['level'], stack=False,
                       scale_rew=False))
    print("test got env:", env.observation_space)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    flag = True
    max_score = 0
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward
        """
        if player.done and player.info['ale.lives'] > 0 and not player.max_length:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        """
        if player.done or player.max_length:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))
            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}.dat'.format(args.save_model_dir,
                                                args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}{1}.dat'.format(args.save_model_dir, args.env))
            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
def train(rank, args, shared_model, optimizer, env_conf, num_tau_samples=32,
          num_tau_prime_samples=32, kappa=1.0, num_quantiles=32):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, num_tau_prime_samples)
        if not player.done:
            # Greedy action from the mean over quantile samples, then take
            # that action's quantile estimates as the bootstrap target.
            logit, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            q_vals = torch.mean(logit, 0)
            _, action = torch.max(q_vals, 0)
            logit, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = logit[:, action]

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        # R = R.detach()
        R = Variable(R)
        value_loss = 0
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R.repeat(num_tau_samples, 1) - \
                player.logits_array[i].repeat(1, num_tau_prime_samples)
            # Quantile Huber loss, kappa-smoothed.
            loss = (torch.abs(advantage) <= kappa).float() * \
                0.5 * advantage ** 2
            loss += (torch.abs(advantage) > kappa).float() * kappa * \
                (torch.abs(advantage) - 0.5 * kappa)
            step_loss = torch.abs(player.quantiles_array[i].cuda() -
                                  (advantage.detach() < 0).float()) * \
                loss / kappa
            value_loss += step_loss.sum(0).mean(0)

        player.model.zero_grad()
        value_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
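# The loss assembled in the loop above is the quantile Huber loss from
# distributional RL (QR-DQN / IQN, Dabney et al.). Standalone sketch on a
# hypothetical pairwise TD-error matrix `td` of shape
# (num_tau, num_tau_prime) and quantile fractions `quantiles` of shape
# (num_tau, 1):
import torch

def quantile_huber_loss_sketch(td, quantiles, kappa=1.0):
    huber = torch.where(td.abs() <= kappa,
                        0.5 * td ** 2,
                        kappa * (td.abs() - 0.5 * kappa))
    # Asymmetric weight: each quantile fraction penalizes over- and
    # under-estimation differently.
    weight = torch.abs(quantiles - (td.detach() < 0).float())
    return (weight * huber / kappa).sum(0).mean()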
log = {}
setup_logger('{}_mon_log'.format(args.env),
             r'{0}{1}_mon_log'.format(args.log_dir, args.env))
log['{}_mon_log'.format(args.env)] = logging.getLogger('{}_mon_log'.format(
    args.env))
d_args = vars(args)
for k in d_args.keys():
    log['{}_mon_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

env = atari_env("{}".format(args.env), env_conf, args)
num_tests = 0
start_time = time.time()
reward_total_sum = 0
player = Agent(None, env, args, None)
player.model = A3Clstm(player.env.observation_space.shape[0],
                       player.env.action_space)
player.gpu_id = gpu_id
if gpu_id >= 0:
    with torch.cuda.device(gpu_id):
        player.model = player.model.cuda()
if args.new_gym_eval:
    player.env = gym.wrappers.Monitor(player.env,
                                      "{}_monitor".format(args.env),
                                      force=True)
if gpu_id >= 0:
    with torch.cuda.device(gpu_id):
        player.model.load_state_dict(saved_state)
else:
    player.model.load_state_dict(saved_state)

player.model.eval()
def train(rank, args, shared_model, optimizer, env_conf):
    torch.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()
    while True:
        player.model.load_state_dict(shared_model.state_dict())
        for step in range(args.num_steps):
            player.action_train()
            if args.count_lives:
                player.check_state()
            if player.done:
                break

        if player.done:
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.clear_actions()
def test(args, shared_model, env_conf, lock, counter):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger(
        '{}_log'.format(args.env),
        r'{0}{1}-{2}_log'.format(args.log_dir, args.env, args.log_target))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = atari_env(args.env, env_conf, args)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    flag = True
    max_score = 0
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward

        if player.done and not player.info:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            with lock:
                counter.value += 1
            log['{}_log'.format(args.env)].info(
                "UpdateStep {0} Time {1}, episode reward {2}, episode length {3}, reward mean {4:.4f}"
                .format(
                    counter.value,
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))
            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}_{2}.dat'.format(args.save_model_dir,
                                                    args.env,
                                                    args.log_target))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}{1}_{2}.dat'.format(args.save_model_dir, args.env,
                                                args.log_target))
            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
# python main.py --env Pong-v0 --workers 7 --gpu-ids 0 --amsgrad True \
#     --pre-rnet 1wsam --rep-train-time 10 --trace-length 50 --log-target name
if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]
    env = atari_env(args.env, env_conf, args)
    shared_model = A3Clstm(env.observation_space.shape[0], env.action_space,
                           args.pre_rnet)
    if args.load:
        saved_state = torch.load(
            '{0}{1}_{2}.dat'.format(args.load_model_dir, args.env,
                                    args.log_target),
            map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args.shared_optimizer:
        if args.optimizer == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)
        optimizer_r = SharedAdam(shared_model.r_net.parameters(),
if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]
    env = Environment()
    num_actions = env.get_num_actions()
    shared_model = A3Clstm(Config.STACKED_FRAMES, num_actions)
    if args.load:
        saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir,
                                                     args.env),
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args.shared_optimizer:
        if args.optimizer == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)
        optimizer.share_memory()
def train(rank, reward_type, args, shared_model, optimizer, env_conf):
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf)
    env.seed(args.seed + rank)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None, reward_type)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()
    for t in itertools.count():
        if t % 10 == 0:
            print("reward type {0}, iter {1}".format(reward_type, t))
        player.model.load_state_dict(shared_model.state_dict())
        for step in range(args.num_steps):
            player.action_train()
            reward_sum += player.reward
            if args.count_lives:
                player.check_state()
            if player.done:
                break

        if player.done:
            num_tests += 1
            player.current_life = 0
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.clear_actions()
def train(rank, args, shared_model, optimizer, env_conf):
    torch.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    env.seed(args.seed + rank)
    state = env.reset()
    player = Agent(model, env, args, state)
    player.state = torch.from_numpy(state).float()
    player.model.train()
    epoch = 0
    while True:
        player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            player.cx = Variable(torch.zeros(1, 512))
            player.hx = Variable(torch.zeros(1, 512))
            if player.starter:
                player = player_start(player, train=True)
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player = player_act(player, train=True)
            if player.done:
                break
            if player.current_life > player.info['ale.lives']:
                player.flag = True
                player.current_life = player.info['ale.lives']
            else:
                player.current_life = player.info['ale.lives']
                player.flag = False
            if args.count_lives:
                if player.flag:
                    player.done = True
                    break
            if player.starter and player.flag:
                player = player_start(player, train=True)
                if player.done:
                    break

        if player.done:
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            player.flag = False

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss += 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * Variable(gae) - \
                0.01 * player.entropies[i]

        optimizer.zero_grad()
        (policy_loss + value_loss).backward()
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.values = []
        player.log_probs = []
        player.rewards = []
        player.entropies = []
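# ensure_shared_grads is called by every train() variant but is not
# defined in this file. A minimal sketch of the common A3C pattern
# (an assumption, CPU-only case): alias the shared model's gradients
# to the worker's gradient tensors, then let optimizer.step() update
# the shared parameters.
def ensure_shared_grads_sketch(model, shared_model):  # hypothetical name
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            # Shared grads already alias this worker's grad tensors;
            # nothing more to do.
            return
        shared_param._grad = param.grad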
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    tp_weight = args.tp
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space,
                           args.terminal_prediction,
                           args.reward_prediction)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()

    # Below is where the worker cores run episodes continuously.
    average_ep_length = 0
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 128).cuda())
                    player.hx = Variable(torch.zeros(1, 128).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 128))
                player.hx = Variable(torch.zeros(1, 128))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.eps_len += 1
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        reward_pred_loss = 0
        terminal_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        # TODO: why is this here? (wraps the bootstrapped value as a
        # Variable so it can enter the loss computation below)
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - player.log_probs[i] * Variable(
                gae) - 0.01 * player.entropies[i]

            if args.reward_prediction:
                reward_pred_loss = reward_pred_loss + (
                    player.reward_predictions[i] - player.rewards[i]).pow(2)

        if args.terminal_prediction:
            # Use the empirical average episode length as a proxy for the
            # current episode's (unknown) length when building the
            # terminal-progress labels.
            if player.average_episode_length is None:
                # No average yet; fall back to the running episode
                # length (heuristic).
                end_predict_labels = np.arange(
                    player.eps_len - len(player.terminal_predictions),
                    player.eps_len) / player.eps_len
            else:
                end_predict_labels = np.arange(
                    player.eps_len - len(player.terminal_predictions),
                    player.eps_len) / player.average_episode_length

            for i in range(len(player.terminal_predictions)):
                terminal_loss = terminal_loss + (
                    player.terminal_predictions[i] -
                    end_predict_labels[i]).pow(2)
            terminal_loss = terminal_loss / len(player.terminal_predictions)

        player.model.zero_grad()
        # print(f"policy loss {policy_loss} and value loss {value_loss} and "
        #       f"terminal loss {terminal_loss} and reward pred loss {reward_pred_loss}")
        total_loss = policy_loss + 0.5 * value_loss + \
            tp_weight * terminal_loss + 0.5 * reward_pred_loss
        total_loss.backward()  # will free the graph's memory

        # Visualize the computation graph:
        # graph = make_dot(total_loss)
        # from graphviz import Source
        # Source.view(graph)

        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        if player.done:
            if player.average_episode_length is None:  # first completed episode
                player.average_episode_length = player.eps_len
            else:
                # Exponential moving average of episode length.
                player.average_episode_length = int(
                    0.99 * player.average_episode_length +
                    0.01 * player.eps_len)
            # print(player.average_episode_length, 'current one is ', player.eps_len)
            player.eps_len = 0  # reset for the next episode
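# A quick check of the terminal-progress labels built above, with
# hypothetical numbers (eps_len=100, 20 stored predictions, average
# episode length 200):
import numpy as np
eps_len, n_preds, avg_len = 100, 20, 200
labels = np.arange(eps_len - n_preds, eps_len) / avg_len
# labels == [0.400, 0.405, ..., 0.495]: each step's target is its
# estimated fractional progress through the episode. The 0.99/0.01
# update above keeps an exponential moving average of episode length
# with an effective horizon of roughly 100 episodes.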