def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0])
        reward_sum += reward
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
def test(rank, args, shared_model): torch.manual_seed(args.seed + rank) env = create_atari_env(args.env_name) env.seed(args.seed + rank) model = ActorCritic(env.observation_space.shape[0], env.action_space.n, args.lstm_size) model.eval() state = env.reset() state = torch.from_numpy(state) reward_sum = 0 done = True start_time = time.time() #actions=deque(maxlen=100) episode_length = 0 currentPath = os.getcwd() File = open(currentPath + '/record.txt', 'a+') print("\n\n\n\n------------------------------\n\n\n\n\n") File.write("\n\n\n\n------------------------------\n\n\n\n\n") File.close() cnt = 0 episode_number = 0 while True: env.render() cnt = cnt + 1 episode_length += 1 if done: model.load_state_dict(shared_model.state_dict()) hx = Variable(torch.zeros(1, args.lstm_size), volatile=True) cx = Variable(torch.zeros(1, args.lstm_size), volatile=True) else: hx = Variable(hx.data, volatile=True) cx = Variable(cx.data, volatile=True) #print(state) value, logit, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True), (hx, cx))) prob = F.softmax(logit) #action=prob.max(1)[1].data.numpy() action = prob.multinomial().data #if(args.env_name=='Breakout-v3'): # state,reward,done,_=env.step(1) # reward_sum+=reward #state,reward,done,_ =env.step(action[0,0]) state, reward, done, _ = env.step(action.numpy()) done = done #or episode_length >= args.max_episode_length if episode_length >= args.max_episode_length: done = True reward_sum -= 30 reward_sum += reward #actions.append(action[0,0]) #if actions.count(actions[0])==actions.maxlen: # done=True #if reward!=0: # print("ep %d : game finished,reward: %d " %(episode_number,reward))+('' if reward == #-1 else ' !!!!!!!!') if done: hour = int( time.strftime("%H", time.gmtime(time.time() - start_time))) _min = int( time.strftime("%M", time.gmtime(time.time() - start_time))) print("Time {},episode reward {}, episode length {} ".format( hour * 60 + _min + args.starttime, reward_sum, episode_length)) File = open(currentPath + '/record.txt', 'a+') File.write( "Time {},episode reward {}, episode length {} \n".format( hour * 60 + _min + args.starttime, reward_sum, episode_length)) File.close() reward_sum = 0 episode_length = 0 #actions.clear() state = env.reset() torch.save(model.state_dict(), currentPath + '/A3C.t7') episode_number += 1 time.sleep(60) state = torch.from_numpy(state)
def test(rank, args, shared_model): torch.manual_seed(args.seed + rank) env = create_atari_env(args.env_name) env.seed(args.seed + rank) if not os.path.exists('models-a3c'): os.makedirs('models-a3c') path = 'models-a3c/model-{}.pth'.format(args.model_name) print('saving directory is', path) model = ActorCritic(env.action_space.n, args.num_atoms, args.gamma) model.eval() state = env.reset() state = np.concatenate([state] * 4, axis=0) state = torch.from_numpy(state) reward_sum = 0 done = True action_stat = [0] * model.num_outputs start_time = time.time() episode_length = 0 for ep_counter in itertools.count(1): # Sync with the shared model if done: model.load_state_dict(shared_model.state_dict()) torch.save(shared_model.state_dict(), path) print('saved model') atoms_logit, logit = model(Variable(state.unsqueeze(0), volatile=True)) prob = F.softmax(logit) action = prob.max(1)[1].data.numpy() action_np = action[0, 0] action_stat[action_np] += 1 state_new, reward, done, info = env.step(action_np) dead = is_dead(info) if args.testing: atoms_prob = F.softmax(atoms_logit) value = model.get_v(atoms_prob, batch=False) atoms_prob = atoms_prob.squeeze().data.numpy() print('episode', episode_length, 'normal action', action_np, 'lives', info['ale.lives'], 'value', value) env.render() if ep_counter % 100 == 0: plt.plot(model.z, atoms_prob) plt.title('average v is {}'.format(value)) plt.show() state = np.append(state.numpy()[1:, :, :], state_new, axis=0) done = done or episode_length >= args.max_episode_length reward_sum += reward episode_length += 1 if done: print("Time {}, episode reward {}, episode length {}".format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, episode_length)) print("actions stats real {}".format( action_stat[:model.num_outputs])) reward_sum = 0 episode_length = 0 state = env.reset() env.seed(args.seed + rank + (args.num_processes + 1) * ep_counter) state = np.concatenate([state] * 4, axis=0) action_stat = [0] * model.num_outputs if not args.testing: time.sleep(60) state = torch.from_numpy(state)
                    help='how many training processes to use (default: 4)')
parser.add_argument('--num-steps', type=int, default=20, metavar='NS',
                    help='number of forward steps in A3C (default: 20)')
parser.add_argument('--max-episode-length', type=int, default=10000, metavar='M',
                    help='maximum length of an episode (default: 10000)')
parser.add_argument('--env-name', default='PongDeterministic-v3', metavar='ENV',
                    help='environment to train on (default: PongDeterministic-v3)')

if __name__ == '__main__':
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    env = create_atari_env(args.env_name)
    shared_model = ActorCritic(
        env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()

    processes = []

    p = mp.Process(target=test, args=(args.num_processes, args, shared_model))
    p.start()
    processes.append(p)

    for rank in range(0, args.num_processes):
        p = mp.Process(target=train, args=(rank, args, shared_model))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
class Agent(mp.Process):
    def __init__(self, global_actor_critic, optimizer, input_dims, nb_actions,
                 gamma, lr, name, global_ep_index, env_id):
        super(Agent, self).__init__()
        self.local_actor_critic = ActorCritic(input_dims, nb_actions, gamma)
        self.global_actor_critic = global_actor_critic
        self.name = "w%02i" % name
        self.episode_index = global_ep_index
        self.env = gym.make(env_id)
        self.optimizer = optimizer

    def run(self):
        t_step = 1
        while self.episode_index.value < EPISODES:
            done = False
            observation = self.env.reset()
            score = 0
            self.local_actor_critic.clear_memory()
            while not done:
                action = self.local_actor_critic.choose_action(observation)
                observation_, reward, done, info = self.env.step(action)
                score += reward
                self.local_actor_critic.remember(observation, action, reward)
                if (t_step % T_MAX) == 0 or done:
                    loss = self.local_actor_critic.calc_loss(done)
                    self.optimizer.zero_grad()
                    loss.backward()
                    for local_param, global_param in zip(
                            self.local_actor_critic.parameters(),
                            self.global_actor_critic.parameters()):
                        global_param._grad = local_param.grad
                    self.optimizer.step()
                    self.local_actor_critic.load_state_dict(
                        self.global_actor_critic.state_dict())
                    self.local_actor_critic.clear_memory()
                t_step += 1
                observation = observation_
            with self.episode_index.get_lock():
                self.episode_index.value += 1
            print(self.name, 'episode ', self.episode_index.value,
                  'reward %.1f' % score)
def train(rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    avg_rew_win_size = 25
    avg_rew = 0
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    avg_rew_cnt = 0

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward
            reward = max(min(reward, 1), -1)

            # a quick hack to prevent the agent from getting stuck
            actions.append(action[0, 0])
            if actions.count(actions[0]) == actions.maxlen:
                done = True

            with lock:
                counter.value += 1

            if done:
                avg_rew = avg_rew + reward_sum
                if avg_rew_cnt % avg_rew_win_size == 0:
                    print(" avg. episode reward {}".format(avg_rew / avg_rew_win_size))
                    avg_rew = 0
                print("Time {}, episode reward {}, episode length {}".format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, episode_length))
                episode_length = 0
                reward_sum = 0
                actions.clear()
                state = env.reset()
                avg_rew_cnt = avg_rew_cnt + 1

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
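Several of the training loops in this section call ensure_shared_grads() right before optimizer.step() without showing its definition. A minimal sketch of what such a helper typically looks like in this family of A3C implementations (assuming the local and shared models have identical parameter layouts):

def ensure_shared_grads(model, shared_model):
    # Copy each worker gradient into the shared model's .grad slot so that
    # the shared optimizer updates the shared parameters; once the shared
    # grads point at valid storage, nothing more needs to be done.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad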
class Params():
    def __init__(self):
        self.lr = 0.0001
        self.gamma = 0.99
        self.tau = 1.
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'

# Main run
os.environ['OMP_NUM_THREADS'] = '1'
params = Params()
torch.manual_seed(params.seed)
env = create_atari_env(params.env_name)
shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
shared_model.share_memory()
optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=params.lr)
optimizer.share_memory()
processes = []
p = mp.Process(target=test, args=(params.num_processes, params, shared_model))
p.start()
processes.append(p)
for rank in range(0, params.num_processes):
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
    p.start()
    processes.append(p)
for p in processes:
    p.join()
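The my_optim.SharedAdam used above (and the SharedAdam in later snippets) is not included here. A hedged sketch of the usual idea, an Adam whose per-parameter state tensors are allocated up front so they can be placed in shared memory and updated by every worker process (the full versions typically also override step() to read the shared step counter):

import torch
import torch.optim as optim

class SharedAdam(optim.Adam):
    """Adam with optimizer state pre-allocated so it can live in shared memory."""
    def __init__(self, params, lr=1e-4, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        # Move the moment estimates into shared memory so all A3C workers
        # accumulate into the same optimizer statistics.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()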
if __name__ == '__main__': parser = argparse.ArgumentParser(description='CuLE') parser.add_argument('game', type=str, help='Atari ROM filename') parser.add_argument('--num-stack', type=int, default=4, help='number of images in a stack (default: 4)') args = parser.parse_args() num_stack = args.num_stack env = AtariEnv(args.game, num_envs=1) env.eval() model = ActorCritic(num_stack, env.action_space) shape = (args.num_stack, 84, 84) states = torch.ByteTensor(*shape).zero_() observation = env.reset()[0] states[-1] = downsample(observation).squeeze(-1) actions = env.minimal_actions() N = actions.size(0) options = {'noop': 0, 'right': 1, 'left': 2, 'down': 4, 'up': 8, ' ': 16} action_keys = [ 0, 1, 2, 4, 8, 16, 9, 10, 5, 6, 24, 17, 18, 20, 25, 26, 21, 22 ] action_names = ['NOOP', 'RIGHT', 'LEFT', 'DOWN', 'UP', 'FIRE', 'UPRIGHT', \ 'UPLEFT', 'DOWNRIGHT', 'DOWNLEFT', 'UPFIRE', 'RIGHTFIRE', \ 'LEFTFIRE', 'DOWNFIRE', 'UPRIGHTFIRE', 'UPLEFTFIRE', \
    default=1)
parser.add_argument(
    "--rnd",
    type=bool,
    help="Play against random agent (else against negamax)",
    default=False)
opts = parser.parse_args()

# Autodetect CUDA
use_cuda = T.cuda.is_available()
device = T.device("cuda" if use_cuda else "cpu")
print('Device:', device)

HIDDEN_SIZE = 256

env = ConnectX(switch_prob=0.5, random_agent=opts.rnd, test_mode=True)

model = ActorCritic(env.observation_space.n, env.action_space.n, HIDDEN_SIZE)
model.load_state_dict(T.load(opts.weights))

total_reward = 0
for _ in range(opts.num):
    state = env.reset()
    done = False
    while not done:
        state = T.FloatTensor(state.board).unsqueeze(0).to(device)
        dist, _ = model(state)
        dist_space = dist.sample()
        action = T.argmax(dist_space, dim=1, keepdim=True).cpu().numpy()[0]
        next_state, reward, done, _ = env.step(action)
                    metavar='O', help='use an optimizer without shared momentum.')
parser.add_argument('--model-name', default='def', help='for saving the model')
parser.add_argument('--load-dir', help='load model from path')
parser.add_argument('--testing', default=False, help='to run model')

if __name__ == '__main__':
    os.environ['OMP_NUM_THREADS'] = '1'

    args = parser.parse_args()
    print(args)

    torch.manual_seed(args.seed)
    env = create_atari_env(args.env_name)
    shared_model = ActorCritic(env.observation_space.shape[0],
                               env.action_space, args.num_skips)
    shared_model.share_memory()

    if args.no_shared:
        optimizer = None
    else:
        optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()

    if args.load_dir:
        filename = args.load_dir
        print('==> loading checkpoint {}'.format(filename))
        checkpoint = torch.load(filename)
        shared_model.load_state_dict(checkpoint)
        print('==> loaded checkpoint {}'.format(filename))
def test(name, backend, env_name, rank, args, shared_model, counter, docker, train_mode=True): torch.manual_seed(args.seed + rank) if backend == 'unity3d': if docker: os.chdir('/mnt/code/') env = create_unity3d_env(train_mode=train_mode,\ file_name=env_name, \ worker_id=rank, seed=args.seed, \ docker_training=docker) elif backend == 'gym': env = create_atari_env(env_name) env.seed(args.seed + rank) else: print(f' [!]: {backend} is not a valid backend') raise ValueError print(env.action_space) model = ActorCritic(env.observation_space.shape[0], env.action_space) model.eval() state = env.reset() state = torch.from_numpy(state).float() reward_sum = 0 done = True start_time = time.time() # a quick hack to prevent the agent from stucking history = {'num-steps': [], 'times': [], 'rewards': [], 'episode-length': []} actions = deque(maxlen=100) episode_length = 0 while True: episode_length += 1 # Sync with the shared model if done: model.load_state_dict(shared_model.state_dict()) cx = Variable(torch.zeros(1, 256), volatile=True) hx = Variable(torch.zeros(1, 256), volatile=True) else: cx = Variable(cx.data, volatile=True) hx = Variable(hx.data, volatile=True) value, logit, (hx, cx) = model((Variable( state.unsqueeze(0), volatile=True), (hx, cx))) prob = F.softmax(logit) action = prob.max(1, keepdim=True)[1].data.numpy() state, reward, done, _ = env.step(action[0, 0]) done = done or episode_length >= args.max_episode_length reward_sum += reward # a quick hack to prevent the agent from stucking actions.append(action[0, 0]) if actions.count(actions[0]) == actions.maxlen: done = True if done: end = time.time() - start_time history['num-steps'].append(counter.value) history['times'].append(end) history['rewards'].append(reward_sum) history['episode-length'].append(episode_length) print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format( time.strftime("%Hh %Mm %Ss", time.gmtime(end)), counter.value, counter.value / (end), reward_sum, episode_length)) reward_sum = 0 episode_length = 0 actions.clear() state = env.reset() if train_mode: history['weights'] = shared_model.state_dict() torch.save(history, f'{name}-history.t7') time.sleep(60) state = torch.from_numpy(state).float() env.close()
                    help='Environment')
parser.add_argument('--lstm-size', type=int, default=128, metavar='LSTM',
                    help='lstm size')
parser.add_argument('--loadmodel', type=int, default=0,
                    help='whether to load a saved model')
parser.add_argument('--starttime', type=int, default=0,
                    help='start time')

if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    torch.set_num_threads(1)
    env = create_atari_env(args.env_name)
    shared_model = ActorCritic(
        env.observation_space.shape[0], env.action_space.n, args.lstm_size)
    if args.loadmodel > 0:
        shared_model.load_state_dict(torch.load(os.getcwd() + '/A3C.t7'))
    shared_model.share_memory()

    processes = []
    p = mp.Process(target=test, args=(args.num_processes, args, shared_model))
    p.start()
    processes.append(p)
    for rank in range(args.num_processes):
        print(rank)
        p = mp.Process(target=train, args=(rank, args, shared_model))
        p.start()
        processes.append(p)
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        env.render()
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1, keepdim=True)[1].data.numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
class Agent(object):
    """Interacts with and learns from the environment."""

    def __init__(self, num_agents, state_size, action_size):
        """Initialize an Agent object.

        Params
        ======
            num_agents (int): number of agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size

        # self.model = ActorCriticPolicy(state_size, action_size, 256)
        self.model = ActorCritic(state_size, action_size, 256)
        self.optimizer = optim.Adam(self.model.parameters(), LR, eps=EPSILON)

    def compute_gaes(self, next_value, rewards, masks, values, gamma=0.99, tau=0.95):
        """GAE returns: the advantage plus the value baseline at each step."""
        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns

    def compute_advantage(self, next_value, rewards, masks, values, gamma=0.99, tau=0.95):
        """Same recursion as compute_gaes, but returns the raw advantages."""
        values = values + [next_value]
        gae = 0
        advantages = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            advantages.insert(0, gae)
        return advantages

    def step(self, states, actions, values, rewards, log_probs, masks, next_value):
        returns = self.compute_gaes(next_value, rewards, masks, values)

        returns = torch.cat(returns).detach()
        log_probs = torch.cat(log_probs).detach()
        values = torch.cat(values).detach()
        states = torch.cat(states)
        actions = torch.cat(actions)
        advantages = returns - values
        advantages = (advantages - advantages.mean()) / advantages.std()

        self.learn(ppo_epochs=10, mini_batch_size=32, states=states,
                   actions=actions, log_probs=log_probs, returns=returns,
                   advantages=advantages, clip_param=0.2)

    def step_(self, rollout):
        """Compute advantage estimates at each time step given a trajectory."""
        storage = [None] * (len(rollout) - 1)
        shape = (self.num_agents, 1)
        advantage = torch.Tensor(np.zeros(shape))

        for i in reversed(range(len(rollout) - 1)):
            # rollout[i]  --> (s, a, p(a|s), r, dones, V(s)) for all agents
            # rollout[-1] --> (s, None, None, None, pending_value) for all agents
            state, action, log_prob, reward, done, value = rollout[i]

            # last step - next_return = pending_value
            if i == len(rollout) - 2:
                next_return = rollout[i + 1][-1]

            state = torch.Tensor(state)
            action = torch.Tensor(action)
            reward = torch.Tensor(reward).unsqueeze(1)
            done = torch.Tensor(done).unsqueeze(1)
            next_value = rollout[i + 1][-1]

            # G(t) = r + gamma * G(t+1)
            g_return = reward + GAMMA * next_return * done
            next_return = g_return

            # Compute TD error
            td_error = reward + GAMMA * next_value - value
            # Compute advantages
            advantage = advantage * TAU * GAMMA * done + td_error

            # Add (s, a, p(a|s), g, advantage)
            storage[i] = [state, action, log_prob, g_return, advantage]

        state, action, log_prob, g_return, advantage = map(
            lambda x: torch.cat(x, dim=0), zip(*storage))
        advantage = (advantage - advantage.mean()) / advantage.std()

        # Check dimensions
        # print("States :", state.size(0), " * ", state.size(1))
        # print("Actions :", action.size(0), " * ", action.size(1))
# print ("Log Prob :", log_prob.size(0), " * ", log_prob.size(1) ) # print ("Return :", g_return.size(0), " * ", g_return.size(1) ) # print ("Advantage :", advantage.size(0), " * ", advantage.size(1) ) self.learn(state, action, log_prob, g_return, advantage, self.num_agents) def act(self, states): """Given state as per current policy model, returns action, log probabilities and estimated state values""" dist, values = self.model(states) actions = dist.sample() log_probs = dist.log_prob(actions) log_probs = torch.sum(log_probs, dim=1, keepdim=True) return actions, log_probs, values, dist def sample(self, states, actions, log_probs, returns, advantages): """Randomly sample learning batches from trajectory""" rand_idx = np.random.randint(0, states.size(0), BATCH_SIZE) return states[rand_idx, :], actions[rand_idx, :], log_probs[ rand_idx, :], returns[rand_idx, :], advantages[rand_idx, :] def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage): batch_size = states.size(0) for _ in range(batch_size // mini_batch_size): rand_ids = np.random.randint(0, batch_size, mini_batch_size) yield states[rand_ids, :], actions[rand_ids, :], log_probs[ rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :] # def learn(self, states, actions, log_probs_old, returns, advantages, num_agents): def learn(self, ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, clip_param=0.2): for _ in range(ppo_epochs): # for state, action, old_log_probs, return_, advantage in self.ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages): batch_size = states.size(0) for _ in range(batch_size // mini_batch_size): state, action, old_log_probs, return_, advantage = self.sample( states, actions, log_probs, returns, advantages) _, new_log_probs, values, dist = self.act(state) entropy = dist.entropy().mean() ratio = (new_log_probs - old_log_probs).exp() surr1 = ratio * advantage surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage actor_loss = -torch.min(surr1, surr2).mean() critic_loss = (return_ - values).pow(2).mean() loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy self.optimizer.zero_grad() loss.backward() nn.utils.clip_grad_norm_(self.model.parameters(), GRADIENT_CLIP) self.optimizer.step() def learn_(self, states, actions, log_probs_old, returns, advantages, num_agents): """ Optimize surrogate loss with policy and value parameters using given learning batches.""" for _ in range(NUM_EPOCHS): for _ in range(states.size(0) // BATCH_SIZE): state_samples, action_samples, log_prob_samples, return_samples, advantage_samples = self.sample( states, actions, log_probs_old, returns, advantages) dist, values = self.model(state_samples) log_probs = dist.log_prob(action_samples) log_probs = torch.sum(log_probs, dim=1, keepdim=True) entropy = dist.entropy().mean() ratio = (log_probs - log_prob_samples).exp() # Surrogate Objctive obj = ratio * advantage_samples # Clipped Surrogate Objective obj_clipped = ratio.clamp(1.0 - CLIP, 1.0 + CLIP) * advantage_samples # Compute policy loss: L = min[ r(θ), clip ( r(θ), 1-Ɛ, 1+Ɛ )*A ] - β * entropy policy_loss = -torch.min(obj, obj_clipped).mean(0) - BETA * entropy # Compute value loss: L = ( V(s) - V_t )^2 value_loss = (return_samples - values).pow(2).mean() # Optimize self.optimizer.zero_grad() (policy_loss + 0.5 * value_loss).backward() nn.utils.clip_grad_norm_(self.model.parameters(), GRADIENT_CLIP) self.optimizer.step()
def test(rank, args, shared_model, counter, logger): console_f = logger.init_console_log_file() torch.manual_seed(args.seed + rank) env = create_atari_env(args.env_name) env.seed(args.seed + rank) model = ActorCritic(env.observation_space.shape[0], env.action_space) model.eval() max_score = 0 start_time = time.time() while True: if args.max_counter_num != 0 and counter.value > args.max_counter_num: if args.save_policy_models: logger.save_policy_model(shared_model, counter.value + 1) exit(0) # monitor counter value if counter.value % args.testing_every_counter > 1: continue counter_value = counter.value model.load_state_dict(shared_model.state_dict()) if args.save_policy_models: if counter_value % args.save_policy_models_every <= 5: logger.save_policy_model(shared_model, counter_value) state = env.reset() state = torch.from_numpy(state) reward_sum = 0 done = True # a quick hack to prevent the agent from stucking # actions = deque(maxlen=100) # actions = deque(maxlen=500) actions = deque(maxlen=1000) episode_length = 0 episode_count = 0 episode_rewards_sum = 0 episode_length_sum = 0 while True: episode_length += 1 # Sync with the shared model with torch.no_grad(): if done: cx = Variable(torch.zeros(1, 256)) hx = Variable(torch.zeros(1, 256)) else: cx = Variable(cx.data) hx = Variable(hx.data) value, logit, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx))) prob = F.softmax(logit, dim=1) action = prob.max(1, keepdim=True)[1].data.numpy() state, reward, done, _ = env.step(action[0, 0]) done = done or episode_length >= args.max_episode_length reward_sum += reward # a quick hack to prevent the agent from stucking actions.append(action[0, 0]) if actions.count(actions[0]) == actions.maxlen: done = True if done: episode_count += 1 episode_rewards_sum += reward_sum episode_length_sum += episode_length if episode_count == args.testing_episodes_num: print("Time {}, num steps {}, FPS {:.0f}, avg episode reward {}, avg episode length {}".format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), counter_value, counter_value / (time.time() - start_time), episode_rewards_sum/args.testing_episodes_num, episode_length_sum/args.testing_episodes_num)) logger.write_results_log(console_f, time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), counter_value, counter_value / (time.time() - start_time), episode_rewards_sum / args.testing_episodes_num, episode_length_sum / args.testing_episodes_num) if args.save_max and (episode_rewards_sum / args.testing_episodes_num) >= max_score: max_score = episode_rewards_sum / args.testing_episodes_num logger.save_policy_model(shared_model, count="max_reward") break reward_sum = 0 episode_length = 0 actions.clear() state = env.reset() state = torch.from_numpy(state)
if done: print("Time {}, episode reward {}, episode length {}". format(get_elapsed_time_str(), reward_sum, episode_length)) reward_sum = 0 episode_length = 0 actions.clear() state = env.reset() time.sleep(60) state = torch.from_numpy(state) if __name__ == '__main__': env = create_atari_env(args.rom) # torch.manual_seed(SEED) shared_model = ActorCritic(env.observation_space.shape[0], env.action_space) shared_model.share_memory() # print (shared_model.conv1._parameters['weight'].data.is_cuda) optimizer = SharedAdam(shared_model.parameters(), lr=0.0001) optimizer.share_memory() if args.play: if os.path.isfile(args.play): print("=> loading checkpoint '{}'".format(args.play)) checkpoint = torch.load(args.play) # args.start_epoch = checkpoint['epoch'] # best_prec1 = checkpoint['best_prec1'] shared_model.load_state_dict(checkpoint['state_dict']) #optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch']))
def train(rank, shared_model, optimizer):
    """
    :param rank: worker-ID
    :param shared_model: model to sync between workers
    :param optimizer:
    :return:
    """
    # torch.manual_seed(SEED + rank)
    ac_steps = 20
    max_episode_length = 10000
    gamma = 0.99
    tau = 1.0
    max_grad_norm = 50.0
    checkpoint_n = 20

    env = create_atari_env(romname)
    env.seed(SEED + rank)
    state = env.reset()
    state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor),
                     requires_grad=False)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    t = 0
    done = True
    episodes = 0
    reward_sum = 0
    reward_sum1 = 0
    start_time = time.time()
    best_reward = -999
    isbest = 0
    cx = hx = None

    while True:
        model.load_state_dict(shared_model.state_dict())
        if done:
            # need to reset the LSTM cell's input
            cx = Variable(torch.zeros(1, 256)).type(FloatTensor)
            hx = Variable(torch.zeros(1, 256)).type(FloatTensor)
        else:
            # basically this is to detach from the previous comp graph
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        states = []
        values = []
        log_probs = []
        rewards = []
        entropies = []

        for i in range(ac_steps):
            t += 1
            v, logit, (hx, cx) = model((state, (hx, cx)))
            states.append(state)
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)
            # detach -- so the backprop will NOT go through multinomial()
            action = prob.multinomial().detach()
            log_prob = log_prob.gather(1, action)
            action = action.data[0, 0]
            state, reward, done, _ = env.step(action)
            reward_sum += reward
            reward_sum1 += reward
            done = done or t >= max_episode_length

            if done:
                t_ = t
                t = 0
                state = env.reset()
                episodes += 1
                if episodes % 10 == 0:
                    time_str = time.strftime(
                        "%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
                    print("Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {}".format(time_str, rank, episodes,
                                                     reward_sum / 10.0, t_))
                    reward_sum = 0.0

                if episodes % checkpoint_n == 0:
                    ave_reward = reward_sum1 / checkpoint_n
                    if best_reward < ave_reward:
                        isbest = 1
                        best_reward = ave_reward

                    print("Saving checkpoint Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {} best_reward {}".
                          format(get_elapsed_time_str(), rank, episodes,
                                 ave_reward, t_, best_reward))
                    checkpoint_fname = os.path.join(
                        args.savedir,
                        args.rom + '_worker' + str(rank) + '_' + str(episodes))
                    save_checkpoint({'epoch': episodes,
                                     'average_reward': ave_reward,
                                     'time': time.time(),
                                     'state_dict': model.state_dict(),
                                     'optimizer': optimizer.state_dict(),
                                     }, isbest, checkpoint_fname)
                    reward_sum1 = 0.0

            state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor),
                             requires_grad=False)
            reward = max(min(reward, 1), -1)
            values.append(v)
            log_probs.append(log_prob)
            rewards.append(reward)
            if done:
                break

        # We reach here because either
        #   i) an episode ended (e.g. game over), or
        #  ii) we have explored a fixed number of steps into the future and it
        #      is now time to look back and summarise the rollout.
        if done:
            R = torch.zeros(1, 1).type(FloatTensor)
        else:
            value, _, _ = model((state, (hx, cx)))
            R = value.data

        values.append(Variable(R))
        critic_loss = 0
        actor_loss = 0
        R = Variable(R)
        gae = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]
            advantage = R - values[i]  # type: Variable
            critic_loss += 0.5 * advantage.pow(2)
            td_error = rewards[i] + gamma * values[i + 1].data - values[i].data
            gae = gae * gamma * tau + td_error
            actor_loss -= (Variable(gae) * log_probs[i] + 0.01 * entropies[i])

        optimizer.zero_grad()
        total_loss = actor_loss + critic_loss * 0.5  # type: Variable
        total_loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
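The save_checkpoint() helper used above is not shown in this excerpt. A minimal sketch of what it might look like, assuming the common convention of writing the checkpoint to disk and keeping a copy of the best one so far (the filenames are illustrative):

import shutil
import torch

def save_checkpoint(state, is_best, fname):
    # Persist the full training state (epoch, model, optimizer, ...) to disk.
    torch.save(state, fname + '.pth.tar')
    # Keep a separate copy of the best-performing checkpoint.
    if is_best:
        shutil.copyfile(fname + '.pth.tar', fname + '_best.pth.tar')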
def train(rank, params, shared_model, optimizer):
    torch.manual_seed(params.seed + rank)  # shifting the seed with rank to asynchronize each training agent
    env = create_atari_env(params.env_name)  # creating an optimized environment thanks to the create_atari_env function
    env.seed(params.seed + rank)  # aligning the seed of the environment on the seed of the agent
    model = ActorCritic(env.observation_space.shape[0], env.action_space)  # creating the model from the ActorCritic class
    state = env.reset()  # state is a numpy array of size 1*42*42, in black & white
    state = torch.from_numpy(state)  # converting the numpy array into a torch tensor
    done = True  # when the game is done
    episode_length = 0  # initializing the length of an episode to 0
    while True:  # repeat
        episode_length += 1  # incrementing the episode length by one
        model.load_state_dict(shared_model.state_dict())  # synchronizing with the shared model - the agent gets the shared model to do an exploration on num_steps
        if done:  # if it is the first iteration of the while loop or if the game was just done, then:
            cx = Variable(torch.zeros(1, 256))  # the cell states of the LSTM are reinitialized to zero
            hx = Variable(torch.zeros(1, 256))  # the hidden states of the LSTM are reinitialized to zero
        else:  # else:
            cx = Variable(cx.data)  # we keep the old cell states, making sure they are in a torch variable
            hx = Variable(hx.data)  # we keep the old hidden states, making sure they are in a torch variable
        values = []  # initializing the list of values (V(S))
        log_probs = []  # initializing the list of log probabilities
        rewards = []  # initializing the list of rewards
        entropies = []  # initializing the list of entropies
        for step in range(params.num_steps):  # going through the num_steps exploration steps
            value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))  # getting from the model the output V(S) of the critic, the output Q(S,A) of the actor, and the new hidden & cell states
            prob = F.softmax(action_values)  # generating a distribution of probabilities of the Q-values according to the softmax: prob(a) = exp(prob(a))/sum_b(exp(prob(b)))
            log_prob = F.log_softmax(action_values)  # generating a distribution of log probabilities of the Q-values according to the log softmax: log_prob(a) = log(prob(a))
            entropy = -(log_prob * prob).sum(1)  # H(p) = - sum_x p(x).log(p(x))
            entropies.append(entropy)  # storing the computed entropy
            action = prob.multinomial().data  # selecting an action by taking a random draw from the prob distribution
            log_prob = log_prob.gather(1, Variable(action))  # getting the log prob associated to this selected action
            values.append(value)  # storing the value V(S) of the state
            log_probs.append(log_prob)  # storing the log prob of the action
            state, reward, done, _ = env.step(action.numpy())  # playing the selected action, reaching the new state, and getting the new reward
            done = (done or episode_length >= params.max_episode_length)  # if the episode lasts too long (the agent is stuck), then it is done
            reward = max(min(reward, 1), -1)  # clamping the reward between -1 and +1
            if done:  # if the episode is done:
                episode_length = 0  # we restart the environment
                state = env.reset()  # we restart the environment
            state = torch.from_numpy(state)  # tensorizing the new state
            rewards.append(reward)  # storing the new observed reward
            if done:  # if we are done
                break  # we stop the exploration and we directly move on to the next step: the update of the shared model
        R = torch.zeros(1, 1)  # initializing the cumulative reward
        if not done:  # if we are not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))  # we initialize the cumulative reward with the value of the last reached state
            R = value.data  # we initialize the cumulative reward with the value of the last reached state
        values.append(Variable(R))  # storing the value V(S) of the last reached state S
        policy_loss = 0  # initializing the policy loss
        value_loss = 0  # initializing the value loss
        R = Variable(R)  # making sure the cumulative reward R is a torch Variable
        gae = torch.zeros(1, 1)  # initializing the Generalized Advantage Estimation to 0
        for i in reversed(range(len(rewards))):  # starting from the last exploration step and going back in time
            R = params.gamma * R + rewards[i]  # R = gamma*R + r_t = r_0 + gamma*r_1 + gamma^2*r_2 + ... + gamma^(n-1)*r_(n-1) + gamma^nb_steps*V(last_state)
            advantage = R - values[i]  # R is an estimator of Q at time t = i, so advantage_i = Q_i - V(state_i) = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)  # computing the value loss
            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data  # computing the temporal difference
            gae = gae * params.gamma * params.tau + TD  # gae = sum_i (gamma*tau)^i * TD(i) with gae_i = gae_(i+1)*gamma*tau + (r_i + gamma*V(state_i+1) - V(state_i))
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]  # computing the policy loss
        optimizer.zero_grad()  # initializing the optimizer
        (policy_loss + 0.5 * value_loss).backward()  # we give 2x more importance to the policy loss than to the value loss because the policy loss is smaller
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)  # clipping the gradient norm to 40 to prevent the gradient from taking huge values and degenerating the algorithm
        ensure_shared_grads(model, shared_model)  # making sure the model of the agent and the shared model share the same gradient
        optimizer.step()  # running the optimization step
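To make the GAE recursion in the loop above concrete, here is a tiny hand-checkable example; the rewards and values are made-up numbers, used only to show how the backward pass accumulates discounted temporal differences:

# Hypothetical 3-step rollout with gamma = 0.99 and tau = 1.0
rewards = [0.0, 0.0, 1.0]
values = [0.5, 0.6, 0.7, 0.0]   # V(s_0..s_2) plus the bootstrap value V(s_3)
gamma, tau = 0.99, 1.0

gae = 0.0
for i in reversed(range(len(rewards))):
    td = rewards[i] + gamma * values[i + 1] - values[i]   # one-step TD error
    gae = gae * gamma * tau + td                          # discounted sum of TD errors
    print(i, round(td, 4), round(gae, 4))
# step 2: td = 1 + 0          - 0.7 = 0.3,    gae = 0.3
# step 1: td = 0 + 0.99*0.7   - 0.6 = 0.093,  gae = 0.093 + 0.99*0.3  = 0.39
# step 0: td = 0 + 0.99*0.6   - 0.5 = 0.094,  gae = 0.094 + 0.99*0.39 = 0.4801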
# Autodetect CUDA
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print('Device:', device)

# Prepare environments
envs = [make_env() for i in range(args.envs)]
envs = MultiEnv(envs)
if args.mp:
    envs = SubprocVecEnv(envs)

env = OhlcvEnv(WINDOW_SIZE, './data/test/')
obs_ = env.reset()

num_inputs = env.observation_space.shape
num_outputs = env.action_space.n

model = ActorCritic(num_inputs, num_outputs, HIDDEN_SIZE, std=0.0).to(device)
print(model)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

frame_idx = 0
train_epoch = 0
best_reward = None

state = envs.reset()
early_stop = False

while not early_stop:
    log_probs = []
    values = []
    states = []
import gym
import numpy as np
from time import sleep

from model import ActorCritic
from helpers import load_model, worker

lr = 0.001
gamma = 0.99
gae = 0.9
clc = 0.1
step_update = 50
ppo_epsilon = 0.2

input_dim = 4
shared_hidden0 = 25
shared_hidden1 = 50
critic_hidden = 25
output_dim_actor = 2
output_dim_critic = 1

model = ActorCritic(input_dim, shared_hidden0, shared_hidden1, critic_hidden,
                    output_dim_actor, output_dim_critic)

filename = '*****@*****.**'
# filename = 'actor_critic.pt'
model = load_model(model, filename)

params = {'epochs': 1, 'n_workers': 0, 'lr': lr}

worker(model, params, None, 0, render=True, train=False, max_eps=1000)
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    torch.manual_seed(args.seed + rank)
    # CUDA
    if args.use_cuda:
        torch.cuda.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space, args.hidden_size)
    gpu_id = 0 if args.use_cuda else -1  # TODO: 0 means the first GPU
    if gpu_id >= 0:
        model = model.cuda()
    model.train()

    if not args.on_policy:
        # Normalise memory capacity by number of training processes
        memory = EpisodicReplayMemory(
            args.memory_capacity // args.num_processes, args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    model.load_state_dict(shared_model.state_dict())
            else:
                model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                avg_hx = torch.zeros(1, args.hidden_size)
                avg_cx = torch.zeros(1, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(1, args.hidden_size).cuda()
                        cx = torch.zeros(1, args.hidden_size).cuda()
                else:
                    hx = torch.zeros(1, args.hidden_size)
                    cx = torch.zeros(1, args.hidden_size)
                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                if gpu_id >= 0:
                    state = state.cuda()
                done, episode_length = False, 0
            else:
                # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                policy, Q, V, (hx, cx) = model(state, (hx, cx))
                # the shared average model stays on the CPU, so move the state back first
                if gpu_id >= 0:
                    to_avg_state = state.cpu()
                else:
                    to_avg_state = state
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    to_avg_state, (avg_hx, avg_cx))
                # if gpu_id >= 0:
                #     average_policies = average_policies.cuda()

                # Sample action
                action = torch.multinomial(policy, 1)[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action.item())
                next_state = state_to_tensor(next_state)
                if gpu_id >= 0:
                    next_state = next_state.cuda()
                reward = args.reward_clip and min(max(reward, -1), 1) or reward  # Optionally clamp rewards
                done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline training
                    memory.append(state, action, reward, policy.detach())  # Save just tensors

                # Save outputs for online training
                [arr.append(el) for arr, el in
                 zip((policies, Qs, Vs, actions, rewards, average_policies),
                     (policy, Q, V, torch.LongTensor([[action]]),
                      torch.Tensor([[reward]]), average_policy))]

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = torch.zeros(1, 1)
                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(state, None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(state, (hx, cx))
                Qret = Qret.detach().cpu()

            # Train the network on-policy
            if gpu_id >= 0:
                Qs = list(map(lambda x: x.cpu(), Qs))
                Vs = list(map(lambda x: x.cpu(), Vs))
                policies = list(map(lambda x: x.cpu(), policies))
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episodes
                trajectories = memory.sample_batch(args.batch_size, maxlen=args.t_max)

                # Reset hidden state
                avg_hx = torch.zeros(args.batch_size, args.hidden_size)
                avg_cx = torch.zeros(args.batch_size, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(args.batch_size, args.hidden_size).cuda()
                        cx = torch.zeros(args.batch_size, args.hidden_size).cuda()
                else:
                    hx = torch.zeros(args.batch_size, args.hidden_size)
                    cx = torch.zeros(args.batch_size, args.hidden_size)

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards, old_policies, average_policies = [], [], [], [], [], [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    # Unpack first half of transition
                    state = torch.cat(
                        tuple(trajectory.state for trajectory in trajectories[i]), 0)
                    action = torch.LongTensor(
                        [trajectory.action for trajectory in trajectories[i]]).unsqueeze(1)
                    reward = torch.Tensor(
                        [trajectory.reward for trajectory in trajectories[i]]).unsqueeze(1)
                    old_policy = torch.cat(
                        tuple(trajectory.policy for trajectory in trajectories[i]), 0)

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(state, (hx, cx))
                    average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                        state, (avg_hx, avg_cx))

                    # Save outputs for offline training
                    [arr.append(el) for arr, el in
                     zip((policies, Qs, Vs, actions, rewards,
                          average_policies, old_policies),
                         (policy, Q, V, action, reward, average_policy, old_policy))]

                    # Unpack second half of transition
                    next_state = torch.cat(
                        tuple(trajectory.state for trajectory in trajectories[i + 1]), 0)
                    done = torch.Tensor(
                        [trajectory.action is None for trajectory in trajectories[i + 1]]).unsqueeze(1)

                # Do forward pass for all transitions
                _, _, Qret, _ = model(next_state, (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach().cpu()

                # Train the network off-policy
                if gpu_id >= 0:
                    Qs = list(map(lambda x: x.cpu(), Qs))
                    Vs = list(map(lambda x: x.cpu(), Vs))
                    policies = list(map(lambda x: x.cpu(), policies))
                _train(args, T, model, shared_model, shared_average_model,
                       optimiser, policies, Qs, Vs, actions, rewards, Qret,
                       average_policies, old_policies=old_policies)
        done = True

    env.close()
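The _poisson(args.replay_ratio) call above decides how many off-policy updates to run after each on-policy episode; the helper itself is not included in this excerpt. A plausible minimal version, assuming the usual ACER convention of drawing the count from a Poisson distribution (here via Knuth's sampling algorithm):

import math
import random

def _poisson(lmbd):
    # Sample k ~ Poisson(lmbd) using Knuth's algorithm.
    L, k, p = math.exp(-lmbd), 0, 1.0
    while p > L:
        k += 1
        p *= random.uniform(0, 1)
    return max(k - 1, 0)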
def train(rank, args, share_model, counter, lock): torch.manual_seed(args.seed + rank) env = create_atari_env(args.env) env.seed(args.seed + rank) model = ActorCritic(env.observation_space.shape[0], env.action_space) optimizer = optim.Adam(share_model.parameters(), lr=args.lr) model.train() state = env.reset() state = torch.FloatTensor(state) done = True # reward_sum = 0 episode_length = 0 while True: model.load_state_dict(share_model.state_dict()) if done: cx = Variable(torch.zeros(1, 256)) hx = Variable(torch.zeros(1, 256)) else: cx = Variable(cx.data) hx = Variable(hx.data) values = [] log_probs = [] rewards = [] entropies = [] for step in range(args.num_steps): episode_length += 1 value, logit, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx))) prob = F.softmax(logit) log_prob = F.log_softmax(logit) entropy = -(log_prob * prob).sum(1, keepdim=True) entropies.append(entropy) action = prob.multinomial().data log_prob = log_prob.gather(1, Variable(action)) state, reward, done, _ = env.step(action.numpy()) # print('reward', reward) done = done or episode_length >= args.max_episode_length reward = max(min(reward, 1), -1) # reward_sum += reward # print(reward) with lock: counter.value += 1 if done: episode_length = 0 state = env.reset() state = torch.FloatTensor(state) values.append(value) log_probs.append(log_prob) rewards.append(reward) if done: # print('rank: ', rank) # print('reward: ', reward_sum) # reward_sum = 0 break R = torch.zeros(1, 1) if not done: value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx))) R = value.data values.append(Variable(R)) policy_loss = 0 value_loss = 0 R = Variable(R) gae = torch.zeros(1, 1) for i in reversed(range(len(rewards))): R = args.gamma * R + rewards[i] advantage = R - values[i] value_loss = value_loss + 0.5 * advantage.pow(2) delta_t = rewards[i] + args.gamma * values[i + 1].data - values[i].data gae = gae * args.gamma * args.tau + delta_t policy_loss = policy_loss - log_probs[i] * Variable(gae) - args.entropy_coef * entropies[i] optimizer.zero_grad() (policy_loss + args.value_loss_coef * value_loss).backward() torch.nn.utils.clip_grad_norm(model.parameters(), args.max_grad_norm) ensure_shared_grads(model, share_model) optimizer.step()
                    loss.backward()
                    for local_param, global_param in zip(
                            self.local_actor_critic.parameters(),
                            self.global_actor_critic.parameters()):
                        global_param._grad = local_param.grad
                    self.optimizer.step()
                    self.local_actor_critic.load_state_dict(
                        self.global_actor_critic.state_dict())
                    self.local_actor_critic.clear_memory()
                t_step += 1
                observation = observation_
            with self.episode_index.get_lock():
                self.episode_index.value += 1
            print(self.name, 'episode ', self.episode_index.value,
                  'reward %.1f' % score)


if __name__ == '__main__':
    lr = 1e-4
    env_id = 'CartPole-v0'
    nb_actions = 2
    input_dims = [4]
    global_actor_critic = ActorCritic(input_dims, nb_actions)
    global_actor_critic.share_memory()
    optim = SharedAdam(global_actor_critic.parameters(), lr=lr,
                       betas=(0.92, 0.999))
    global_ep = mp.Value('i', 0)

    workers = [Agent(global_actor_critic, optim, input_dims, nb_actions,
                     gamma=0.99, lr=lr, name=i, global_ep_index=global_ep,
                     env_id=env_id)
               for i in range(mp.cpu_count())]
    [w.start() for w in workers]
    [w.join() for w in workers]
def test(rank, args, shared_model, counter, loggers, kill): counter, steps, max_episodes = counter torch.manual_seed(args.seed + rank) env = create_vizdoom_env(args.config_path, args.test_scenario_path) env.seed(args.seed + rank) model = ActorCritic(env.observation_space.spaces[0].shape[0], env.action_space, args.topology) model.eval() state = env.reset() reward_sum = 0 done = True start_time = time.time() # a quick hack to prevent the agent from stucking hidden = ((torch.zeros(1, 64), torch.zeros(1, 64)), (torch.zeros(1, 256), torch.zeros(1, 256))) actions = deque(maxlen=100) episode_length = 0 episode_counter = 0 obs_index = 0 obs_history = [] pose_history = [] goal_loc = env.goal() model.load_state_dict(shared_model.state_dict()) while not kill.is_set(): if steps.value > args.max_episode_steps: break if episode_counter > max_episodes: break try: episode_start_time = time.time() episode_length += 1 value, logit, _, _, hidden = model((state_to_torch(state), hidden)) prob = F.softmax(logit) action = prob.max(1, keepdim=True)[1].data.numpy() for i in range(4): state, reward, done, _ = env.step(action[0, 0], steps=1) reward_sum += reward if done: break else: obs_frame = (np.moveaxis(state[0], 0, -1) * 255).astype( np.uint8) if isinstance(obs_history, list): obs_history.append(obs_frame) else: obs_history[obs_index, :, :, :] = obs_frame obs_index += 1 pose_history.append(env.pose()) # a quick hack to prevent the agent from stucking # actions.append(action[0, 0]) # if actions.count(actions[0]) == actions.maxlen: # done = True if done: if isinstance(obs_history, list): obs_history = np.array(obs_history) if loggers: loggers['test_reward'](env.game.get_total_reward(), episode_counter) loggers['video'](video(env.wad, env.current_map, goal_loc, obs_history, pose_history), episode_counter) loggers['test_time'](time.time() - episode_start_time, episode_counter) print( "Time {}, num episodes {}, FPS {:.0f}, episode reward {}, episode length {}". format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), counter.value, counter.value / (time.time() - start_time), reward_sum, episode_length)) reward_sum = 0 episode_length = 0 actions.clear() state = env.reset() obs_index = 0 pose_history = [] goal_loc = env.goal() hidden = ((torch.zeros(1, 64), torch.zeros(1, 64)), (torch.zeros(1, 256), torch.zeros(1, 256))) time.sleep(args.eval_interval) model.load_state_dict(shared_model.state_dict()) episode_counter += 1 except Exception as err: kill.set() raise err
def test(args, shared_model): action_map = _set_action_map() env = FixedEnvWrap() # time.sleep(10) model = ActorCritic() model.load_state_dict(shared_model.state_dict()) model.eval() state = env.reset() training_time = 0 vis = visdom.Visdom(env='final') line_plot = vis.line(Y=np.array([0]), opts=dict(xlabel='testing count', ylabel='average reward', title='ali-v1')) start = time.time() vis_count = 0 while True: video_count = 1 reward_all_sum = 0 reward_all = 0 reward_all_ave = 0 reward_gop = 0 action = 3 last_action = 3 # update model before testing all trace files # time.sleep(5) print('load updated model') model.load_state_dict(shared_model.state_dict()) while True: # get the reward for one gop while True: _, done, decision_flag = env.step_gop(action) if decision_flag or done: reward_gop = env.get_reward_gop() state = env.get_state_gop() break else: continue # print('testing') # get action from model last_action = action with torch.no_grad(): state = torch.FloatTensor(state) logit, _ = model( state.view(-1, args.s_gop_info, args.s_gop_len)) prob = F.softmax(logit, dim=1) _, action = torch.max(prob, 1) action = action.data.numpy()[0] bitrate, target_buffer = action_map[last_action] # print('bitrate: %d, target_buffer: %d, reward is %s' % (bitrate, target_buffer, reward_gop)) if done: print("video count %d, reward is %.5f" % (video_count, reward_all)) # reward_all_sum += reward_all / 100 reward_all_sum += reward_all video_count += 1 if reward_all < 0: print('bad model ! just break this loop') reward_all_ave = 0 break if video_count > env.traces_len * 2: reward_all_ave = reward_all_sum / video_count break action = 3 last_action = 3 reward_all = 0 reward_all += reward_gop # update the figure of average reward of all testing files vis_count += 1 reward_all_ave = max(reward_all_ave, 0) vis.line(Y=np.array([reward_all_ave]), X=np.array([vis_count]), win=line_plot, update='append') path = 'ali-v1/actor.pt-' + str(vis_count) torch.save(model.state_dict(), path) end = time.time() hours, rem = divmod(end - start, 3600) minutes, seconds = divmod(rem, 60) print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)) print("average reward of traces are: ", reward_all_ave) print('saved one model in epoch:', vis_count)
                    metavar='ENV', help='environment to train on (default: Breakout-v0)')
parser.add_argument('--render', default=False, action='store_true',
                    help='render the environment')

if __name__ == '__main__':
    args = parser.parse_args()
    # torch.manual_seed(args.seed)
    torch.set_num_threads(1)
    env = gym.make(args.env_name)

    global_model = ActorCritic(env.action_space.n)
    global_model.share_memory()
    local_model = ActorCritic(env.action_space.n)
    optimizer = AsyncAdam(global_model.parameters(), local_model.parameters(),
                          lr=args.lr)

    processes = []
    for rank in range(args.num_processes):
        p = mp.Process(target=train,
                       args=(rank, args, global_model, local_model, optimizer))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
def train(rank, params, shared_model, optimizer):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name)  # getting the environment
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    state = env.reset()
    state = torch.from_numpy(state)
    done = True
    episode_length = 0
    while True:
        episode_length += 1
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)
        values = []
        log_probs = []
        rewards = []
        entropies = []
        for step in range(params.num_steps):
            value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(action_values)
            log_prob = F.log_softmax(action_values)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)
            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))
            values.append(value)
            log_probs.append(log_prob)
            state, reward, done, _ = env.step(action.numpy())
            done = (done or episode_length >= params.max_episode_length)
            reward = max(min(reward, 1), -1)
            if done:
                episode_length = 0
                state = env.reset()
            state = torch.from_numpy(state)
            rewards.append(reward)
            if done:
                break
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data
        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = params.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
            gae = gae * params.gamma * params.tau + TD
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]
        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
def test(rank, args, shared_model): torch.manual_seed(args.seed + rank) env = WrapEnv(args.env_name) model = ActorCritic(4, env.num_actions, args.num_skips) model.eval() state = env.reset() state = np.concatenate([state] * 4, axis=0) state = torch.from_numpy(state) reward_sum = 0 done = True action_stat = [0] * (model.n_real_acts + model.n_aux_acts) start_time = time.time() episode_length = 0 for ep_counter in itertools.count(1): # Sync with the shared model if done: model.load_state_dict(shared_model.state_dict()) if not os.path.exists('model-a3c-aux'): os.makedirs('model-a3c-aux') torch.save(shared_model.state_dict(), 'model-a3c-aux/model-{}.pth'.format(args.model_name)) print('saved model') value, logit = model(Variable(state.unsqueeze(0), volatile=True)) prob = F.softmax(logit) action = prob.max(1)[1].data.numpy() action_np = action[0, 0] action_stat[action_np] += 1 if action_np < model.n_real_acts: state_new, reward, done, info = env.step(action_np) if args.testing: print('episode', episode_length, 'normal action', action_np, 'lives', info['ale.lives']) env.render() state = np.append(state.numpy()[1:, :, :], state_new, axis=0) done = done or episode_length >= args.max_episode_length reward_sum += reward episode_length += 1 else: state = state.numpy() for _ in range(action_np - model.n_real_acts + 2): state_new, rew, done, info = env.step( 0) # instead of random perform NOOP=0 if args.testing: print('episode', episode_length, 'no_op action', action_np, 'lives', info['ale.lives']) # env.render() state = np.append(state[1:, :, :], state_new, axis=0) done = done or episode_length >= args.max_episode_length reward_sum += rew episode_length += 1 if done: break if done: print("Time {}, episode reward {}, episode length {}".format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, episode_length)) print("actions stats real {}, aux {}".format( action_stat[:model.n_real_acts], action_stat[model.n_real_acts:])) reward_sum = 0 episode_length = 0 state = env.reset() state = np.concatenate([state] * 4, axis=0) action_stat = [0] * (model.n_real_acts + model.n_aux_acts) if not args.testing: time.sleep(60) state = torch.from_numpy(state)
def train(rank, args, shared_model, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
def test(rank, args, T, shared_model): torch.manual_seed(args.seed + rank) env = gym.make(args.env) env.seed(args.seed + rank) model = ActorCritic(env.observation_space, env.action_space, args.hidden_size) model.eval() can_test = True # Test flag t_start = 1 # Test step counter to check against global counter rewards, steps = [], [] # Rewards and steps for plotting l = str(len(str(args.T_max))) # Max num. of digits for logging steps done = True # Start new episode while T.value() <= args.T_max: if can_test: t_start = T.value() # Reset counter # Evaluate over several episodes and average results avg_rewards, avg_episode_lengths = [], [] for _ in range(args.evaluation_episodes): while True: # Reset or pass on hidden state if done: # Sync with shared model every episode model.load_state_dict(shared_model.state_dict()) hx = Variable(torch.zeros(1, args.hidden_size), volatile=True) cx = Variable(torch.zeros(1, args.hidden_size), volatile=True) # Reset environment and done flag state = state_to_tensor(env.reset()) done, episode_length = False, 0 reward_sum = 0 # Optionally render validation states if args.render: env.render() # Calculate policy policy, _, _, (hx, cx) = model(Variable(state, volatile=True), (hx.detach(), cx.detach())) # Break graph for memory efficiency # Choose action greedily action = policy.max(1)[1].data[0, 0] # Step state, reward, done, _ = env.step(action) state = state_to_tensor(state) reward_sum += reward done = done or episode_length >= args.max_episode_length # Stop episodes at a max length episode_length += 1 # Increase episode counter # Log and reset statistics at the end of every episode if done: avg_rewards.append(reward_sum) avg_episode_lengths.append(episode_length) break print(('[{}] Step: {:<' + l + '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format( datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3], t_start, sum(avg_rewards) / args.evaluation_episodes, sum(avg_episode_lengths) / args.evaluation_episodes)) if args.evaluate: return rewards.append(avg_rewards) # Keep all evaluations steps.append(t_start) plot_line(steps, rewards) # Plot rewards torch.save(model.state_dict(), 'model.pth') # Save model params can_test = False # Finish testing else: if T.value() - t_start >= args.evaluation_interval: can_test = True time.sleep(0.001) # Check if available to test every millisecond env.close()