def main():
    # env
    args = config()
    mp.set_start_method("spawn")
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = ""
    env = create_atari_env(args.env_name)
    shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()

    optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
    optimizer.share_memory()

    processes = []
    counter = mp.Value('i', 0)
    lock = mp.Lock()

    p = mp.Process(target=test,
                   args=(args.num_processes, args, shared_model, counter, "./log/"))
    p.start()
    processes.append(p)
    for rank in range(0, args.num_processes):
        p = mp.Process(target=train,
                       args=(rank, args, shared_model, counter, lock, optimizer))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
        if done and counter.value > args.max_steps:
            test_final(shared_model, env, args)
            save_model(shared_model, args)
            exit()
        with torch.no_grad():
            value, logit = model(state.unsqueeze(0))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
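# A minimal standalone sketch (not from any of the repos above) of the
# "stuck agent" heuristic these test loops use: if the last `maxlen` actions
# in the sliding window are all identical, the episode is declared done.
# The action sequence below is made up for the demo.
from collections import deque

actions = deque(maxlen=5)
for step, a in enumerate([2, 2, 2, 2, 2, 2]):
    actions.append(a)
    if actions.count(actions[0]) == actions.maxlen:
        print("stuck detected at step", step)  # fires once the window saturates
        break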
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)  # desynchronizing the test agent (each agent gets its own seed)
    env = create_atari_env(params.env_name, video=True)  # running an environment with a video
    env.seed(params.seed + rank)  # desynchronizing the environment
    model = ActorCritic(env.observation_space.shape[0], env.action_space)  # creating one model
    model.eval()  # putting the model in "eval" mode because it won't be trained
    state = env.reset()  # getting the input images as numpy arrays
    state = torch.from_numpy(state)  # converting them into torch tensors
    reward_sum = 0  # initializing the sum of rewards to 0
    done = True  # initializing done to True
    start_time = time.time()  # getting the starting time to measure the computation time
    actions = deque(maxlen=100)  # cf https://pymotw.com/2/collections/deque.html
    episode_length = 0  # initializing the episode length to 0
    while True:  # repeat
        episode_length += 1  # incrementing the episode length by one
        if done:  # synchronizing with the shared model (same as train.py)
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()  # the test agent does not explore, it directly plays the best action
        state, reward, done, _ = env.step(action[0, 0])
        # done = done or episode_length >= params.max_episode_length
        reward_sum += reward
        if done:  # printing the results at the end of each part
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0  # reinitializing the sum of rewards
            episode_length = 0  # reinitializing the episode length
            actions.clear()  # reinitializing the actions
            state = env.reset()  # reinitializing the environment
            time.sleep(60)  # taking a one-minute break to let the other agents practice (if the game is done)
        state = torch.from_numpy(state)  # new state and we continue
def __init__(self, sess, GameName, name, seed, rank, globalAC, OPT, coord,
             UPDATE_GLOBAL_ITER, MAX_GLOBAL_EP, GAMMA, ENTROPY_BETA, lstm=True):
    """Initialize the object of the class Worker.

    Args:
        sess: the running session
        GameName: name of the game
        name: name of the worker/scope
        seed: base random seed
        rank: worker index, used to shift the seed
        globalAC: the global net
        OPT: the shared optimizer
        coord: training coordinator
        UPDATE_GLOBAL_ITER: number of steps between updates of the global net
        MAX_GLOBAL_EP: maximum number of global episodes
        GAMMA: discount factor on future rewards
        ENTROPY_BETA: weight of the entropy loss
        lstm: whether to use an LSTM
    """
    self.sess = sess
    self.name = name
    self.lstm = lstm
    self.env = create_atari_env(GameName)
    self.env.seed(seed + rank)
    shape = self.env.observation_space.shape
    if lstm:
        self.AC = A3CNet_LSTM(
            self.env, name, sess, OPT, ENTROPY_BETA, globalAC,
            input_shape=[None, shape[1], shape[2], shape[0]])
    else:
        self.AC = A3CNet_CONV(
            self.env, name, sess, OPT, ENTROPY_BETA, globalAC,
            input_shape=[None, shape[1], shape[2], shape[0]])
    self.coord = coord
    self.T0 = time.time()
    self.UPDATE_GLOBAL_ITER = UPDATE_GLOBAL_ITER
    self.MAX_GLOBAL_EP = MAX_GLOBAL_EP
    self.GAMMA = GAMMA
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)
    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True),
                                        (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
def monitor(rank, args, shared_model):
    env = create_atari_env(args.env_name)
    env = wrappers.Monitor(env, './video/pong-a3c',
                           video_callable=lambda count: count % 30 == 0,
                           force=True)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    # eval mode
    model.eval()

    # init
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    episode_length = 0
    done = True
    start_time = time.time()

    while True:
        env.render()
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)  # LSTM cell state
            hx = Variable(torch.zeros(1, 256), volatile=True)  # LSTM hidden state
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        # after unsqueeze(0) the tensor size goes from 1x42x42 to 1x1x42x42
        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True),
                                        (hx, cx)))
        prob = F.softmax(logit)
        # directly pick the action with the highest probability
        action = prob.max(1, keepdim=True)[1].data.numpy()

        state, reward, done, _ = env.step(action[0][0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            # reset
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
def test(shared_model, render=0):
    # torch.manual_seed(rank)
    env = create_atari_env(args.rom)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    cx = hx = None
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
            hx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0).type(FloatTensor), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        # print(logit.data.numpy())
        action = prob.max(1, keepdim=True)[1].data.cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0])
        if render == 1:
            env.render()
            time.sleep(0.03)
        done = done or episode_length >= 10000
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                get_elapsed_time_str(), reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)  # to desynchronize the test agent
    env = create_atari_env(params.env_name, video=True)  # to run the environment with video
    env.seed(params.seed + rank)  # to desynchronize the environment
    model = ActorCritic(env.observation_space.shape[0], env.action_space)  # creating the model
    model.eval()  # so the model does not train
    state = env.reset()  # we get the input image as a numpy array
    state = torch.from_numpy(state)  # we convert it into a torch tensor
    reward_sum = 0
    done = True
    start_time = time.time()  # start time
    actions = deque(maxlen=100)  # https://pymotw.com/2/collections/deque.html
    episode_length = 0
    while True:
        episode_length += 1  # incrementing the episode length one step at a time
        if done:  # we synchronize with the shared model, as in training mode
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        # the test agent plays the best action directly, without exploring
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0, 0])
        # done = done or episode_length >= params.max_episode_length
        reward_sum += reward
        if done:  # print the result at the end of each episode
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)  # a one-minute break to let the other agents work
        state = torch.from_numpy(state)  # build the new state and continue
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            save(model, 'brain.pkl')
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0])
        reward_sum += reward
        if done:
            f = open("Statistics.txt", 'a')
            f.write(str(reward_sum) + " " + str(episode_length) + "\n")
            f.close()
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0, 0])
        reward_sum += reward
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)  # 60-second break to allow the other agents to test the environment
        state = torch.from_numpy(state)
def initEnv():
    env = create_atari_env('PongDeterministic-v4')
    state = env.reset()
    return env, state
class Params():
    def __init__(self):
        self.lr = 0.0001
        self.gamma = 0.99
        self.tau = 1.
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'


# Main run
os.environ['OMP_NUM_THREADS'] = '1'
params = Params()
torch.manual_seed(params.seed)
env = create_atari_env(params.env_name)
shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
shared_model.share_memory()
optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=params.lr)
optimizer.share_memory()
processes = []
p = mp.Process(target=test, args=(params.num_processes, params, shared_model))
p.start()
processes.append(p)
for rank in range(0, params.num_processes):
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
    p.start()
    processes.append(p)
for p in processes:
    p.join()
def train(rank, shared_model, optimizer):
    """
    :param rank: worker ID
    :param shared_model: model to sync between workers
    :param optimizer: shared optimizer
    :return:
    """
    # torch.manual_seed(SEED + rank)
    ac_steps = 20
    max_episode_length = 10000
    gamma = 0.99
    tau = 1.0
    max_grad_norm = 50.0
    checkpoint_n = 20
    env = create_atari_env(romname)
    env.seed(SEED + rank)
    state = env.reset()
    state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor),
                     requires_grad=False)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    t = 0
    done = True
    episodes = 0
    reward_sum = 0
    reward_sum1 = 0
    start_time = time.time()
    best_reward = -999
    isbest = 0
    cx = hx = None
    while True:
        model.load_state_dict(shared_model.state_dict())
        if done:
            # need to reset the LSTM cell's input
            cx = Variable(torch.zeros(1, 256)).type(FloatTensor)
            hx = Variable(torch.zeros(1, 256)).type(FloatTensor)
        else:
            # basically this is to detach from the previous comp graph
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        states = []
        values = []
        log_probs = []
        rewards = []
        entropies = []

        for i in range(ac_steps):
            t += 1
            v, logit, (hx, cx) = model((state, (hx, cx)))
            states.append(state)
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)
            # detach -- so backprop will NOT go through multinomial()
            action = prob.multinomial().detach()
            log_prob = log_prob.gather(1, action)
            action = action.data[0, 0]
            state, reward, done, _ = env.step(action)
            reward_sum += reward
            reward_sum1 += reward
            done = done or t >= max_episode_length
            if done:
                t_ = t
                t = 0
                state = env.reset()
                episodes += 1
                if episodes % 10 == 0:
                    time_str = time.strftime("%Hh %Mm %Ss",
                                             time.gmtime(time.time() - start_time))
                    print("Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {}".format(time_str, rank, episodes,
                                                     reward_sum / 10.0, t_))
                    reward_sum = 0.0
                if episodes % checkpoint_n == 0:
                    ave_reward = reward_sum1 / checkpoint_n
                    if best_reward < ave_reward:
                        isbest = 1
                        best_reward = ave_reward
                    print("Saving checkpoint Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {} best_reward {}".format(
                              get_elapsed_time_str(), rank, episodes,
                              ave_reward, t_, best_reward))
                    checkpoint_fname = os.path.join(
                        args.savedir,
                        args.rom + '_worker' + str(rank) + '_' + str(episodes))
                    save_checkpoint({'epoch': episodes,
                                     'average_reward': ave_reward,
                                     'time': time.time(),
                                     'state_dict': model.state_dict(),
                                     'optimizer': optimizer.state_dict()},
                                    isbest, checkpoint_fname)
                    reward_sum1 = 0.0
            state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor),
                             requires_grad=False)
            reward = max(min(reward, 1), -1)
            values.append(v)
            log_probs.append(log_prob)
            rewards.append(reward)
            if done:
                break

        # We reach here because either
        # i) an episode ends, such as game over, or
        # ii) we have explored a certain number of steps into the future and
        #     now it is time to look back and summarise
        if done:
            R = torch.zeros(1, 1).type(FloatTensor)
        else:
            value, _, _ = model((state, (hx, cx)))
            R = value.data
        values.append(Variable(R))
        critic_loss = 0
        actor_loss = 0
        R = Variable(R)
        gae = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]
            advantage = R - values[i]  # type: Variable
            critic_loss += 0.5 * advantage.pow(2)
            td_error = rewards[i] + gamma * values[i + 1].data - values[i].data
            gae = gae * gamma * tau + td_error
            actor_loss -= (Variable(gae) * log_probs[i] + 0.01 * entropies[i])

        optimizer.zero_grad()
        total_loss = actor_loss + critic_loss * 0.5  # type: Variable
        total_loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                get_elapsed_time_str(), reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)


if __name__ == '__main__':
    env = create_atari_env(args.rom)
    # torch.manual_seed(SEED)
    shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()
    # print(shared_model.conv1._parameters['weight'].data.is_cuda)
    optimizer = SharedAdam(shared_model.parameters(), lr=0.0001)
    optimizer.share_memory()

    if args.play:
        if os.path.isfile(args.play):
            print("=> loading checkpoint '{}'".format(args.play))
            checkpoint = torch.load(args.play)
            # args.start_epoch = checkpoint['epoch']
            # best_prec1 = checkpoint['best_prec1']
            shared_model.load_state_dict(checkpoint['state_dict'])
            # optimizer.load_state_dict(checkpoint['optimizer'])
def train(rank, params, shared_model, optimizer):
    torch.manual_seed(params.seed + rank)  # shifting the seed with rank to desynchronize each training agent
    env = create_atari_env(params.env_name)  # creating an optimized environment thanks to the create_atari_env function
    env.seed(params.seed + rank)  # aligning the seed of the environment on the seed of the agent
    model = ActorCritic(env.observation_space.shape[0], env.action_space)  # creating the model from the ActorCritic class
    state = env.reset()  # state is a numpy array of size 1*42*42, in black & white
    state = torch.from_numpy(state)  # converting the numpy array into a torch tensor
    done = True  # when the game is done
    episode_length = 0  # initializing the length of an episode to 0
    while True:  # repeat
        episode_length += 1  # incrementing the episode length by one
        model.load_state_dict(shared_model.state_dict())  # synchronizing with the shared model - the agent gets the shared model to do an exploration of num_steps
        if done:  # if it is the first iteration of the while loop or if the game was just done, then:
            cx = Variable(torch.zeros(1, 256))  # the cell states of the LSTM are reinitialized to zero
            hx = Variable(torch.zeros(1, 256))  # the hidden states of the LSTM are reinitialized to zero
        else:
            cx = Variable(cx.data)  # we keep the old cell states, making sure they are in a torch variable
            hx = Variable(hx.data)  # we keep the old hidden states, making sure they are in a torch variable
        values = []  # initializing the list of values (V(S))
        log_probs = []  # initializing the list of log probabilities
        rewards = []  # initializing the list of rewards
        entropies = []  # initializing the list of entropies
        for step in range(params.num_steps):  # going through the num_steps exploration steps
            value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))  # getting from the model the output V(S) of the critic, the output Q(S,A) of the actor, and the new hidden & cell states
            prob = F.softmax(action_values)  # generating a distribution of probabilities of the Q-values according to the softmax: prob(a) = exp(q(a)) / sum_b exp(q(b))
            log_prob = F.log_softmax(action_values)  # generating a distribution of log probabilities of the Q-values according to the log softmax: log_prob(a) = log(prob(a))
            entropy = -(log_prob * prob).sum(1)  # H(p) = - sum_x p(x).log(p(x))
            entropies.append(entropy)  # storing the computed entropy
            action = prob.multinomial().data  # selecting an action by taking a random draw from the prob distribution
            log_prob = log_prob.gather(1, Variable(action))  # getting the log prob associated to this selected action
            values.append(value)  # storing the value V(S) of the state
            log_probs.append(log_prob)  # storing the log prob of the action
            state, reward, done, _ = env.step(action.numpy())  # playing the selected action, reaching the new state, and getting the new reward
            done = (done or episode_length >= params.max_episode_length)  # if the episode lasts too long (the agent is stuck), then it is done
            reward = max(min(reward, 1), -1)  # clamping the reward between -1 and +1
            if done:  # if the episode is done:
                episode_length = 0  # we reset the episode length
                state = env.reset()  # we restart the environment
            state = torch.from_numpy(state)  # tensorizing the new state
            rewards.append(reward)  # storing the new observed reward
            if done:  # if we are done
                break  # we stop the exploration and we directly move on to the next step: the update of the shared model
        R = torch.zeros(1, 1)  # initializing the cumulative reward
        if not done:  # if we are not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data  # we initialize the cumulative reward with the value of the last reached state
        values.append(Variable(R))  # storing the value V(S) of the last reached state S
        policy_loss = 0  # initializing the policy loss
        value_loss = 0  # initializing the value loss
        R = Variable(R)  # making sure the cumulative reward R is a torch Variable
        gae = torch.zeros(1, 1)  # initializing the Generalized Advantage Estimation to 0
        for i in reversed(range(len(rewards))):  # starting from the last exploration step and going back in time
            R = params.gamma * R + rewards[i]  # R = r_0 + gamma*r_1 + gamma^2*r_2 + ... + gamma^(n-1)*r_(n-1) + gamma^n*V(last_state)
            advantage = R - values[i]  # R is an estimator of Q at time t = i, so advantage_i = Q_i - V(state_i) = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)  # computing the value loss
            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data  # computing the temporal difference
            gae = gae * params.gamma * params.tau + TD  # gae = sum_i (gamma*tau)^i * TD(i), i.e. gae_i = gae_(i+1)*gamma*tau + (r_i + gamma*V(state_(i+1)) - V(state_i))
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]  # computing the policy loss
        optimizer.zero_grad()  # clearing the old gradients before back-propagation
        (policy_loss + 0.5 * value_loss).backward()  # the value loss is down-weighted by 0.5 so the smaller policy loss still dominates
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)  # rescaling the gradient norm to at most 40 to prevent the gradient from taking huge values and degenerating the training
        ensure_shared_grads(model, shared_model)  # attaching the agent's gradients to the shared model
        optimizer.step()  # running the optimization step
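# A minimal standalone sketch of the GAE recursion used in the training loops
# above, with made-up reward and value numbers (not from any snippet here):
#   gae_i = gamma * tau * gae_(i+1) + (r_i + gamma * V(s_(i+1)) - V(s_i))
gamma, tau = 0.99, 1.0
rewards = [0.0, 0.0, 1.0]        # hypothetical clipped rewards r_0..r_2
values = [0.5, 0.6, 0.7, 0.0]    # V(s_0)..V(s_3); the last entry is the bootstrap value

R = values[-1]
gae = 0.0
for i in reversed(range(len(rewards))):
    R = gamma * R + rewards[i]                              # discounted return
    td = rewards[i] + gamma * values[i + 1] - values[i]     # one-step TD residual
    gae = gae * gamma * tau + td                            # GAE accumulator
    print(i, round(R, 3), round(gae, 3))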
def make_env():
    env = create_atari_env(FLAGS.env)
    return env
        ap = model.output(s)
        a = int(np.argmax(ap))
        s_, r, done, info = env.step(a)
        score += r
        frame += 1
        s = s_
        if done:
            break
    return score, frame


# game_Name = 'PongDeterministic-v4'
# game_Name = 'AlienDeterministic-v4'
game_Name = 'YarsRevengeDeterministic-v4'
env = create_atari_env(game_Name)
shape = env.observation_space.shape
N_A = env.action_space.n

# saver = tf.train.import_meta_graph('modellstm/model/model.ckpt.meta')
SESS = tf.Session()
# saver.restore(SESS, "./model15k/model.ckpt")  # note how the path is written
# graph = tf.get_default_graph()
ckpt_path = "./modellstm/model/model.ckpt"
input = tf.placeholder(tf.float32, [1, 42, 42, 1], 's')
# graph.get_operation_by_name('s_1').outputs[0]
# var_to_shape_map = reader.get_variable_to_shape_map()
# for key in var_to_shape_map:
#     if key.find('Global_Net/actor') != -1:
#         print("tensor_name: ", key)
parser.add_argument('--num-processes', type=int, default=4, metavar='N',
                    help='how many training processes to use (default: 4)')
parser.add_argument('--num-steps', type=int, default=20, metavar='NS',
                    help='number of forward steps in A3C (default: 20)')
parser.add_argument('--max-episode-length', type=int, default=10000, metavar='M',
                    help='maximum length of an episode (default: 10000)')
parser.add_argument('--env-name', default='PongDeterministic-v3', metavar='ENV',
                    help='environment to train on (default: PongDeterministic-v3)')

if __name__ == '__main__':
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    env = create_atari_env(args.env_name)
    shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()

    processes = []

    p = mp.Process(target=test, args=(args.num_processes, args, shared_model))
    p.start()
    processes.append(p)
    for rank in range(0, args.num_processes):
        p = mp.Process(target=train, args=(rank, args, shared_model))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    if not os.path.exists('models-a3c'):
        os.makedirs('models-a3c')
    path = 'models-a3c/model-{}.pth'.format(args.model_name)
    print('saving directory is', path)

    model = ActorCritic(env.action_space.n, args.num_atoms, args.gamma)
    model.eval()

    state = env.reset()
    state = np.concatenate([state] * 4, axis=0)
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    action_stat = [0] * model.num_outputs

    start_time = time.time()
    episode_length = 0
    for ep_counter in itertools.count(1):
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            torch.save(shared_model.state_dict(), path)
            print('saved model')

        atoms_logit, logit = model(Variable(state.unsqueeze(0), volatile=True))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()
        action_np = action[0, 0]
        action_stat[action_np] += 1

        state_new, reward, done, info = env.step(action_np)
        dead = is_dead(info)

        if args.testing:
            atoms_prob = F.softmax(atoms_logit)
            value = model.get_v(atoms_prob, batch=False)
            atoms_prob = atoms_prob.squeeze().data.numpy()
            print('episode', episode_length, 'normal action', action_np,
                  'lives', info['ale.lives'], 'value', value)
            env.render()
            if ep_counter % 100 == 0:
                plt.plot(model.z, atoms_prob)
                plt.title('average v is {}'.format(value))
                plt.show()

        state = np.append(state.numpy()[1:, :, :], state_new, axis=0)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward
        episode_length += 1

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            print("actions stats real {}".format(action_stat[:model.num_outputs]))
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            env.seed(args.seed + rank + (args.num_processes + 1) * ep_counter)
            state = np.concatenate([state] * 4, axis=0)
            action_stat = [0] * model.num_outputs
            if not args.testing:
                time.sleep(60)

        state = torch.from_numpy(state)
def train(rank, shared_model, optimizer):
    """
    :param rank: worker ID
    :param shared_model: model to sync between workers
    :param optimizer: shared optimizer
    :return:
    """
    # torch.manual_seed(SEED + rank)
    ac_steps = 20  # the number of steps to take before reviewing
    max_episode_length = 10000  # the game is cut off after this many steps and restarted
    gamma = 0.99
    tau = 1.0
    max_grad_norm = 50.0  # cap on the gradient norm; larger gradients are rescaled down
    checkpoint_n = 20  # checkpoint the model every this many episodes; increase it on a slow machine
    env = create_atari_env(romname)  # start the game; romname selects the game of your choice
    env.seed(SEED + rank)  # seed the environment so runs are reproducible
    state = env.reset()
    # Hand the pixel data to torch: unsqueeze(0) adds a batch dimension,
    # and FloatTensor stores it as an array of floats.
    state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor),
                     requires_grad=False)
    # Selecting the model, with this size of input and that kind of output
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    t = 0
    done = True  # starting from a state where game-over is true!
    episodes = 0
    reward_sum = 0
    reward_sum1 = 0
    start_time = time.time()
    best_reward = -999
    isbest = 0
    cx = hx = None
    while True:
        # Pull the up-to-date model from the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            # need to reset the LSTM cell's input
            #
            # The LSTM units need their own output to feed into the next step's
            # input (hence the name of the kind: recurrent neural nets).
            # At the beginning of an episode, to get things started,
            # we need to allocate some initial values in the required format,
            # i.e. the same size as the output of the layer.
            #
            # see http://pytorch.org/docs/master/_modules/torch/nn/modules/rnn.html#LSTM
            # for details
            #
            # Optionally, you can remove the LSTM to simplify the code.
            # Think: what is the possible loss?
            cx = Variable(torch.zeros(1, 256)).type(FloatTensor)  # all zeros, since there is nothing there yet
            hx = Variable(torch.zeros(1, 256)).type(FloatTensor)
        else:
            cx = Variable(cx.data)  # take the last computed value as the next input
            hx = Variable(hx.data)  # basically this is to detach from the previous comp graph

        states = []
        values = []
        log_probs = []
        rewards = []
        entropies = []

        for i in range(ac_steps):  # running through the 20 steps
            t += 1
            # Running the model returns the value, the action logits and the LSTM state
            v, logit, (hx, cx) = model((state, (hx, cx)))
            states.append(state)
            prob = F.softmax(logit)  # action probabilities from the logits
            log_prob = F.log_softmax(logit)  # log-probabilities, computed separately for numerical stability
            # the entropy term encourages diversity in the chosen actions (exploration)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)
            # detach - anything computed with pytorch drags a computation graph
            # behind it; we do not want gradients to flow through the sampling
            # step, so we detach the sampled action.
            action = prob.multinomial().detach()  # detach -- so backprop will NOT go through multinomial()
            # use the current action as an index to get the corresponding log
            # probability; gather picks out the probabilities of many actions
            # simultaneously
            log_prob = log_prob.gather(1, action)
            # extract the value, turning a torch integer into a "normal" Python integer
            action = action.data[0, 0]
            # env.step executes the action and returns four values: state, reward,
            # done, and an info dict we do not care about (hence the underscore).
            state, reward, done, _ = env.step(action)
            reward_sum += reward
            reward_sum1 += reward  # the reward sum is tracked twice, once per reporting window
            done = (done or t >= max_episode_length)
            if done:
                t_ = t
                t = 0
                state = env.reset()
                episodes += 1
                if episodes % 10 == 0:
                    time_str = time.strftime("%Hh %Mm %Ss",
                                             time.gmtime(time.time() - start_time))
                    print("Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {}".format(time_str, rank, episodes,
                                                     reward_sum / 10.0, t_))
                    reward_sum = 0.0
                if episodes % checkpoint_n == 0:
                    ave_reward = reward_sum1 / checkpoint_n
                    if best_reward < ave_reward:
                        isbest = 1
                        best_reward = ave_reward
                    print("Saving checkpoint Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {} best_reward {}".format(
                              get_elapsed_time_str(), rank, episodes,
                              ave_reward, t_, best_reward))
                    checkpoint_fname = os.path.join(
                        args.savedir,
                        args.rom + '_worker' + str(rank) + '_' + str(episodes))
                    save_checkpoint({'epoch': episodes,
                                     'average_reward': ave_reward,
                                     'time': time.time(),
                                     'state_dict': model.state_dict(),
                                     'optimizer': optimizer.state_dict()},
                                    isbest, checkpoint_fname)
                    reward_sum1 = 0.0
            state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor),
                             requires_grad=False)
            reward = max(min(reward, 1), -1)
            values.append(v)
            log_probs.append(log_prob)  # keep a record
            rewards.append(reward)
            if done:
                break

        # We reach here because either
        # i) an episode ends, such as game over, or
        # ii) we have explored a certain number of steps into the future and
        #     now it is time to look back and summarise
        if done:
            R = torch.zeros(1, 1).type(FloatTensor)  # at game over, the terminal state has value 0
        else:
            # otherwise bootstrap: use the model to estimate the value of the current state
            value, _, _ = model((state, (hx, cx)))
            R = value.data
        values.append(Variable(R))
        critic_loss = 0
        actor_loss = 0
        R = Variable(R)
        gae = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]  # R - the long-term reward
            advantage = R - values[i]  # type: Variable, advantage against the average
            # Compare with the actual long-term reward. Note: we are reversing the
            # experience of a complete trajectory. If the full length is 100
            # (time indexes are among 0, 1, 2, ..., 99), and now i=50, that means
            # we have processed all information in steps 51, 52, ..., 99,
            # and R contained the actual long-term reward at time step 51 at
            # the beginning of this iteration. The computation above injects the
            # reward at step 50 into R; now R is the long-term reward at this step.
            #
            # The so-called advantage is then the "unexpected gain/loss". It forms
            # the basis for evaluating the action taken at this step (50).
            #
            # critic_loss accumulates those "exceptional gains/losses" so that later
            # we adjust our expectation for each state and reduce future surprises
            # (the advantage against an expectation is only meaningful when the
            # expectation itself is meaningful).
            critic_loss += 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            # see https://arxiv.org/abs/1506.02438
            # We could use the plain advantage to compute the direction in which to
            # adjust the policy, but this manipulation improves stability (as
            # claimed by the paper).
            #
            # Note the advantage implicitly contributes to GAE, since it helps
            # achieve a good estimation of state-values.
            td_error = rewards[i] + gamma * values[i + 1].data - values[i].data
            gae = gae * gamma * tau + td_error
            # log_probs[i] is the log-probability of the action taken. If GAE is
            # large, the choice we made was good, and we want to make the same
            # action decision in the future -- make log_probs[i] large. Otherwise,
            # we add log_probs to our regret and will be less likely to take the
            # same action in the future.
            #
            # Entropy measures the spread of a probability distribution;
            # encouraging large entropies encourages exploration.
            actor_loss -= (Variable(gae) * log_probs[i] + 0.01 * entropies[i])

        optimizer.zero_grad()  # clear old gradients before back-propagation
        total_loss = actor_loss + critic_loss * 0.5  # type: Variable
        total_loss.backward()  # back-propagation
        # this is to improve stability
        torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
        # push this worker's gradients to the shared model
        ensure_shared_grads(model, shared_model)
        optimizer.step()
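# ensure_shared_grads is called throughout these snippets but never defined in
# them. In the pytorch-a3c lineage it is commonly implemented roughly as below;
# this is a sketch of the usual idea, not necessarily the exact version these
# files use. It points the shared model's gradient slots at the worker's
# freshly computed gradients so the shared optimizer's step consumes them.
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            # gradients are already attached for this step; leave them alone
            return
        shared_param._grad = param.grad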
def test(shared_model, render=0):
    env = create_atari_env(args.rom)
    if render == 1:
        env.render()
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    # a quick hack to prevent the agent from getting stuck
    episode_length = 0
    cx = hx = None
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
            hx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model(
            (Variable(state.unsqueeze(0).type(FloatTensor), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        # print(logit.data.numpy())
        action = prob.max(1, keepdim=True)[1].data.cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0])
        if render:
            # env.render()
            # Spits out frames as images in the selected path
            img = env.render('rgb_array')
            imsave('/opt/tmp/img/pac-20000/frame_{:06d}.png'.format(episode_length),
                   img)
            """ TEST-DEMO-ONLY
            state_im = state.numpy()
            state_im.transpose()
            scipy.misc.imageio.saveim(state_im, filename-with-time-step-number)
            #ffmpeg
            END-WORKZONE """
        done = done or episode_length >= 10000
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                get_elapsed_time_str(), reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
def get_env_ac_space(env_id):
    from envs import create_atari_env
    return create_atari_env(env_id).action_space.n
def train(rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    avg_rew_win_size = 25
    avg_rew = 0
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    avg_rew_cnt = 0

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward
            reward = max(min(reward, 1), -1)

            # a quick hack to prevent the agent from getting stuck
            actions.append(action[0, 0])
            if actions.count(actions[0]) == actions.maxlen:
                done = True

            with lock:
                counter.value += 1

            if done:
                avg_rew = avg_rew + reward_sum
                if avg_rew_cnt % avg_rew_win_size == 0:
                    print(" avg. episode reward {}".format(avg_rew / avg_rew_win_size))
                    avg_rew = 0
                print("Time {}, episode reward {}, episode length {}".format(
                    time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                    reward_sum, episode_length))
                episode_length = 0
                reward_sum = 0
                actions.clear()
                state = env.reset()
                avg_rew_cnt = avg_rew_cnt + 1

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
def test(rank, args, shared_model, counter, logger):
    console_f = logger.init_console_log_file()

    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()

    max_score = 0

    start_time = time.time()
    while True:
        if args.max_counter_num != 0 and counter.value > args.max_counter_num:
            if args.save_policy_models:
                logger.save_policy_model(shared_model, counter.value + 1)
            exit(0)
        # monitor the counter value
        if counter.value % args.testing_every_counter > 1:
            continue
        counter_value = counter.value
        model.load_state_dict(shared_model.state_dict())
        if args.save_policy_models:
            if counter_value % args.save_policy_models_every <= 5:
                logger.save_policy_model(shared_model, counter_value)

        state = env.reset()
        state = torch.from_numpy(state)
        reward_sum = 0
        done = True

        # a quick hack to prevent the agent from getting stuck
        # actions = deque(maxlen=100)
        # actions = deque(maxlen=500)
        actions = deque(maxlen=1000)
        episode_length = 0
        episode_count = 0
        episode_rewards_sum = 0
        episode_length_sum = 0
        while True:
            episode_length += 1
            # Sync with the shared model
            with torch.no_grad():
                if done:
                    cx = Variable(torch.zeros(1, 256))
                    hx = Variable(torch.zeros(1, 256))
                else:
                    cx = Variable(cx.data)
                    hx = Variable(hx.data)

                value, logit, (hx, cx) = model((Variable(state.unsqueeze(0)),
                                                (hx, cx)))
                prob = F.softmax(logit, dim=1)
                action = prob.max(1, keepdim=True)[1].data.numpy()

            state, reward, done, _ = env.step(action[0, 0])
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward

            # a quick hack to prevent the agent from getting stuck
            actions.append(action[0, 0])
            if actions.count(actions[0]) == actions.maxlen:
                done = True

            if done:
                episode_count += 1
                episode_rewards_sum += reward_sum
                episode_length_sum += episode_length

                if episode_count == args.testing_episodes_num:
                    print("Time {}, num steps {}, FPS {:.0f}, avg episode reward {}, avg episode length {}".format(
                        time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                        counter_value,
                        counter_value / (time.time() - start_time),
                        episode_rewards_sum / args.testing_episodes_num,
                        episode_length_sum / args.testing_episodes_num))
                    logger.write_results_log(
                        console_f,
                        time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                        counter_value,
                        counter_value / (time.time() - start_time),
                        episode_rewards_sum / args.testing_episodes_num,
                        episode_length_sum / args.testing_episodes_num)
                    if args.save_max and (episode_rewards_sum / args.testing_episodes_num) >= max_score:
                        max_score = episode_rewards_sum / args.testing_episodes_num
                        logger.save_policy_model(shared_model, count="max_reward")
                    break

                reward_sum = 0
                episode_length = 0
                actions.clear()
                state = env.reset()

            state = torch.from_numpy(state)
def train(rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.gae_lambda + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
class Params():
    def __init__(self):
        self.lr = 0.0001
        self.gamma = 0.99
        self.tau = 1.
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'


# Main run
os.environ['OMP_NUM_THREADS'] = '1'  # 1 thread per core
params = Params()  # creating the params object from the Params class, which sets all the model parameters
torch.manual_seed(params.seed)  # setting the seed (not essential)
env = create_atari_env(params.env_name)  # we create an optimized environment thanks to universe
# shared_model is the model shared by the different agents (different threads in different cores)
shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
# storing the model in the shared memory of the computer, which allows the threads
# to access it even if they run on different cores
shared_model.share_memory()
# the optimizer is also shared because it acts on the shared model
optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=params.lr)
# likewise, the optimizer state is stored in shared memory so that all the agents can access it
optimizer.share_memory()
processes = []  # initializing the processes with an empty list
# creating the 'test' process with the arguments 'args' passed to the 'test' target
# function - the 'test' process doesn't update the shared model, it only uses it -
# torch.multiprocessing.Process runs a function in an independent process
p = mp.Process(target=test, args=(params.num_processes, params, shared_model))
p.start()  # starting the created process p
processes.append(p)  # adding the created process p to the list of processes
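# my_optim.SharedAdam is imported in the snippets above but not shown anywhere
# in this file. A minimal sketch of the usual idea -- an Adam optimizer whose
# state buffers are moved into shared memory so every worker process updates
# the same statistics. The exact class these repos use may differ (e.g. it may
# also override step() to handle the tensor-valued step counter).
import torch
import torch.optim as optim

class SharedAdam(optim.Adam):
    def __init__(self, params, lr=1e-4):
        super(SharedAdam, self).__init__(params, lr=lr)
        # pre-allocate the Adam state so it exists before share_memory() is called
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        # move every state buffer into shared memory
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()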
def train(rank, params, shared_model, optimizer):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name)  # getting the environment
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    state = env.reset()
    state = torch.from_numpy(state)
    done = True
    episode_length = 0
    while True:
        episode_length += 1
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)
        values = []
        log_probs = []
        rewards = []
        entropies = []
        for step in range(params.num_steps):
            value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)),
                                                    (hx, cx)))
            prob = F.softmax(action_values)
            log_prob = F.log_softmax(action_values)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)
            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))
            values.append(value)
            log_probs.append(log_prob)
            state, reward, done, _ = env.step(action.numpy())
            done = (done or episode_length >= params.max_episode_length)
            reward = max(min(reward, 1), -1)
            if done:
                episode_length = 0
                state = env.reset()
            state = torch.from_numpy(state)
            rewards.append(reward)
            if done:
                break
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data
        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = params.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
            gae = gae * params.gamma * params.tau + TD
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]
        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
def train(
    rank, args, shared_model, shared_curiosity,
    counter, lock, pids, optimizer,
    train_policy_losses, train_value_losses, train_rewards
):
    pids.append(os.getpid())

    torch.manual_seed(args.seed + rank)

    if args.game == 'doom':
        env = create_doom_env(
            args.env_name, rank,
            num_skip=args.num_skip, num_stack=args.num_stack)
    elif args.game == 'atari':
        env = create_atari_env(args.env_name)
    elif args.game == 'picolmaze':
        env = create_picolmaze_env(args.num_rooms)
    env.seed(args.seed + rank)

    model = ActorCritic(
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)
    curiosity = IntrinsicCuriosityModule(  # ICM
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)

    if optimizer is None:
        # optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
        optimizer = optim.Adam(  # ICM
            chain(shared_model.parameters(), shared_curiosity.parameters()),
            lr=args.lr)

    model.train()
    curiosity.train()  # ICM

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    killer = Killer()
    while not killer.kill_now:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        curiosity.load_state_dict(shared_curiosity.state_dict())  # ICM

        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        inv_loss = torch.tensor(0.0)  # ICM
        forw_loss = torch.tensor(0.0)  # ICM

        for step in range(args.num_steps):
            if done:
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)
            episode_length += 1

            value, logit, (hx, cx) = model(state.unsqueeze(0), hx, cx)
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)

            # Entropy trick
            if 'sparse' in args.env_name.lower():
                max_entropy = torch.log(
                    torch.tensor(logit.size()[1], dtype=torch.float))
                entropy = entropy \
                    if entropy <= args.max_entropy_coef * max_entropy \
                    else torch.tensor(0.0)

            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).flatten().detach()
            log_prob = log_prob.gather(1, action.view(1, 1))

            state_old = state  # ICM

            state, external_reward, done, _ = env.step(action)
            state = torch.from_numpy(state)

            # external reward = 0 if ICM-only mode
            external_reward = external_reward * (1 - args.icm_only)

            # <---ICM---
            inv_out, forw_out, curiosity_reward = \
                curiosity(
                    state_old.unsqueeze(0), action,
                    state.unsqueeze(0))
            # In noreward-rl:
            # self.invloss = tf.reduce_mean(
            #     tf.nn.sparse_softmax_cross_entropy_with_logits(logits, aindex),
            #     name="invloss")
            # self.forwardloss = 0.5 * tf.reduce_mean(tf.square(tf.subtract(f, phi2)), name='forwardloss')
            # self.forwardloss = self.forwardloss * 288.0  # lenFeatures=288. Factored out to make hyperparams not depend on it.
            current_inv_loss = F.nll_loss(F.log_softmax(inv_out, dim=-1), action)
            current_forw_loss = curiosity_reward
            inv_loss += current_inv_loss
            forw_loss += current_forw_loss

            curiosity_reward = args.eta * curiosity_reward

            reward = max(min(external_reward, args.clip), -args.clip) + \
                max(min(curiosity_reward.detach(), args.clip), -args.clip)
            # ---ICM--->

            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        train_rewards[rank - 1] = sum(rewards)

        # <---ICM---
        inv_loss = inv_loss / episode_length
        forw_loss = forw_loss * (32 * 3 * 3) * 0.5 / episode_length

        curiosity_loss = args.lambda_1 * (
            (1 - args.beta) * inv_loss + args.beta * forw_loss)
        # ---ICM--->

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model(state.unsqueeze(0), hx, cx)
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.gae_lambda + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        train_policy_losses[rank - 1] = float(policy_loss.detach().item())
        train_value_losses[rank - 1] = float(value_loss.detach().item())

        (policy_loss + args.value_loss_coef * value_loss
         + curiosity_loss).backward()  # ICM
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        torch.nn.utils.clip_grad_norm_(curiosity.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        ensure_shared_grads(curiosity, shared_curiosity)
        optimizer.step()

    env.close()
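# IntrinsicCuriosityModule is referenced above but not defined in this file.
# A compact sketch of the ICM idea from Pathak et al. (2017): an inverse model
# predicts the action from phi(s) and phi(s'), a forward model predicts
# phi(s') from phi(s) and the action, and the forward prediction error serves
# as the curiosity reward. The layer sizes here are illustrative, not the ones
# the repo above uses.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyICM(nn.Module):
    def __init__(self, feat_dim, num_actions):
        super(TinyICM, self).__init__()
        self.encoder = nn.Linear(feat_dim, 32)                 # stand-in for the conv stack
        self.inverse = nn.Linear(32 * 2, num_actions)          # (phi(s), phi(s')) -> action logits
        self.forward_model = nn.Linear(32 + num_actions, 32)   # (phi(s), a) -> phi(s')

    def forward(self, s, action, s_next):
        phi = F.relu(self.encoder(s))
        phi_next = F.relu(self.encoder(s_next))
        inv_out = self.inverse(torch.cat([phi, phi_next], dim=1))
        a_onehot = F.one_hot(action, num_classes=inv_out.size(1)).float()
        forw_out = self.forward_model(torch.cat([phi, a_onehot], dim=1))
        # curiosity reward: forward prediction error in feature space
        curiosity_reward = 0.5 * (forw_out - phi_next.detach()).pow(2).sum(1)
        return inv_out, forw_out, curiosity_reward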
def train(rank, args, shared_model, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model((Variable(state.unsqueeze(0)),
                                            (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)
    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                        args.lstm_size)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    # actions = deque(maxlen=100)
    episode_length = 0
    currentPath = os.getcwd()
    File = open(currentPath + '/record.txt', 'a+')
    print("\n\n\n\n------------------------------\n\n\n\n\n")
    File.write("\n\n\n\n------------------------------\n\n\n\n\n")
    File.close()
    cnt = 0
    episode_number = 0
    while True:
        env.render()
        cnt = cnt + 1
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            hx = Variable(torch.zeros(1, args.lstm_size), volatile=True)
            cx = Variable(torch.zeros(1, args.lstm_size), volatile=True)
        else:
            hx = Variable(hx.data, volatile=True)
            cx = Variable(cx.data, volatile=True)
        # print(state)
        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True),
                                        (hx, cx)))
        prob = F.softmax(logit)
        # action = prob.max(1)[1].data.numpy()
        action = prob.multinomial().data
        # if args.env_name == 'Breakout-v3':
        #     state, reward, done, _ = env.step(1)
        #     reward_sum += reward
        # state, reward, done, _ = env.step(action[0, 0])
        state, reward, done, _ = env.step(action.numpy())
        done = done  # or episode_length >= args.max_episode_length
        if episode_length >= args.max_episode_length:
            done = True
            reward_sum -= 30
        reward_sum += reward
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True
        # if reward != 0:
        #     print("ep %d : game finished, reward: %d" % (episode_number, reward)
        #           + ('' if reward == -1 else ' !!!!!!!!'))
        if done:
            hour = int(time.strftime("%H", time.gmtime(time.time() - start_time)))
            _min = int(time.strftime("%M", time.gmtime(time.time() - start_time)))
            print("Time {}, episode reward {}, episode length {}".format(
                hour * 60 + _min + args.starttime, reward_sum, episode_length))
            File = open(currentPath + '/record.txt', 'a+')
            File.write("Time {}, episode reward {}, episode length {}\n".format(
                hour * 60 + _min + args.starttime, reward_sum, episode_length))
            File.close()
            reward_sum = 0
            episode_length = 0
            # actions.clear()
            state = env.reset()
            torch.save(model.state_dict(), currentPath + '/A3C.t7')
            episode_number += 1
            time.sleep(60)
        state = torch.from_numpy(state)