def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = ActorCritic(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    net.train()
    running_score = 0

    for e in range(3000):
        done = False
        score = 0

        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            if args.render:
                env.render()

            policy, value = net(state)
            action = get_action(policy, num_actions)

            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            transition = [state, next_state, action, reward, mask]
            train_model(net, optimizer, transition, policy, value)

            score += reward
            state = next_state

        # compensate for the -1 given on early termination
        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score

        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            writer.add_scalar('log/score', float(score), e)  # log against the episode index

        if running_score > args.goal_score:
            ckpt_path = os.path.join(args.save_path, 'model.pth')
            torch.save(net.state_dict(), ckpt_path)
            print('running score exceeds the goal score, stopping')
            break
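# get_action and train_model are used above but not defined in this excerpt.
# A minimal sketch of get_action, assuming `policy` is already a softmax
# distribution of shape (1, num_actions) produced by the network:
import numpy as np

def get_action(policy, num_actions):
    policy = policy.detach().cpu().numpy()[0]
    return np.random.choice(num_actions, 1, p=policy)[0]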
def run(args):
    device = torch.device("cpu")
    env = gym.make('SpaceInvaders-v0')
    state_size = env.observation_space.shape
    action_size = env.action_space.n

    model = ActorCritic([1, 4, 84, 84], action_size).to(device)
    opt = SharedRMSprop(model.parameters(), lr=args.lr, alpha=args.alpha,
                        eps=1e-8, weight_decay=args.weight_decay,
                        momentum=args.momentum, centered=False)
    opt_lock = mp.Lock()
    scheduler = LRScheduler(args)

    if args.load_fp:
        checkpoint = torch.load(args.load_fp)
        model.load_state_dict(checkpoint['model_state_dict'])
        opt.load_state_dict(checkpoint['optimizer_state_dict'])

    if args.train:
        start = time.time()
        model.share_memory()
        model.train()

        step_counter, max_reward, ma_reward, ma_loss = [
            mp.Value('d', 0.0) for _ in range(4)
        ]

        processes = []
        if args.num_procs == -1:
            args.num_procs = mp.cpu_count()
        for rank in range(args.num_procs):
            p = mp.Process(target=train,
                           args=(rank, args, device, model, opt, opt_lock,
                                 scheduler, step_counter, max_reward,
                                 ma_reward, ma_loss))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()

        if args.verbose > 0:
            print(f"Seconds taken: {time.time() - start}")
        if args.save_fp:
            torch.save(
                {
                    'model_state_dict': model.state_dict(),
                    # 'optimizer_state_dict': opt.state_dict(),
                }, args.save_fp)

    if args.test:
        model.eval()
        test(args, device, model)
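# SharedRMSprop is not defined in this excerpt. A minimal sketch in the spirit
# of the SharedAdam pattern from pytorch-a3c: pre-allocate the optimizer state
# and move it into shared memory so worker processes update a single copy.
# This is an assumption about the class used above, not its actual code.
import torch
import torch.optim as optim

class SharedRMSprop(optim.RMSprop):
    def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8,
                 weight_decay=0, momentum=0, centered=False):
        super().__init__(params, lr=lr, alpha=alpha, eps=eps,
                         weight_decay=weight_decay, momentum=momentum,
                         centered=centered)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['square_avg'] = p.data.new_zeros(p.data.size())
                if momentum > 0:
                    state['momentum_buffer'] = p.data.new_zeros(p.data.size())
                if centered:
                    state['grad_avg'] = p.data.new_zeros(p.data.size())

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                for v in self.state[p].values():
                    v.share_memory_()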
def coordinator(rank, args, share_model, exp_queues, model_params):
    assert len(exp_queues) == args.num_processes

    model = ActorCritic()
    model.train()
    # send the initial parameters to every worker
    for i in range(args.num_processes):
        model_params[i].put(model.state_dict())

    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-5)
    entropy_coef = args.entropy_coef
    count = 0
    while True:
        count += 1
        # anneal the entropy coefficient over the course of training
        if count >= 14000:
            entropy_coef = 1
        if count >= 17000:
            entropy_coef = 0.5
        if count >= 19000:
            entropy_coef = 0.1

        # assemble experiences from the agents
        for i in range(args.num_processes):
            s_batch, a_batch, r_batch, done = exp_queues[i].get()
            loss = compute_loss(args, s_batch, a_batch, r_batch, done,
                                model, entropy_coef)
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            if torch.isnan(loss):
                # dump the offending batch for post-mortem debugging
                torch.save(s_batch, 's_batch-coor.pt')
                torch.save(loss, 'loss.pt')
                print('s_batch', s_batch)
                print('loss: ', loss)
                break
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)
            optimizer.step()

        print('update model parameters ', count)
        if torch.isnan(loss):
            break

        # broadcast the updated parameters to the workers
        for i in range(args.num_processes):
            model_params[i].put(model.state_dict())
        share_model.load_state_dict(model.state_dict())
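# compute_loss is referenced above but not defined in this excerpt. A minimal
# sketch of an n-step A2C loss, assuming the model maps a batch of states to
# (logits, values); bootstrapping from the last state's value stands in for
# V(s_{t+n}) since the next state is not part of the batch.
import torch
import torch.nn.functional as F

def compute_loss(args, s_batch, a_batch, r_batch, done, model, entropy_coef):
    logits, values = model(s_batch)  # assumed model interface
    values = values.view(-1)
    log_probs = F.log_softmax(logits, dim=-1)
    probs = F.softmax(logits, dim=-1)

    # discounted n-step returns, zero bootstrap on episode end
    R = torch.zeros(()) if done else values[-1].detach()
    returns = torch.zeros(len(r_batch))
    for i in reversed(range(len(r_batch))):
        R = r_batch[i] + args.gamma * R
        returns[i] = R

    advantage = returns - values
    chosen = log_probs.gather(1, a_batch.view(-1, 1)).view(-1)
    policy_loss = -(chosen * advantage.detach()).mean()
    value_loss = advantage.pow(2).mean()
    entropy = -(probs * log_probs).sum(-1).mean()
    return policy_loss + 0.5 * value_loss - entropy_coef * entropy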
def main():
    parse_cmd_args()
    sess = tf.Session()
    K.set_session(sess)
    db = Database()
    env = Environment(db, argus)
    actor_critic = ActorCritic(env, sess,
                               learning_rate=argus['learning_rate'],
                               train_min_size=argus['train_min_size'],
                               size_mem=argus['maxlen_mem'],
                               size_predict_mem=argus['maxlen_predict_mem'])

    num_trials = argus['num_trial']

    # First iteration: warm up the database and seed the replay memory.
    env.preheat()

    cur_state = env._get_obs()  # np.array (inner_metric + sql)
    cur_state = cur_state.reshape((1, env.state.shape[0]))
    action = env.fetch_action()  # np.array
    action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory

    # apply the action -> let the system reach a steady state -> return the reward
    new_state, reward, done, _ = env.step(action, 0, 1)
    new_state = new_state.reshape((1, env.state.shape[0]))
    reward_np = np.array([reward])
    print("0-shape-")
    print(new_state.shape)

    actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
    actor_critic.train()  # no-op while the memory holds fewer than 32 samples
    cur_state = new_state

    for i in range(num_trials):
        cur_state = cur_state.reshape((1, env.state.shape[0]))
        action, isPredicted = actor_critic.act(cur_state)
        print(action)
        action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory

        new_state, reward, done, _ = env.step(action, isPredicted, i + 1)
        new_state = new_state.reshape((1, env.state.shape[0]))
        reward_np = np.array([reward])
        print("%d-shape-" % i)
        print(new_state.shape)

        actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
        actor_critic.train()
        cur_state = new_state
def train():
    memory = []
    Transition = collections.namedtuple(
        "Transition",
        ["state", "action", "reward", "next_state", "next_action"])
    model = ActorCritic(flags.n_actions, flags.n_features, flags.lr_C,
                        flags.lr_A, flags.gamma, empty_goal_action)
    loss_his = []
    entropy_his = []
    reward_his = []

    for ii in range(flags.max_epoch):
        state = env.reset()
        init_state = state.copy()
        reward_all = 0
        done = False
        steps = 0
        loss = 0
        t_start = time.time()
        action = model.choose_action(state)
        while not done:
            next_state, reward, done, _ = env.step(action)
            next_action = model.choose_action(next_state)
            reward_all += reward
            steps += 1

            if len(memory) > flags.memory_size:
                memory.pop(0)
            memory.append(
                Transition(state, action, reward, next_state, next_action))
            state = next_state
            action = next_action

            if len(memory) > flags.batch_size:
                batch_transition = random.sample(memory, flags.batch_size)
                batch_state, batch_action, batch_reward, batch_next_state, \
                    batch_next_action = map(np.array, zip(*batch_transition))
                loss, _ = model.train(state=batch_state,
                                      action=batch_action,
                                      reward=batch_reward,
                                      state_=batch_next_state,
                                      action_=batch_next_action)

        entropy = model.compute_entropy(init_state)
        if loss != 0:
            loss_his.append(loss)
        entropy_his.append(entropy)
        reward_his.append(reward_all)
        print("epoch=", ii, "/time=", time.time() - t_start, "/loss=", loss,
              "/entropy=", entropy, "/reward=", reward_all)

    return loss_his, entropy_his, reward_his
def train_curiosity(rank, args, shared_model, shared_curiosity,
                    counter, lock, pids, optimizer):
    pids.append(os.getpid())

    torch.manual_seed(args.seed + rank)

    if args.game == 'doom':
        env = create_doom_env(args.env_name, rank,
                              num_skip=args.num_skip,
                              num_stack=args.num_stack)
    elif args.game == 'atari':
        env = create_atari_env(args.env_name)
    elif args.game == 'picolmaze':
        env = create_picolmaze_env(args.num_rooms)
    env.seed(args.seed + rank)

    model = ActorCritic(args.num_stack, env.action_space)
    curiosity = IntrinsicCuriosityModule(args.num_stack, env.action_space)  # ICM

    if optimizer is None:
        optimizer = optim.Adam(  # ICM
            chain(shared_model.parameters(), shared_curiosity.parameters()),
            lr=args.lr)

    model.train()
    curiosity.train()  # ICM

    model.load_state_dict(shared_model.state_dict())

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    killer = Killer()
    while not killer.kill_now:
        # Sync with the shared model
        curiosity.load_state_dict(shared_curiosity.state_dict())  # ICM

        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        inv_loss = torch.tensor(0.0)   # ICM
        forw_loss = torch.tensor(0.0)  # ICM

        for step in range(args.num_steps):
            if done:
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)

            episode_length += 1
            value, logit, (hx, cx) = model(state.unsqueeze(0), hx, cx)
            prob = F.softmax(logit, dim=-1)
            action = prob.multinomial(num_samples=1).flatten().detach()

            state_old = state  # ICM
            state, external_reward, done, _ = env.step(action)
            state = torch.from_numpy(state)

            # external reward = 0 if ICM-only mode
            external_reward = external_reward * (1 - args.icm_only)

            # <---ICM---
            inv_out, forw_out, curiosity_reward = curiosity(
                state_old.unsqueeze(0), action, state.unsqueeze(0))
            # In noreward-rl:
            # self.invloss = tf.reduce_mean(
            #     tf.nn.sparse_softmax_cross_entropy_with_logits(logits, aindex),
            #     name="invloss")
            # self.forwardloss = 0.5 * tf.reduce_mean(tf.square(tf.subtract(f, phi2)), name='forwardloss')
            # self.forwardloss = self.forwardloss * 288.0  # lenFeatures=288. Factored out to make hyperparams not depend on it.
            current_inv_loss = F.nll_loss(F.log_softmax(inv_out, dim=-1), action)
            current_forw_loss = curiosity_reward
            inv_loss += current_inv_loss
            forw_loss += current_forw_loss
            # ---ICM--->

            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            if done:
                break

        # <---ICM---
        inv_loss = inv_loss / episode_length
        forw_loss = forw_loss * (32 * 3 * 3) * 0.5 / episode_length

        curiosity_loss = args.lambda_1 * (
            (1 - args.beta) * inv_loss + args.beta * forw_loss)
        # ---ICM--->

        optimizer.zero_grad()
        curiosity_loss.backward()  # ICM
        torch.nn.utils.clip_grad_norm_(curiosity.parameters(),
                                       args.max_grad_norm)
        ensure_shared_grads(curiosity, shared_curiosity)
        optimizer.step()

    env.close()
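# IntrinsicCuriosityModule is not defined in this excerpt. A minimal sketch
# after Pathak et al. (2017), assuming the interface used above:
# curiosity(s_t, a_t, s_t1) -> (inverse-model logits, forward-model prediction,
# curiosity reward). Assuming 42x42 inputs, the feature size is 288 = 32*3*3,
# which matches the scaling factor applied to forw_loss above.
import torch
import torch.nn as nn
import torch.nn.functional as F

class IntrinsicCuriosityModule(nn.Module):
    def __init__(self, num_inputs, action_space, feat_size=288):
        super().__init__()
        num_actions = action_space.n
        self.encoder = nn.Sequential(  # phi(s)
            nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1), nn.ELU(),
            nn.Conv2d(32, 32, 3, stride=2, padding=1), nn.ELU(),
            nn.Conv2d(32, 32, 3, stride=2, padding=1), nn.ELU(),
            nn.Conv2d(32, 32, 3, stride=2, padding=1), nn.ELU(),
            nn.Flatten())
        self.inverse = nn.Sequential(   # predicts a_t from (phi1, phi2)
            nn.Linear(2 * feat_size, 256), nn.ReLU(),
            nn.Linear(256, num_actions))
        self.forward_model = nn.Sequential(  # predicts phi2 from (phi1, a_t)
            nn.Linear(feat_size + num_actions, 256), nn.ReLU(),
            nn.Linear(256, feat_size))

    def forward(self, state_old, action, state):
        phi1, phi2 = self.encoder(state_old), self.encoder(state)
        inv_out = self.inverse(torch.cat([phi1, phi2], dim=1))
        a_onehot = F.one_hot(action, inv_out.size(1)).float()
        forw_out = self.forward_model(torch.cat([phi1, a_onehot], dim=1))
        curiosity_reward = 0.5 * (forw_out - phi2.detach()).pow(2).mean()
        return inv_out, forw_out, curiosity_reward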
env.preheat()

# First iteration
cur_state = env._get_obs()  # np.array (inner_metric + sql)
cur_state = cur_state.reshape((1, env.state.shape[0]))
action = env.fetch_action()  # np.array
action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory

# apply the action -> let the system reach a steady state -> return the reward
new_state, reward, done, score, _ = env.step(action, 0, 1)
new_state = new_state.reshape((1, env.state.shape[0]))
reward_np = np.array([reward])
print("0-shape")
print(new_state.shape)

actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
actor_critic.train()  # no-op while the memory holds fewer than 32 samples
cur_state = new_state

predicted_rewardList = []
for epoch in range(num_trials):
    cur_state = cur_state.reshape((1, env.state.shape[0]))
    action, isPredicted = actor_critic.act(cur_state)
    print(action)
    action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory

    new_state, reward, done, score, _ = env.step(action, isPredicted, epoch + 1)
    new_state = new_state.reshape((1, env.state.shape[0]))

    if isPredicted == 1:
        predicted_rewardList.append([epoch, reward])
def train(rank, args, shared_model, counter, lock, optimizer=None,
          DEBUG=False):
    if DEBUG:
        print('rank: {}'.format(rank))
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.train()
    if DEBUG:
        print('agent{:03d}: model created'.format(rank))

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    if DEBUG:
        print('agent{:03d}: optimizer created'.format(rank))

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        if DEBUG:
            print('agent{:03d}: while loop'.format(rank))
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            if DEBUG:
                print('agent{:03d}: for loop p1'.format(rank))
            episode_length += 1
            if DEBUG:
                print('agent{:03d}: for loop p1.1'.format(rank))
                print(state.unsqueeze(0).size())
            with lock:
                value, logit, (hx, cx) = model(
                    (Variable(state.unsqueeze(0)), (hx, cx)))
            if DEBUG:
                print('agent{:03d}: for loop p2'.format(rank))
            prob = F.softmax(logit, dim=1)
            log_prob = F.log_softmax(logit, dim=1)
            if DEBUG:
                print('agent{:03d}: for loop p3'.format(rank))
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)
            if DEBUG:
                print('agent{:03d}: for loop p4'.format(rank))

            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))
            if DEBUG:
                print('agent{:03d}: for loop p5'.format(rank))

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)
            if DEBUG:
                print('agent{:03d}: for loop p6'.format(rank))

            with lock:
                counter.value += 1
                if DEBUG:
                    print('agent{:03d}: counter plus {:09d}'.format(
                        rank, counter.value))
            if DEBUG:
                print('agent{:03d}: for loop p7'.format(rank))

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
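# ensure_shared_grads is called throughout these workers but never defined in
# this excerpt. A minimal sketch in the style of pytorch-a3c: hand the local
# gradients to the shared model, but only if no other worker got there first.
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad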
while True:
    if episode_length % steps == 0:
        model.low_lr(rate)

    if (episode_length % 1000 == 0) and (episode_length > 20000):
        if dataset == 'cifar':
            model.eval()
            map = test_util.test(Dtest, model, batch_size, bit_len)
            file = open(logpath, "a")
            file.write('#### map=' + str(map) + '\n')
            file.close()
            path = checkpoint_path + '/' + str(episode_length) + '.model'
            torch.save(model.state_dict(), path)
            model.train()

    if dataset == 'cifar':
        ori, pos, neg = traintest.get_batch_cifar_nus(batch_size)
    else:
        ori, pos, neg = traintest.get_batch_flk_nus(batch_size)

    ori = Variable(ori).cuda()
    pos = Variable(pos).cuda()
    neg = Variable(neg).cuda()

    hash_o = Variable(torch.zeros(batch_size, 1).cuda())
    hash_p = Variable(torch.zeros(batch_size, 1).cuda())
    hash_n = Variable(torch.zeros(batch_size, 1).cuda())

    probs_o = model(ori)
    probs_p = model(pos)
def train(rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    avg_rew_win_size = 25
    avg_rew = 0
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    avg_rew_cnt = 0

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward
            reward = max(min(reward, 1), -1)

            # a quick hack to prevent the agent from getting stuck
            actions.append(action[0, 0])
            if actions.count(actions[0]) == actions.maxlen:
                done = True

            with lock:
                counter.value += 1

            if done:
                avg_rew = avg_rew + reward_sum
                if avg_rew_cnt % avg_rew_win_size == 0:
                    print(" avg. episode reward {}".format(
                        avg_rew / avg_rew_win_size))
                    avg_rew = 0
                print("Time {}, episode reward {}, episode length {}".format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, episode_length))
                episode_length = 0
                reward_sum = 0
                actions.clear()
                state = env.reset()
                avg_rew_cnt = avg_rew_cnt + 1

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
def train(rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    backend, env_name = args.env_name.split(':')
    if backend == 'unity3d':
        os.chdir('/mnt/code/')
        env = create_unity3d_env(train_mode=False,
                                 file_name=os.path.join(UNITYFOLDER, env_name),
                                 worker_id=rank, seed=args.seed + rank,
                                 docker_training=True)
    elif backend == 'gym':
        env = create_atari_env(env_name)
        env.seed(args.seed + rank)
    else:
        print(f' [!]: {backend} is not a valid backend')
        raise ValueError

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state).float()
    done = True

    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((Variable(state.unsqueeze(0)),
                                            (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state).float()
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
def train(rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1] - values[i]
            gae = gae * args.gamma * args.gae_lambda + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
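# The backward pass above is Generalized Advantage Estimation (Schulman et al.,
# 2015). With discount gamma and trace parameter lambda (args.gae_lambda), the
# TD residual and the advantage estimate are
#
#     delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
#     A_hat_t = sum_{l >= 0} (gamma * lambda)^l * delta_{t+l}
#             = delta_t + gamma * lambda * A_hat_{t+1}
#
# and the loop's recursion, gae = gae * gamma * gae_lambda + delta_t, computes
# exactly this backward form.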
def train(rank, params, shared_model, optimizer):
    # shifting the seed with rank to asynchronize each training agent
    torch.manual_seed(params.seed + rank)
    # creating an optimized environment thanks to the create_expansionai_env function
    env = create_expansionai_env(params.env_name, video=True, params=params)
    # aligning the seed of the environment on the seed of the agent
    env.seed(params.seed + rank)

    # creating the model from the ActorCritic class
    model = ActorCritic(env.observation_space.shape[0], env.action_space, params)
    model.train()

    state = env.reset()  # state is a numpy array of size 1*42*42, in black & white
    logger.debug("Current training state {}".format(state))
    state = torch.from_numpy(state)  # converting the numpy array into a torch tensor
    done = True  # whether the game is done
    episode_length = 0  # initializing the length of an episode to 0

    while True:
        state = state.float()
        # synchronizing with the shared model: the agent gets the shared model to explore for num_steps
        model.load_state_dict(shared_model.state_dict())
        if done:  # first iteration of the while loop, or the game was just done
            cx = Variable(torch.zeros(1, params.lstm_size))  # the cell states of the LSTM are reinitialized to zero
            hx = Variable(torch.zeros(1, params.lstm_size))  # the hidden states of the LSTM are reinitialized to zero
        else:
            cx = Variable(cx.data)  # keep the old cell states, wrapped in a torch Variable
            hx = Variable(hx.data)  # keep the old hidden states, wrapped in a torch Variable

        values = []      # list of values (V(S))
        log_probs = []   # list of log probabilities
        rewards = []     # list of rewards
        entropies = []   # list of entropies

        for step in range(params.num_steps):  # going through the num_steps exploration steps
            episode_length += 1
            # the model returns the value V(S) of the critic, the Q-values Q(S,A) of the actor,
            # and the new hidden & cell states
            value, action_value, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            # distribution of probabilities of the Q-values according to the softmax:
            # prob(a) = exp(q(a)) / sum_b exp(q(b))
            prob = F.softmax(action_value)
            # distribution of log probabilities: log_prob(a) = log(prob(a))
            log_prob = F.log_softmax(action_value)
            entropy = -(log_prob * prob).sum(1)  # H(p) = - sum_x p(x) log(p(x))
            entropies.append(entropy)  # storing the computed entropy

            # selecting an action by taking a random draw from the prob distribution
            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))  # log prob of the selected action

            # playing the selected action, reaching the new state, and getting the new reward
            action_to_take = action.numpy()
            state, reward, done, _ = env.step(action_to_take)
            # if the episode lasts too long (the agent is stuck), then it is done
            done = (done or episode_length >= params.max_episode_length)
            reward = max(min(reward, 1), -1)  # clamping the reward between -1 and +1
            logger.debug(
                "Train action {} brought reward {} should we done {} after step {} in episode {} with state \n{}".format(
                    action_to_take, reward, done, step, episode_length, state[1]))

            if episode_length % 100 == 0:
                logger.info(
                    "Train episode {} and current rewards {} with armies {} occupied cells {} and movable cells {}".format(
                        episode_length, rewards, env.unwrapped.armies,
                        env.unwrapped.occupied_cells_num,
                        env.unwrapped.movable_cells_num))

            if done:  # if the episode is done:
                episode_length = 0
                prev_state = state
                state = env.reset()  # we restart the environment
                logger.info(
                    "Train episode reward {}, episode length {} steps {}".format(
                        reward, episode_length, step))

            state = torch.from_numpy(state).float()  # tensorizing the new state
            values.append(value)        # storing the value V(S) of the state
            log_probs.append(log_prob)  # storing the log prob of the action
            rewards.append(reward)      # storing the new observed reward

            if done:
                # stop the exploration and move straight on to updating the shared model
                break

        R = torch.zeros(1, 1)  # initializing the cumulative reward
        if not done:
            # initialize the cumulative reward with the value of the last reached state
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data
        values.append(Variable(R))  # storing the value V(S) of the last reached state

        policy_loss = 0  # initializing the policy loss
        value_loss = 0   # initializing the value loss
        R = Variable(R)  # making sure the cumulative reward R is a torch Variable
        gae = torch.zeros(1, 1)  # initializing the Generalized Advantage Estimation to 0
        for i in reversed(range(len(rewards))):  # starting from the last exploration step and going back in time
            # R = r_i + gamma*R, i.e. R = r_0 + gamma r_1 + ... + gamma^(n-1) r_(n-1) + gamma^n V(last_state)
            R = params.gamma * R + rewards[i]
            # R estimates Q at time t = i, so advantage_i = Q_i - V(state_i) = R - values[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)  # computing the value loss
            # computing the temporal difference
            delta_t = rewards[i] + params.gamma * values[i + 1].data - values[i].data
            # gae = sum_i (gamma*tau)^i * delta_t(i), via gae_i = gae_(i+1)*gamma*tau + delta_t(i)
            gae = gae * params.gamma * params.tau + delta_t
            # computing the policy loss
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()  # zeroing the gradients of the optimizer
        # the value loss is halved, so the (smaller) policy loss effectively carries twice the weight
        (policy_loss + 0.5 * value_loss).backward(retain_graph=True)
        # clamping the gradient norm to 40 to prevent it from taking huge values and degenerating the algorithm
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)
        # making sure the model of the agent and the shared model share the same gradient
        ensure_shared_grads(model, shared_model)
        optimizer.step()  # running the optimization step
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    action_size = env.action_space.n
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)
    model.train()

    if not args.on_policy:
        memory = EpisodicReplayMemory(args.memory_capacity,
                                      args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                hx, avg_hx = Variable(torch.zeros(1, args.hidden_size)), \
                    Variable(torch.zeros(1, args.hidden_size))
                cx, avg_cx = Variable(torch.zeros(1, args.hidden_size)), \
                    Variable(torch.zeros(1, args.hidden_size))
                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                action, reward, done, episode_length = 0, 0, False, 0
            else:
                # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs, actions, rewards, average_policies = \
                [], [], [], [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                input = extend_input(state,
                                     action_to_one_hot(action, action_size),
                                     reward)
                policy, Q, V, (hx, cx) = model(Variable(input), (hx, cx))
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    Variable(input), (avg_hx, avg_cx))

                # Sample action
                # Graph broken as loss for stochastic action calculated manually
                action = policy.multinomial().data[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action)
                next_state = state_to_tensor(next_state)
                reward = args.reward_clip and min(max(reward, -1), 1) or reward  # Optionally clamp rewards
                done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline training
                    memory.append(input, action, reward, policy.data)  # Save just tensors

                # Save outputs for online training
                [arr.append(el) for arr, el in
                 zip((policies, Qs, Vs, actions, rewards, average_policies),
                     (policy, Q, V, Variable(torch.LongTensor([[action]])),
                      Variable(torch.Tensor([[reward]])), average_policy))]

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = Variable(torch.zeros(1, 1))

                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(
                        extend_input(state,
                                     action_to_one_hot(action, action_size),
                                     reward), None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(Variable(input), (hx, cx))
                Qret = Qret.detach()

            # Train the network on-policy
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episodes
                trajectories = memory.sample_batch(args.batch_size,
                                                   maxlen=args.t_max)

                # Reset hidden state
                hx, avg_hx = Variable(torch.zeros(args.batch_size, args.hidden_size)), \
                    Variable(torch.zeros(args.batch_size, args.hidden_size))
                cx, avg_cx = Variable(torch.zeros(args.batch_size, args.hidden_size)), \
                    Variable(torch.zeros(args.batch_size, args.hidden_size))

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards, old_policies, average_policies = \
                    [], [], [], [], [], [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    # Unpack first half of transition
                    input = torch.cat(tuple(trajectory.state
                                            for trajectory in trajectories[i]), 0)
                    action = Variable(torch.LongTensor(
                        [trajectory.action
                         for trajectory in trajectories[i]])).unsqueeze(1)
                    reward = Variable(torch.Tensor(
                        [trajectory.reward
                         for trajectory in trajectories[i]])).unsqueeze(1)
                    old_policy = Variable(torch.cat(tuple(
                        trajectory.policy
                        for trajectory in trajectories[i]), 0))

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(Variable(input), (hx, cx))
                    average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                        Variable(input), (avg_hx, avg_cx))

                    # Save outputs for offline training
                    [arr.append(el) for arr, el in
                     zip((policies, Qs, Vs, actions, rewards,
                          average_policies, old_policies),
                         (policy, Q, V, action, reward,
                          average_policy, old_policy))]

                    # Unpack second half of transition
                    next_input = torch.cat(tuple(
                        trajectory.state
                        for trajectory in trajectories[i + 1]), 0)
                    done = Variable(torch.Tensor(
                        [trajectory.action is None
                         for trajectory in trajectories[i + 1]]).unsqueeze(1))

                # Do forward pass for all transitions
                _, _, Qret, _ = model(Variable(next_input), (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach()

                # Train the network off-policy
                _train(args, T, model, shared_model, shared_average_model,
                       optimiser, policies, Qs, Vs, actions, rewards, Qret,
                       average_policies, old_policies=old_policies)
        done = True

    env.close()
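# _poisson is used above to set the number of off-policy updates per on-policy
# episode but is not defined in this excerpt. A minimal sketch, assuming it
# draws from a Poisson distribution with mean args.replay_ratio, as in the
# ACER paper (Knuth's algorithm):
import math
import random

def _poisson(lmbd):
    L, k, p = math.exp(-lmbd), 0, 1.0
    while True:
        k += 1
        p *= random.random()
        if p <= L:
            return k - 1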
def main():
    # Pick the device for the network
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Build the network
    net = ActorCritic()
    net = net.to(device)

    # Prepare the optimizer
    optimizer = torch.optim.Adam(net.parameters(), lr=3e-4)

    # Prepare the environments
    envs = Envs(NUM_WORKERS, gamma=GAMMA)

    # Start training
    for episode in range(EPISODES):
        # Collect one episode of data from several environments
        net.eval()
        with torch.no_grad():
            states = envs.reset()
            done = False
            while not done:
                states = states.to(device)
                _, policys = net(states)
                policys = policys.cpu()  # easier to post-process on the CPU
                # Zero out the probability of illegal moves
                for i in range(NUM_WORKERS):
                    if envs.reversis[i].next != 0:
                        for y, x in itertools.product(range(SIZE), repeat=2):
                            if not envs.reversis[i].good[y][x]:
                                policys[i][y * SIZE + x] = 0.
                            else:
                                policys[i][y * SIZE + x] += 1e-8  # guard against an all-zero distribution

                actions = Categorical(probs=policys).sample()
                done, states = envs.step(actions)

        envs.setReturn()
        data = EpisodeData(envs.readHistory())
        loader = DataLoader(data, batch_size=BATCH_SIZE,
                            shuffle=True, num_workers=2)

        # Train the network
        net.train()
        # Running metrics
        value_loss_total = 0.
        entropy_total = 0.
        for states, actions, Returns in loader:
            states, actions, Returns = states.to(device), actions.to(device), Returns.to(device)
            values, policys = net(states)

            dist = Categorical(probs=policys)
            action_log_probs = dist.log_prob(actions).view(-1, 1)
            # a larger entropy keeps the model exploratory
            dist_entropy = dist.entropy().mean()

            advantages = Returns.view(-1, 1) - values
            value_loss = advantages.pow(2).mean()
            action_loss = -(advantages.detach() * action_log_probs).mean()

            optimizer.zero_grad()
            (VALUE_LOSS_COEF * value_loss + action_loss -
             ENTROPY_LOSS_COEF * dist_entropy).backward()
            optimizer.step()

            value_loss_total += value_loss.item()
            entropy_total += dist_entropy.item()

        print('Episode: {:>10d}, Value Loss: {:g}, Entropy: {:g}'.format(
            episode, value_loss_total / len(loader),
            entropy_total / len(loader)), flush=True)

        if episode != 0 and episode % SAVE_INTERVAL == 0:
            if not os.path.isdir('models'):
                os.mkdir('models')
            torch.save(net.state_dict(),
                       'models/{}.pt'.format(episode // SAVE_INTERVAL))
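# EpisodeData is not defined in this excerpt. A minimal sketch, assuming the
# history read from the workers is a flat list of (state, action, Return)
# tuples; the exact record format is an assumption.
import torch
from torch.utils.data import Dataset

class EpisodeData(Dataset):
    def __init__(self, history):
        self.history = history

    def __len__(self):
        return len(self.history)

    def __getitem__(self, idx):
        state, action, Return = self.history[idx]
        return state, action, torch.tensor(Return, dtype=torch.float)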
def train(rank, args, shared_model, counter, lock, logger, optimizer=None):
    if args.save_sigmas:
        sigmas_f = logger.init_one_sigmas_file(rank)
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    if args.add_rank_reg:
        if args.rank_reg_type == "maxdividemin":
            rank_reg = MaxDivideMin.apply
        elif args.rank_reg_type == "maxminusmin":
            rank_reg = MaxMinusMin.apply

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    local_counter = 0
    episode_length = 0
    while True:
        if args.max_counter_num != 0 and counter.value > args.max_counter_num:
            exit(0)
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []
        if args.add_rank_reg:
            hiddens = [None] * 2  # 0: last layer, 1: last last layer

        for step in range(args.num_steps):
            episode_length += 1
            model_inputs = (Variable(state.unsqueeze(0)), (hx, cx))
            if args.add_rank_reg:
                value, logit, (hx, cx), internal_features = model(
                    model_inputs, return_features=True)
            else:
                value, logit, (hx, cx) = model(model_inputs)
            prob = F.softmax(logit, dim=1)
            log_prob = F.log_softmax(logit, dim=1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            if args.add_rank_reg:
                if hiddens[0] is None:
                    hiddens[0] = internal_features[-1]
                    hiddens[1] = internal_features[-2]
                else:
                    hiddens[0] = torch.cat([hiddens[0], internal_features[-1]])
                    hiddens[1] = torch.cat([hiddens[1], internal_features[-2]])

            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            local_counter += 1
            with lock:
                if local_counter % 20 == 0:
                    counter.value += 20

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = args.gamma * values[i + 1].data - values[i].data + rewards[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - args.entropy_coef * entropies[i]

        total_loss = policy_loss + args.value_loss_coef * value_loss

        # internal layers regularizer
        retain_graph = None
        if args.add_rank_reg:
            current_rankreg_coef = args.rank_reg_coef
            if args.save_sigmas and local_counter % args.save_sigmas_every <= 3:
                norm = rank_reg(hiddens[0], current_rankreg_coef,
                                counter.value, sigmas_f, logger)
            else:
                norm = rank_reg(hiddens[0], current_rankreg_coef)
            total_loss = total_loss + norm

        optimizer.zero_grad()
        total_loss.backward(retain_graph=retain_graph)
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
def train(rank, args, shared_model, counter, lock, optimizer=None,
          select_sample=True):
    FloatTensor = torch.cuda.FloatTensor if args.use_cuda else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if args.use_cuda else torch.LongTensor

    env = setup_env(args.env_name)

    model = ActorCritic(1, env.action_space.n)
    if args.use_cuda:
        model.cuda()

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = prepro(env.reset())
    state = torch.from_numpy(state)
    done = True
    episode_length = 0

    for num_iter in count():
        if rank == 0:
            if num_iter % args.save_interval == 0 and num_iter > 0:
                torch.save(shared_model.state_dict(), args.save_path)
        # Second saver in case the first process crashes
        if num_iter % (args.save_interval * 2.5) == 0 and num_iter > 0 and rank == 1:
            torch.save(shared_model.state_dict(), args.save_path)

        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 512)).type(FloatTensor)
            hx = Variable(torch.zeros(1, 512)).type(FloatTensor)
        else:
            cx = Variable(cx.data).type(FloatTensor)
            hx = Variable(hx.data).type(FloatTensor)

        values, log_probs, rewards, entropies = [], [], [], []
        actions, forwards, vec_st1s, inverses = [], [], [], []

        for step in range(args.num_steps):
            episode_length += 1
            state_inp = Variable(state.unsqueeze(0)).type(FloatTensor)
            value, logit, (hx, cx) = model((state_inp, (hx, cx)), False)
            s_t = state

            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(-1, keepdim=True)
            entropies.append(entropy)

            if select_sample:
                action = prob.multinomial(num_samples=1).data
            else:
                action = prob.max(-1, keepdim=True)[1].data
            log_prob = log_prob.gather(-1, Variable(action))

            action_out = action.to(torch.device("cpu"))

            # one-hot encode the chosen action for the ICM heads
            oh_action = torch.Tensor(1, env.action_space.n).type(LongTensor)
            oh_action.zero_()
            oh_action.scatter_(1, action, 1)
            a_t = oh_action.type(FloatTensor)

            state, reward, done, _ = env.step(action_out.numpy()[0][0])
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            state = torch.from_numpy(prepro(state))
            s_t1 = state

            # ICM: predict the next state's features and use the prediction
            # error as the intrinsic reward
            vec_st1, inverse, forward = model(
                (Variable(s_t.unsqueeze(0)).type(FloatTensor),
                 Variable(s_t1.unsqueeze(0)).type(FloatTensor), a_t), True)
            reward_intrinsic = args.eta * ((vec_st1 - forward).pow(2)).sum(1) / 2.
            reward_intrinsic = reward_intrinsic.to(torch.device("cpu"))
            reward += reward_intrinsic
            reward1 = reward_intrinsic

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = torch.from_numpy(prepro(env.reset()))

            values.append(value)
            log_probs.append(log_prob)
            reward1 = reward1.type(FloatTensor)
            rewards.append(reward1)
            forwards.append(forward)
            vec_st1s.append(vec_st1)
            inverses.append(inverse)
            actions.append(a_t)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            state_inp = Variable(state.unsqueeze(0)).type(FloatTensor)
            value, _, _ = model((state_inp, (hx, cx)), False)
            R = value.data

        values.append(Variable(R).type(FloatTensor))
        policy_loss = 0
        value_loss = 0
        forward_loss = 0
        inverse_loss = 0
        R = Variable(R).type(FloatTensor)
        gae = torch.zeros(1, 1).type(FloatTensor)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae).type(FloatTensor) - \
                args.entropy_coef * entropies[i]

            forward_err = forwards[i] - vec_st1s[i]
            forward_loss = forward_loss + 0.5 * (forward_err.pow(2)).sum(1)

            cross_entropy = -(actions[i] * torch.log(inverses[i] + 1e-15)).sum(1)
            inverse_loss = inverse_loss + cross_entropy

        optimizer.zero_grad()
        ((1 - args.beta) * inverse_loss +
         args.beta * forward_loss).backward(retain_graph=True)
        (args.lmbda * (policy_loss + 0.5 * value_loss)).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
def train(rank, args, T, shared_model, optimiser):
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    action_size = env.action_space.n
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size, args.no_noise, args.noise_entropy)
    model.train()

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # Sync with shared model at least every t_max steps
        model.load_state_dict(shared_model.state_dict())
        # Get starting timestep
        t_start = t

        # Reset or pass on hidden state
        if done:
            hx = Variable(torch.zeros(1, args.hidden_size))
            cx = Variable(torch.zeros(1, args.hidden_size))
            # Reset environment and done flag
            state = state_to_tensor(env.reset())
            action, reward, done, episode_length = 0, 0, False, 0
        else:
            # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
            hx = hx.detach()
            cx = cx.detach()
        model.sample_noise()  # Pick a new noise vector (until next optimisation step)

        # Lists of outputs for training
        values, log_probs, rewards, entropies = [], [], [], []

        while not done and t - t_start < args.t_max:
            input = extend_input(state,
                                 action_to_one_hot(action, action_size),
                                 reward, episode_length)
            # Calculate policy and value
            policy, value, (hx, cx) = model(Variable(input), (hx, cx))
            log_policy = policy.log()
            entropy = -(log_policy * policy).sum(1)

            # Sample action
            action = policy.multinomial()
            # Graph broken as loss for stochastic action calculated manually
            log_prob = log_policy.gather(1, action.detach())
            action = action.data[0, 0]

            # Step
            state, reward, done, _ = env.step(action)
            state = state_to_tensor(state)
            reward = args.reward_clip and min(max(reward, -1), 1) or reward  # Optionally clamp rewards
            done = done or episode_length >= args.max_episode_length
            episode_length += 1  # Increase episode counter

            # Save outputs for training
            [arr.append(el) for arr, el in
             zip((values, log_probs, rewards, entropies),
                 (value, log_prob, reward, entropy))]

            # Increment counters
            t += 1
            T.increment()

        # Return R = 0 for terminal s or V(s_i; θ) for non-terminal s
        if done:
            R = Variable(torch.zeros(1, 1))
        else:
            _, R, _ = model(Variable(input), (hx, cx))
            R = R.detach()
        values.append(R)

        # Train the network
        policy_loss = 0
        value_loss = 0
        A_GAE = torch.zeros(1, 1)  # Generalised advantage estimator Ψ
        # Calculate n-step returns in forward view, stepping backwards from the last state
        trajectory_length = len(rewards)
        for i in reversed(range(trajectory_length)):
            # R ← r_i + γR
            R = rewards[i] + args.discount * R
            # Advantage A = R - V(s_i; θ)
            A = R - values[i]
            # dθ ← dθ - ∂A^2/∂θ
            value_loss += 0.5 * A ** 2  # Least squares error

            # TD residual δ = r + γV(s_i+1; θ) - V(s_i; θ)
            td_error = rewards[i] + args.discount * values[i + 1].data - values[i].data
            # Generalised advantage estimator Ψ (roughly of form ∑(γλ)^t∙δ)
            A_GAE = A_GAE * args.discount * args.trace_decay + td_error
            # dθ ← dθ + ∇θ∙log(π(a_i|s_i; θ))∙Ψ
            policy_loss -= log_probs[i] * Variable(A_GAE)  # Policy gradient loss
            if args.no_noise or args.noise_entropy:
                # dθ ← dθ + β∙∇θH(π(s_i; θ))
                policy_loss -= args.entropy_weight * entropies[i]  # Entropy maximisation loss

        # Optionally normalise loss by number of time steps
        if not args.no_time_normalisation:
            policy_loss /= trajectory_length
            value_loss /= trajectory_length

        # Zero shared and local grads
        optimiser.zero_grad()
        # Note that losses were defined as negatives of normal update rules for gradient descent
        (policy_loss + value_loss).backward()
        # Gradient L2 normalisation
        nn.utils.clip_grad_norm(model.parameters(), args.max_gradient_norm, 2)

        # Transfer gradients to shared model and update
        _transfer_grads_to_shared_model(model, shared_model)
        optimiser.step()
        if not args.no_lr_decay:
            # Linearly decay learning rate
            _adjust_learning_rate(
                optimiser,
                max(args.lr * (args.T_max - T.value()) / args.T_max, 1e-32))

    env.close()
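# _transfer_grads_to_shared_model and _adjust_learning_rate are used above but
# not defined in this excerpt. Minimal sketches under the usual A3C
# conventions: copy local gradients into the shared model once, and set a new
# learning rate on every parameter group.
def _transfer_grads_to_shared_model(model, shared_model):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad

def _adjust_learning_rate(optimiser, lr):
    for param_group in optimiser.param_groups:
        param_group['lr'] = lr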
def train(pid, rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    filepath = "./train_model_" + str(rank)
    env = gym.wrappers.Monitor(env, filepath, force=True)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space.n)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    obs = env.reset()
    state = torch.from_numpy(obs)
    done = True

    while True:
        # if the parent process is killed by "kill -9", the child kills itself
        pps = psutil.Process(pid=pid)
        try:
            if pps.status() in (psutil.STATUS_DEAD, psutil.STATUS_STOPPED):
                break
        except psutil.NoSuchProcess:
            break

        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())

        values = []
        log_probs = []
        rewards = []
        entropies = []

        if done:
            cx = torch.zeros(1, 512)
            hx = torch.zeros(1, 512)
        else:
            cx = cx.detach()
            hx = hx.detach()

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)

            # sampled from the multinomial probability distribution
            action = prob.multinomial(num_samples=1).detach()  # [[1]]
            log_prob = log_prob.gather(1, action)

            obs, reward, done, _ = env.step(action.numpy())
            # episode-length cap and reward clamping are intentionally disabled here:
            # done = done or episode_length >= args.max_episode_length
            # reward = max(min(reward, 1), -1)

            with lock:
                counter.value += 1

            if done:
                obs = env.reset()

            state = torch.from_numpy(obs)
            entropies.append(entropy)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # Gradient = ∇θ′ log π(a_t|s_t; θ′)[R_t − V(s_t; θ_v)] + β ∇θ′ H(π(s_t; θ′))
        # (defaults: gae_lambda 1.00, entropy_coef 0.01, value_loss_coef 0.5,
        #  max_grad_norm 40, gamma 0.99)
        R = torch.zeros(1, 1)  # if done, R = [[0]]
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)  # Generalized Advantage Estimation
        for i in reversed(range(len(rewards))):
            # advantage = Q - V
            R = rewards[i] + args.gamma * R  # n-step return
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            td_error = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.gae_lambda + td_error

            policy_loss = policy_loss - log_probs[i] * gae.detach() \
                - args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        # if this call errors, PyTorch 1.4.0 is known to work
        (policy_loss + args.value_loss_coef * value_loss).backward()
        # clamp the gradient norm to max_grad_norm to keep it from blowing up
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
def train(rank, args, share_model, counter, lock):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    optimizer = optim.Adam(share_model.parameters(), lr=args.lr)
    model.train()

    state = env.reset()
    state = torch.FloatTensor(state)
    done = True
    episode_length = 0

    while True:
        model.load_state_dict(share_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((Variable(state.unsqueeze(0)),
                                            (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.FloatTensor(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            delta_t = rewards[i] + args.gamma * values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - log_probs[i] * Variable(gae) \
                - args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, share_model)
        optimizer.step()
def train(rank, args, T, shared_model, optimiser):
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)
    model.train()

    t = 1  # Thread step counter
    epr, eploss, done = 0, 0, True  # Start new episode

    while T.value() <= args.T_max:
        while True:
            # Sync with shared model
            model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Save values for computing gradients
            policies, Vs, actions, rewards = [], [], [], []

            # Reset or pass on hidden state
            if done:
                hx, avg_hx = Variable(torch.zeros(1, args.hidden_size)), \
                    Variable(torch.zeros(1, args.hidden_size))
                cx, avg_cx = Variable(torch.zeros(1, args.hidden_size)), \
                    Variable(torch.zeros(1, args.hidden_size))
                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                done, episode_length = False, 0
            else:
                # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                policy, V, (hx, cx) = model(Variable(state), (hx, cx))

                # Sample action
                action = policy.multinomial().data[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action)
                next_state = state_to_tensor(next_state)
                reward = args.reward_clip and min(max(reward, -1), 1) or reward  # Optionally clamp rewards
                done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                episode_length += 1  # Increase episode counter

                # Save outputs for online training
                [arr.append(el) for arr, el in
                 zip((policies, Vs, actions, rewards),
                     (policy, V, Variable(torch.LongTensor([[action]])),
                      Variable(torch.Tensor([[reward]]))))]

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            if done:
                R = Variable(torch.zeros(1, 1))
            else:
                # R = V(s_i; θ) for non-terminal s
                _, R, _ = model(Variable(state), (hx, cx))
                R = R.detach()

            # Train the network on-policy
            p_loss, v_loss = _train(args, T, model, shared_model, optimiser,
                                    policies, Vs, actions, rewards, R)

            # Finish episode
            if done:
                break
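# _train is called above but not defined in this excerpt. A minimal sketch of
# a standard advantage actor-critic update over the stored lists; the
# signature is taken from the call above, the body is an assumption.
import torch

def _train(args, T, model, shared_model, optimiser,
           policies, Vs, actions, rewards, R):
    policy_loss, value_loss = 0, 0
    for i in reversed(range(len(rewards))):
        R = rewards[i] + args.discount * R
        advantage = R - Vs[i]
        value_loss = value_loss + 0.5 * advantage.pow(2)
        log_prob = policies[i].gather(1, actions[i]).log()
        policy_loss = policy_loss - log_prob * advantage.detach()

    optimiser.zero_grad()
    (policy_loss + value_loss).backward()
    # transfer local gradients to the shared model before stepping
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is None:
            shared_param._grad = param.grad
    optimiser.step()
    return policy_loss, value_loss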
def train(rank, args, shared_model, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True
    episode_length = 0

    while True:
        episode_length += 1
        # Sync with the shared model every iteration
        model.load_state_dict(shared_model.state_dict())
        if done:
            # initialization
            cx = Variable(torch.zeros(1, 128))
            hx = Variable(torch.zeros(1, 128))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            # for mujoco, env returns DoubleTensor
            value, mu, sigma_sq, (hx, cx) = model(
                (Variable(state.float().unsqueeze(0)), (hx, cx)))
            sigma_sq = F.softplus(sigma_sq)
            eps = torch.randn(mu.size())

            # sample an action and calculate its probability under the Gaussian policy
            action = (mu + sigma_sq.sqrt() * Variable(eps)).data
            prob = normal(action, mu, sigma_sq)
            entropy = -0.5 * ((sigma_sq + 2 * pi.expand_as(sigma_sq)).log() + 1)
            entropies.append(entropy)

            log_prob = prob.log()

            state, reward, done, _ = env.step(action.numpy())
            # prevent stuck agents
            done = done or episode_length >= args.max_episode_length
            # reward shaping
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _, _ = model(
                (Variable(state.float().unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        # calculate the returns backwards from the terminal state
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            # using xxx.data stops the gradient from flowing through these terms
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            # for Mujoco, the entropy coefficient is lowered to 0.0001
            policy_loss = policy_loss \
                - (log_probs[i] * Variable(gae).expand_as(log_probs[i])).sum() \
                - (0.0001 * entropies[i]).sum()

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
def train(rank, args, shared_model, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = WrapEnv(args.env_name)

    model = ActorCritic(4, env.num_actions, args.num_skips)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = np.concatenate([state] * 4, axis=0)  # Stack 4 copies of the first frame
    state = torch.from_numpy(state)
    done = True
    episode_length = 0
    sum_reward = 0

    for ep_counter in itertools.count(1):
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit = model(state.unsqueeze(0))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)
            action_np = action.numpy()[0][0]

            if action_np < model.n_real_acts:
                # Real action: step once and push the new frame into the stack
                state_new, reward, done, info = env.step(action_np)
                state = np.append(state.numpy()[1:, :, :], state_new, axis=0)
                done = done or episode_length >= args.max_episode_length
                reward = max(min(reward, 1), -1)
                episode_length += 1
            else:
                # Skip action: repeat NOOP=0 instead of a random action
                state = state.numpy()
                reward = 0.
                for _ in range(action_np - model.n_real_acts + 2):
                    state_new, rew, done, info = env.step(0)
                    state = np.append(state[1:, :, :], state_new, axis=0)
                    done = done or episode_length >= args.max_episode_length
                    reward += rew
                    episode_length += 1
                    if done:
                        break
                reward = max(min(reward, 1), -1)

            sum_reward += reward
            if done:
                state = env.reset()
                state = np.concatenate([state] * 4, axis=0)
                print('ep len {}, sum rew {}'.format(episode_length, sum_reward))
                episode_length = 0
                sum_reward = 0

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _ = model(state.unsqueeze(0))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)

        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            policy_loss = policy_loss - \
                log_probs[i] * advantage.detach() - 0.01 * entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40.)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
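# NOTE: almost every worker in this collection finishes an update with
# ensure_shared_grads(model, shared_model), whose definition is not included.
# The widely used version (from ikostrikov/pytorch-a3c) that these snippets
# appear to assume hands the local gradients to the shared model exactly once:

def ensure_shared_grads(model, shared_model):
    # Point the shared model's gradients at the local ones; if another worker
    # already set them for this step, leave the shared gradients untouched.
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad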
def train(rank, args, shared_model, counter, num_done, num_episode, arr, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = gym.make('MountainCar-v0').unwrapped
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state).float()
    done = True
    episode_length = 0
    reward = 0
    gae_lambda = args.gae_lambda1

    while counter.value < 120000000:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 4).float()
            hx = torch.zeros(1, 4).float()
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            with lock:
                num_episode.value += 1

            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward_step, done, _ = env.step(action.numpy()[0, 0])
            done = done or episode_length >= args.max_episode_length
            # Shaped running reward; the raw step reward is what gets stored below
            reward += reward_step * 0.01

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                reward = 10000
                state = env.reset()
                with lock:
                    num_done.value += 1

            state = torch.from_numpy(state).float()
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward_step)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        # Switch to the second GAE lambda late in training
        if counter.value > 47000000:
            gae_lambda = args.gae_lambda2

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)

        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * gae_lambda + delta_t
            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
def train(
        rank, args, shared_model, shared_curiosity, counter, lock,
        pids, optimizer, train_policy_losses, train_value_losses, train_rewards):
    pids.append(os.getpid())
    torch.manual_seed(args.seed + rank)

    if args.game == 'doom':
        env = create_doom_env(
            args.env_name, rank,
            num_skip=args.num_skip, num_stack=args.num_stack)
    elif args.game == 'atari':
        env = create_atari_env(args.env_name)
    elif args.game == 'picolmaze':
        env = create_picolmaze_env(args.num_rooms)
    env.seed(args.seed + rank)

    model = ActorCritic(args.num_stack, env.action_space)
    curiosity = IntrinsicCuriosityModule(args.num_stack, env.action_space)  # ICM

    if optimizer is None:
        # Optimize the policy and the curiosity module jointly
        optimizer = optim.Adam(  # ICM
            chain(shared_model.parameters(), shared_curiosity.parameters()),
            lr=args.lr)

    model.train()
    curiosity.train()  # ICM

    state = env.reset()
    state = torch.from_numpy(state)
    done = True
    episode_length = 0
    killer = Killer()

    while not killer.kill_now:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        curiosity.load_state_dict(shared_curiosity.state_dict())  # ICM

        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        inv_loss = torch.tensor(0.0)   # ICM
        forw_loss = torch.tensor(0.0)  # ICM

        for step in range(args.num_steps):
            if done:
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)
            episode_length += 1

            value, logit, (hx, cx) = model(state.unsqueeze(0), hx, cx)
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)

            # Entropy trick: in sparse-reward envs, zero out the entropy bonus
            # once it exceeds a fraction of the maximum possible entropy
            if 'sparse' in args.env_name.lower():
                max_entropy = torch.log(
                    torch.tensor(logit.size()[1], dtype=torch.float))
                entropy = entropy \
                    if entropy <= args.max_entropy_coef * max_entropy \
                    else torch.tensor(0.0)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).flatten().detach()
            log_prob = log_prob.gather(1, action.view(1, 1))

            state_old = state  # ICM
            state, external_reward, done, _ = env.step(action)
            state = torch.from_numpy(state)

            # External reward = 0 in ICM-only mode
            external_reward = external_reward * (1 - args.icm_only)

            # <---ICM---
            inv_out, forw_out, curiosity_reward = curiosity(
                state_old.unsqueeze(0), action, state.unsqueeze(0))
            # In noreward-rl:
            # self.invloss = tf.reduce_mean(
            #     tf.nn.sparse_softmax_cross_entropy_with_logits(logits, aindex),
            #     name="invloss")
            # self.forwardloss = 0.5 * tf.reduce_mean(tf.square(tf.subtract(f, phi2)), name='forwardloss')
            # self.forwardloss = self.forwardloss * 288.0  # lenFeatures=288. Factored out to make hyperparams not depend on it.
            current_inv_loss = F.nll_loss(F.log_softmax(inv_out, dim=-1), action)
            current_forw_loss = curiosity_reward
            inv_loss += current_inv_loss
            forw_loss += current_forw_loss

            curiosity_reward = args.eta * curiosity_reward
            reward = max(min(external_reward, args.clip), -args.clip) + \
                max(min(curiosity_reward.detach(), args.clip), -args.clip)
            # ---ICM--->

            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        train_rewards[rank - 1] = sum(rewards)

        # <---ICM---
        inv_loss = inv_loss / episode_length
        forw_loss = forw_loss * (32 * 3 * 3) * 0.5 / episode_length
        curiosity_loss = args.lambda_1 * (
            (1 - args.beta) * inv_loss + args.beta * forw_loss)
        # ---ICM--->

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model(state.unsqueeze(0), hx, cx)
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)

        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.gae_lambda + delta_t
            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        train_policy_losses[rank - 1] = float(policy_loss.detach().item())
        train_value_losses[rank - 1] = float(value_loss.detach().item())

        (policy_loss + args.value_loss_coef * value_loss + curiosity_loss).backward()  # ICM
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        torch.nn.utils.clip_grad_norm_(curiosity.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        ensure_shared_grads(curiosity, shared_curiosity)
        optimizer.step()

    env.close()
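# NOTE: the backward loop shared by most workers above mixes an n-step return R
# (the value-loss target) with Generalized Advantage Estimation (the policy-loss
# weight). A self-contained numeric sketch of that recursion; all names here are
# illustrative, not taken from any of the snippets:

import torch

gamma, lam = 0.99, 1.0
rewards = [0.0, 0.0, 1.0]  # toy 3-step rollout
values = [torch.tensor([[0.5]]), torch.tensor([[0.6]]),
          torch.tensor([[0.9]]), torch.tensor([[0.0]])]  # V(s_0)..V(s_3); last entry is the bootstrap

R = values[-1]
gae = torch.zeros(1, 1)
for i in reversed(range(len(rewards))):
    R = gamma * R + rewards[i]                               # n-step return target
    delta = rewards[i] + gamma * values[i + 1] - values[i]   # TD residual
    gae = gae * gamma * lam + delta                          # GAE accumulator
    print('t={}: R={:.3f}, gae={:.3f}'.format(i, R.item(), gae.item()))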
def local_train(index, opt, global_model, optimizer, save=False):
    torch.manual_seed(123 + index)
    if save:
        start_time = timeit.default_timer()
    writer = SummaryWriter(opt.log_path)

    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.train()

    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    curr_episode = 0

    while True:
        if save:
            if curr_episode % opt.save_interval == 0 and curr_episode > 0:
                torch.save(
                    global_model.state_dict(),
                    f"{opt.saved_path}/a3c_super_mario_bros_{opt.world}_{opt.stage}")
            print(f"Now Process {index}. Episode {curr_episode}")
        curr_episode += 1

        local_model.load_state_dict(global_model.state_dict())
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()

        log_policies = []
        values = []
        rewards = []
        entropies = []

        for _ in range(opt.num_local_steps):
            curr_step += 1
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
            policy = F.softmax(logits, dim=1)
            log_policy = F.log_softmax(logits, dim=1)
            entropy = -(policy * log_policy).sum(1, keepdim=True)

            m = Categorical(policy)
            action = m.sample().item()

            state, reward, done, _ = env.step(action)
            state = torch.from_numpy(state)
            if curr_step > opt.num_global_steps:
                done = True
            if done:
                curr_step = 0
                state = torch.from_numpy(env.reset())

            values.append(value)
            log_policies.append(log_policy[0, action])
            rewards.append(reward)
            entropies.append(entropy)

            if done:
                break

        R = torch.zeros((1, 1), dtype=torch.float)
        if not done:
            _, R, _, _ = local_model(state, h_0, c_0)

        gae = torch.zeros((1, 1), dtype=torch.float)
        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        next_value = R

        # Walk the rollout backwards to accumulate GAE and the losses
        for value, log_policy, reward, entropy in list(
                zip(values, log_policies, rewards, entropies))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() - value.detach()
            next_value = value
            actor_loss = actor_loss + log_policy * gae
            R = R * opt.gamma + reward
            critic_loss = critic_loss + (R - value) ** 2 / 2
            entropy_loss = entropy_loss + entropy

        total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss
        writer.add_scalar(f"Train_{index}/Loss", total_loss, curr_episode)
        optimizer.zero_grad()
        total_loss.backward()

        # Push local gradients to the global model (skip if already set)
        for local_param, global_param in zip(local_model.parameters(),
                                             global_model.parameters()):
            if global_param.grad is not None:
                break
            global_param._grad = local_param.grad

        optimizer.step()

        if curr_episode == int(opt.num_global_steps / opt.num_local_steps):
            print(f"Training process {index} terminated")
            if save:
                end_time = timeit.default_timer()
                print('The code runs for %.2f s ' % (end_time - start_time))
            return
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    torch.manual_seed(args.seed + rank)
    # CUDA
    if args.use_cuda:
        torch.cuda.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space, args.hidden_size)
    gpu_id = 0 if args.use_cuda else -1  # todo: 0 refers to the first GPU
    if gpu_id >= 0:
        model = model.cuda()
    model.train()

    if not args.on_policy:
        # Normalise memory capacity by number of training processes
        memory = EpisodicReplayMemory(
            args.memory_capacity // args.num_processes, args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    model.load_state_dict(shared_model.state_dict())
            else:
                model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                avg_hx = torch.zeros(1, args.hidden_size)
                avg_cx = torch.zeros(1, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(1, args.hidden_size).cuda()
                        cx = torch.zeros(1, args.hidden_size).cuda()
                else:
                    hx = torch.zeros(1, args.hidden_size)
                    cx = torch.zeros(1, args.hidden_size)
                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                if gpu_id >= 0:
                    state = state.cuda()
                done, episode_length = False, 0
            else:
                # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                policy, Q, V, (hx, cx) = model(state, (hx, cx))
                # The shared average model lives on the CPU, so move the state across
                if gpu_id >= 0:
                    to_avg_state = state.cpu()
                else:
                    to_avg_state = state
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    to_avg_state, (avg_hx, avg_cx))

                # Sample action
                action = torch.multinomial(policy, 1)[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action.item())
                next_state = state_to_tensor(next_state)
                if gpu_id >= 0:
                    next_state = next_state.cuda()
                reward = min(max(reward, -1), 1) if args.reward_clip else reward  # Optionally clamp rewards
                done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline training
                    memory.append(state, action, reward, policy.detach())  # Save just tensors

                # Save outputs for online training
                [arr.append(el) for arr, el in zip(
                    (policies, Qs, Vs, actions, rewards, average_policies),
                    (policy, Q, V, torch.LongTensor([[action]]),
                     torch.Tensor([[reward]]), average_policy))]

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = torch.zeros(1, 1)
                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(state, None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(state, (hx, cx))
                Qret = Qret.detach().cpu()

            # Train the network on-policy
            if gpu_id >= 0:
                Qs = list(map(lambda x: x.cpu(), Qs))
                Vs = list(map(lambda x: x.cpu(), Vs))
                policies = list(map(lambda x: x.cpu(), policies))
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret, average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episodes
                trajectories = memory.sample_batch(args.batch_size, maxlen=args.t_max)

                # Reset hidden state
                avg_hx = torch.zeros(args.batch_size, args.hidden_size)
                avg_cx = torch.zeros(args.batch_size, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(args.batch_size, args.hidden_size).cuda()
                        cx = torch.zeros(args.batch_size, args.hidden_size).cuda()
                else:
                    hx = torch.zeros(args.batch_size, args.hidden_size)
                    cx = torch.zeros(args.batch_size, args.hidden_size)

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards, old_policies, average_policies = [], [], [], [], [], [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    # Unpack first half of transition
                    state = torch.cat(
                        tuple(trajectory.state for trajectory in trajectories[i]), 0)
                    action = torch.LongTensor(
                        [trajectory.action for trajectory in trajectories[i]]).unsqueeze(1)
                    reward = torch.Tensor(
                        [trajectory.reward for trajectory in trajectories[i]]).unsqueeze(1)
                    old_policy = torch.cat(
                        tuple(trajectory.policy for trajectory in trajectories[i]), 0)

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(state, (hx, cx))
                    average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                        state, (avg_hx, avg_cx))

                    # Save outputs for offline training
                    [arr.append(el) for arr, el in zip(
                        (policies, Qs, Vs, actions, rewards, average_policies, old_policies),
                        (policy, Q, V, action, reward, average_policy, old_policy))]

                    # Unpack second half of transition
                    next_state = torch.cat(
                        tuple(trajectory.state for trajectory in trajectories[i + 1]), 0)
                    done = torch.Tensor(
                        [trajectory.action is None for trajectory in trajectories[i + 1]]).unsqueeze(1)

                # Do forward pass for all transitions
                _, _, Qret, _ = model(next_state, (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach().cpu()

                # Train the network off-policy
                if gpu_id >= 0:
                    Qs = list(map(lambda x: x.cpu(), Qs))
                    Vs = list(map(lambda x: x.cpu(), Vs))
                    policies = list(map(lambda x: x.cpu(), policies))
                _train(args, T, model, shared_model, shared_average_model,
                       optimiser, policies, Qs, Vs, actions, rewards, Qret,
                       average_policies, old_policies=old_policies)
        done = True

    env.close()
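# NOTE: the off-policy branch above draws the number of replay updates from
# _poisson(args.replay_ratio), another helper left undefined in these snippets.
# A minimal sketch using Knuth's algorithm, as in the ACER reference
# implementation this worker resembles:

import math
import random

def _poisson(lmbd):
    # Multiply uniforms until the running product drops below exp(-lambda);
    # the number of multiplications (minus one) is Poisson-distributed
    L, k, p = math.exp(-lmbd), 0, 1
    while p > L:
        k += 1
        p *= random.uniform(0, 1)
    return max(k - 1, 0)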
class A3C():
    '''Implementation of N-step Asynchronous Advantage Actor-Critic.'''

    def __init__(self, args, env, train=True):
        self.args = args
        self.set_random_seeds()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Create the environment.
        self.env = gym.make(env)
        self.environment_name = env

        # Setup model.
        self.policy = ActorCritic(4, self.env.action_space.n)
        self.policy.apply(self.initialize_weights)

        # Setup critic model.
        self.critic = ActorCritic(4, self.env.action_space.n)
        self.critic.apply(self.initialize_weights)

        # Setup optimizer.
        self.eps = 1e-10  # To avoid divide-by-zero error.
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=args.policy_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=args.critic_lr)

        # Model weights path.
        self.timestamp = datetime.now().strftime('a2c-breakout-%Y-%m-%d_%H-%M-%S')
        self.weights_path = 'models/%s/%s' % (self.environment_name, self.timestamp)

        # Load pretrained weights.
        if args.weights_path:
            self.load_model()
        self.policy.to(self.device)
        self.critic.to(self.device)

        # Video render mode.
        if args.render:
            self.policy.eval()
            self.generate_episode(render=True)
            self.plot()
            return

        # Data for plotting.
        self.rewards_data = []  # n * [epoch, mean(returns), std(returns)]

        # Network training mode.
        if train:
            # Tensorboard logging.
            self.logdir = 'logs/%s/%s' % (self.environment_name, self.timestamp)
            self.summary_writer = SummaryWriter(self.logdir)

            # Save hyperparameters.
            with open(self.logdir + '/training_parameters.json', 'w') as f:
                json.dump(vars(self.args), f, indent=4)

    def initialize_weights(self, layer):
        if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def set_random_seeds(self):
        torch.manual_seed(self.args.random_seed)
        np.random.seed(self.args.random_seed)
        torch.backends.cudnn.benchmark = True

    def save_model(self, epoch):
        '''Helper function to save model state and weights.'''
        if not os.path.exists(self.weights_path):
            os.makedirs(self.weights_path)
        torch.save(
            {
                'policy_state_dict': self.policy.state_dict(),
                'policy_optimizer': self.policy_optimizer.state_dict(),
                'critic_state_dict': self.critic.state_dict(),
                'critic_optimizer': self.critic_optimizer.state_dict(),
                'rewards_data': self.rewards_data,
                'epoch': epoch
            }, os.path.join(self.weights_path, 'model_%d.h5' % epoch))

    def load_model(self):
        '''Helper function to load model state and weights.'''
        if os.path.isfile(self.args.weights_path):
            print('=> Loading checkpoint', self.args.weights_path)
            self.checkpoint = torch.load(self.args.weights_path)
            self.policy.load_state_dict(self.checkpoint['policy_state_dict'])
            self.policy_optimizer.load_state_dict(self.checkpoint['policy_optimizer'])
            self.critic.load_state_dict(self.checkpoint['critic_state_dict'])
            self.critic_optimizer.load_state_dict(self.checkpoint['critic_optimizer'])
            self.rewards_data = self.checkpoint['rewards_data']
        else:
            raise Exception('No checkpoint found at %s' % self.args.weights_path)

    def train(self):
        '''Trains the policy and critic, one episode per update, using n-step returns.'''
        for epoch in range(self.args.num_episodes):
            # Generate episode data.
            returns, log_probs, value_function, train_rewards = self.generate_episode()
            self.summary_writer.add_scalar('train/cumulative_rewards', train_rewards, epoch)
            self.summary_writer.add_scalar('train/trajectory_length', returns.size()[0], epoch)

            # Compute loss and policy gradient.
            self.policy_optimizer.zero_grad()
            policy_loss = ((returns - value_function.detach()) * -log_probs).mean()
            policy_loss.backward()
            self.policy_optimizer.step()

            self.critic_optimizer.zero_grad()
            critic_loss = F.mse_loss(returns, value_function)
            critic_loss.backward()
            self.critic_optimizer.step()

            # Test the model.
            if epoch % self.args.test_interval == 0:
                self.policy.eval()
                print('\nTesting')
                rewards = [
                    self.generate_episode(test=True)
                    for _ in range(self.args.test_episodes)
                ]
                rewards_mean, rewards_std = np.mean(rewards), np.std(rewards)
                print('Test Rewards (Mean): %.3f | Test Rewards (Std): %.3f\n'
                      % (rewards_mean, rewards_std))
                self.rewards_data.append([epoch, rewards_mean, rewards_std])
                self.summary_writer.add_scalar('test/rewards_mean', rewards_mean, epoch)
                self.summary_writer.add_scalar('test/rewards_std', rewards_std, epoch)
                self.policy.train()

            # Logging.
            if epoch % self.args.log_interval == 0:
                print('Epoch: {0:05d}/{1:05d} | Policy Loss: {2:.3f} | Value Loss: {3:.3f}'
                      .format(epoch, self.args.num_episodes, policy_loss, critic_loss))
                self.summary_writer.add_scalar('train/policy_loss', policy_loss, epoch)
                self.summary_writer.add_scalar('train/critic_loss', critic_loss, epoch)

            # Save the model.
            if epoch % self.args.save_interval == 0:
                self.save_model(epoch)

        self.save_model(epoch)
        self.summary_writer.close()

    def generate_episode(self, gamma=0.99, test=False, render=False, max_iters=10000):
        '''
        Generates an episode by executing the current policy in the given env.
        Returns:
        - a list of states, indexed by time step
        - a list of actions, indexed by time step
        - a list of cumulative discounted returns, indexed by time step
        '''
        iters = 0
        done = False
        state = self.env.reset()

        # Set video save path if render enabled.
        if render:
            save_path = 'videos/%s/epoch-%s' % (self.environment_name,
                                                self.checkpoint['epoch'])
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            monitor = gym.wrappers.Monitor(self.env, save_path, force=True)

        batches = []
        states = [torch.zeros(84, 84, device=self.device).float()] * 3
        rewards, returns = [], []
        actions, log_probs = [], []

        while not done:
            # Run policy on current state to log probabilities of actions.
            states.append(torch.tensor(preprocess(state),
                                       device=self.device).float().squeeze(0))
            batches.append(torch.stack(states[-4:]))
            action_probs = self.policy.forward(batches[-1].unsqueeze(0)).squeeze(0)

            # Sample action from the log probabilities.
            if test and self.args.det_eval:
                action = torch.argmax(action_probs)
            else:
                action = torch.argmax(
                    torch.distributions.Multinomial(logits=action_probs).sample())
            actions.append(action)
            log_probs.append(action_probs[action])

            # Run simulation with current action to get new state and reward.
            if render:
                monitor.render()
            state, reward, done, _ = self.env.step(action.cpu().numpy())
            rewards.append(reward)

            # Break if the episode takes too long.
            iters += 1
            if iters > max_iters:
                break

        # Save video and close rendering.
        cum_rewards = np.sum(rewards)
        if render:
            monitor.close()
            print('\nCumulative Rewards:', cum_rewards)
            return

        # Return cumulative rewards for test mode.
        if test:
            return cum_rewards

        # Flip rewards from T-1 to 0.
        rewards = np.array(rewards) / self.args.reward_normalizer

        # Compute values in minibatches to bound memory usage.
        values = []
        minibatches = torch.split(torch.stack(batches), 256)
        for minibatch in minibatches:
            values.append(self.critic.forward(minibatch, action=False).squeeze(1))
        values = torch.cat(values)
        discounted_values = values * gamma ** self.args.n

        # Compute the cumulative discounted returns.
        n_step_rewards = np.zeros((1, self.args.n))
        for i in reversed(range(rewards.shape[0])):
            if i + self.args.n >= rewards.shape[0]:
                V_end = 0
            else:
                V_end = discounted_values[i + self.args.n]
            n_step_rewards[0, :-1] = n_step_rewards[0, 1:] * gamma
            n_step_rewards[0, -1] = rewards[i]

            n_step_return = torch.tensor(
                n_step_rewards.sum(), device=self.device).unsqueeze(0) + V_end
            returns.append(n_step_return)

        return torch.stack(returns[::-1]).detach().squeeze(1), \
            torch.stack(log_probs), values.squeeze(), cum_rewards

    def plot(self):
        # Save the plot.
        filename = os.path.join(
            'plots', *self.args.weights_path.split('/')[-2:]).replace('.h5', '.png')
        if not os.path.exists(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))

        # Make error plot with mean, std of rewards.
        data = np.asarray(self.rewards_data)
        plt.errorbar(data[:, 0], data[:, 1], data[:, 2], lw=2.5, elinewidth=1.5,
                     ecolor='grey', barsabove=True, capthick=2, capsize=3)
        plt.title('Cumulative Rewards (Mean/Std) Plot for A3C Algorithm')
        plt.xlabel('Number of Episodes')
        plt.ylabel('Cumulative Rewards')
        plt.grid()
        plt.savefig(filename, dpi=300)
        plt.show()
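# NOTE: the A3C class above is driven by an argparse-style args object. A
# hypothetical invocation; the attribute names are inferred from what the class
# reads, so treat them as assumptions rather than a documented interface:

from types import SimpleNamespace

args = SimpleNamespace(
    random_seed=42, policy_lr=1e-4, critic_lr=1e-4, weights_path='',
    render=False, num_episodes=5000, test_interval=200, test_episodes=20,
    log_interval=100, save_interval=500, reward_normalizer=100.0,
    n=20, det_eval=False)

agent = A3C(args, 'Breakout-v0', train=True)
agent.train()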
def train(rank, args, shared_model, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True
    episode_length = 0

    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)

        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation; .detach() stops the gradient
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].detach() - values[i].detach()
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                log_probs[i] * gae - 0.01 * entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
def train(rank, args, shared_model, optimizer=None):
    mse_loss = torch.nn.MSELoss()
    nll_loss = torch.nn.NLLLoss()

    torch.manual_seed(args.seed + rank)

    env = env_wrapper.create_doom(args.record, outdir=args.outdir)
    num_outputs = env.action_space.n
    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True
    episode_length = 0

    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []
        inverses = []
        forwards = []
        actions = []
        vec_st1s = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model(
                (state.unsqueeze(0), (hx, cx)), icm=False)
            s_t = state
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            # One-hot encode the action for the ICM heads
            oh_action = torch.zeros(1, num_outputs)
            oh_action.scatter_(1, action, 1)
            a_t = oh_action
            actions.append(oh_action)

            state, reward, done, _ = env.step(action.numpy()[0][0])
            state = torch.from_numpy(state)
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            s_t1 = state
            vec_st1, inverse, forward = model(
                (s_t.unsqueeze(0), s_t1.unsqueeze(0), a_t), icm=True)

            # Intrinsic reward: eta * 0.5 * || phi(s_{t+1}) - f(phi(s_t), a_t) ||^2
            reward_intrinsic = args.eta * ((vec_st1 - forward).pow(2)).sum(1) / 2.
            reward_intrinsic = reward_intrinsic.detach().numpy()[0]
            reward += reward_intrinsic

            if done:
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            vec_st1s.append(vec_st1)
            inverses.append(inverse)
            forwards.append(forward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)), icm=False)
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        inverse_loss = 0
        forward_loss = 0
        gae = torch.zeros(1, 1)

        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].detach() - values[i].detach()
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                log_probs[i] * gae - 0.01 * entropies[i]

            # ICM losses: inverse-model cross-entropy and forward-model error
            cross_entropy = -(actions[i] * torch.log(inverses[i] + 1e-15)).sum(1)
            inverse_loss = inverse_loss + cross_entropy
            forward_err = forwards[i] - vec_st1s[i]
            forward_loss = forward_loss + 0.5 * (forward_err.pow(2)).sum(1)

        optimizer.zero_grad()
        # Backprop the ICM losses first, keeping the graph for the A3C losses
        ((1 - args.beta) * inverse_loss + args.beta * forward_loss).backward(retain_graph=True)
        (args.lmbda * (policy_loss + 0.5 * value_loss)).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
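# NOTE: both curiosity-driven workers above use the ICM intrinsic reward of
# Pathak et al., r_t = (eta / 2) * || phi(s_{t+1}) - f(phi(s_t), a_t) ||^2,
# i.e. the forward model's prediction error in feature space. A toy sketch of
# just that computation; shapes and names are illustrative:

import torch

eta = 0.01
phi_next = torch.randn(1, 288)  # features of the observed next state, phi(s_{t+1})
phi_pred = torch.randn(1, 288)  # forward-model prediction, f(phi(s_t), a_t)

# Curiosity bonus: half the squared prediction error, scaled by eta
reward_intrinsic = eta * 0.5 * (phi_next - phi_pred).pow(2).sum(1)
print(reward_intrinsic.item())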