def grads_func(proc_name, net, device, train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]

    agent = ptan.agent.PolicyAgent(
        lambda x: net(x)[0], device=device, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    batch = []
    frame_idx = 0
    writer = SummaryWriter(comment=proc_name)

    with common.RewardTracker(writer, REWARD_BOUND) as tracker:
        with ptan.common.utils.TBMeanTracker(writer, 100) as tb_tracker:
            for exp in exp_source:
                frame_idx += 1
                new_rewards = exp_source.pop_total_rewards()
                if new_rewards and tracker.reward(new_rewards[0], frame_idx):
                    break

                batch.append(exp)
                if len(batch) < GRAD_BATCH:
                    continue

                data = unpack_batch(
                    batch, net, device=device,
                    last_val_gamma=GAMMA ** REWARD_STEPS)
                states_v, actions_t, vals_ref_v = data
                batch.clear()

                net.zero_grad()
                logits_v, value_v = net(states_v)
                loss_value_v = F.mse_loss(
                    value_v.squeeze(-1), vals_ref_v)

                log_prob_v = F.log_softmax(logits_v, dim=1)
                # squeeze the value head output so the advantage stays
                # per-sample instead of broadcasting to a matrix
                adv_v = vals_ref_v - value_v.detach().squeeze(-1)
                log_p_a = log_prob_v[range(GRAD_BATCH), actions_t]
                log_prob_actions_v = adv_v * log_p_a
                loss_policy_v = -log_prob_actions_v.mean()

                prob_v = F.softmax(logits_v, dim=1)
                ent = (prob_v * log_prob_v).sum(dim=1).mean()
                entropy_loss_v = ENTROPY_BETA * ent

                loss_v = entropy_loss_v + loss_value_v + loss_policy_v
                loss_v.backward()

                tb_tracker.track("advantage", adv_v, frame_idx)
                tb_tracker.track("values", value_v, frame_idx)
                tb_tracker.track("batch_rewards", vals_ref_v, frame_idx)
                tb_tracker.track("loss_entropy", entropy_loss_v, frame_idx)
                tb_tracker.track("loss_policy", loss_policy_v, frame_idx)
                tb_tracker.track("loss_value", loss_value_v, frame_idx)
                tb_tracker.track("loss_total", loss_v, frame_idx)

                # clip gradients and ship them to the central process
                nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
                grads = [
                    param.grad.data.cpu().numpy()
                    if param.grad is not None else None
                    for param in net.parameters()
                ]
                train_queue.put(grads)

    # signal the parent that this worker has finished
    train_queue.put(None)
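On the receiving side, the parent process has to drain these gradient lists and apply them to the shared network. The following is a minimal sketch of such a consumer, not the exact main loop paired with grads_func; apply_remote_grads and processes_count are illustrative names, and it assumes one optimizer step per received gradient list.

import torch

def apply_remote_grads(net, optimizer, train_queue, processes_count):
    # Drain gradient lists produced by grads_func; None marks a finished child.
    finished = 0
    while finished < processes_count:
        grads = train_queue.get()
        if grads is None:
            finished += 1
            continue
        # copy each received gradient into the shared network's .grad buffers
        for param, grad in zip(net.parameters(), grads):
            if grad is not None:
                param.grad = torch.from_numpy(grad).to(param.device)
        optimizer.step()
        optimizer.zero_grad()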
selector = ptan.actions.EpsilonGreedyActionSelector(
    epsilon=params['epsilon_start'])
epsilon_tracker = common.EpsilonTracker(selector, params)
agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), selector,
                            device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent, gamma=params['gamma'], steps_count=1)
buffer = ptan.experience.ExperienceReplayBuffer(
    exp_source, buffer_size=params['replay_size'])
optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

frame_idx = 0
eval_states = None
prev_save = 0
save_prefix = None

with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
    while True:
        frame_idx += 1
        # add one new transition to the replay buffer and anneal epsilon
        buffer.populate(1)
        epsilon_tracker.frame(frame_idx)

        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            if reward_tracker.reward(new_rewards[0], frame_idx,
                                     selector.epsilon):
                break

        if len(buffer) < params['replay_initial']:
            continue

        if eval_states is None:
            # fixed set of states kept aside for tracking value estimates
            eval_states = buffer.sample(STATES_TO_EVALUATE)
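The held-out eval_states are typically used to track how the network's value estimates evolve during training. A possible helper for that metric is sketched below; mean_best_qval is a hypothetical name, it assumes eval_states holds ptan experience entries whose .state field is the observation, and any input scaling is assumed to be handled by the environment wrappers or the network itself.

import numpy as np
import torch

@torch.no_grad()
def mean_best_qval(net, eval_states, device="cpu"):
    # mean of the highest Q-value the network assigns to the held-out states
    states = np.stack([np.asarray(t.state) for t in eval_states])
    states_v = torch.as_tensor(states, dtype=torch.float32).to(device)
    qvals_v = net.qvals(states_v)   # same net.qvals() the agent above relies on
    return qvals_v.max(dim=1)[0].mean().item()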
writer = SummaryWriter(comment="-pong-a2c_" + args.name)

net = AtariA2C(envs[0].observation_space.shape,
               envs[0].action_space.n).to(device)
print(net)

agent = ptan.agent.PolicyAgent(lambda x: net(x)[0],
                               apply_softmax=True, device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(
    envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

# passing epsilon because the default tends to make the gradient
# very large which is bad for this method!
optimizer = optim.Adam(net.parameters(),
                       lr=LEARNING_RATE, eps=1e-3)

batch = []

with common.RewardTracker(writer, stop_reward=18) as tracker:
    with ptan.common.utils.TBMeanTracker(writer,
                                         batch_size=10) as tb_tracker:
        for step_idx, exp in enumerate(exp_source):
            batch.append(exp)

            # handle new rewards
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if tracker.reward(new_rewards[0], step_idx):
                    break

            if len(batch) < BATCH_SIZE:
                continue

            states_v, actions_t, vals_ref_v = unpack_batch(batch,
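Both this loop and grads_func call unpack_batch, which is not shown in these listings. Below is a sketch of what such a helper computes, assuming ptan's ExperienceFirstLast entries (state, action, reward, last_state) and the same GAMMA/REWARD_STEPS constants; the exact implementation used here may differ.

import numpy as np
import torch

def unpack_batch(batch, net, device="cpu",
                 last_val_gamma=GAMMA ** REWARD_STEPS):
    # Turn first-last experiences into training tensors: the n-step discounted
    # reward stored in exp.reward is extended by the critic's value of the last
    # state for transitions that did not finish the episode.
    states, actions, rewards = [], [], []
    not_done_idx, last_states = [], []
    for idx, exp in enumerate(batch):
        states.append(np.asarray(exp.state))
        actions.append(int(exp.action))
        rewards.append(exp.reward)
        if exp.last_state is not None:
            not_done_idx.append(idx)
            last_states.append(np.asarray(exp.last_state))
    states_v = torch.FloatTensor(np.stack(states)).to(device)
    actions_t = torch.LongTensor(actions).to(device)
    rewards_np = np.array(rewards, dtype=np.float32)
    if not_done_idx:
        last_states_v = torch.FloatTensor(np.stack(last_states)).to(device)
        last_vals_np = net(last_states_v)[1].data.cpu().numpy()[:, 0]
        rewards_np[not_done_idx] += last_val_gamma * last_vals_np
    vals_ref_v = torch.FloatTensor(rewards_np).to(device)
    return states_v, actions_t, vals_ref_v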
                      env.action_space.n).to(device)
net.share_memory()

optimizer = optim.Adam(net.parameters(),
                       lr=LEARNING_RATE, eps=1e-3)

train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
proc_list = []
# start the child processes that feed experiences into the queue
for _ in range(PROCESSES_COUNT):
    proc = mp.Process(target=data_func,
                      args=(net, device, train_queue))
    proc.start()
    proc_list.append(proc)

batch = []
step_idx = 0

try:
    with common.RewardTracker(writer, stop_reward=REWARD_BOUND) as tracker:
        with common.TBMeanTracker(writer, batch_size=100) as tb_tracker:
            while True:
                train_entry = train_queue.get()
                # TotalReward entries only report finished-episode rewards;
                # everything else is a training experience
                if isinstance(train_entry, TotalReward):
                    if tracker.reward(train_entry.reward, step_idx):
                        break
                    continue

                step_idx += 1
                batch.append(train_entry)
                if len(batch) < BATCH_SIZE:
                    continue

                states_v, actions_v, qvals_v = unpack_batch(
                    batch, net, device)
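The loop above expects two kinds of queue entries from data_func: TotalReward markers for finished episodes and raw experience objects for training. A sketch consistent with that contract is shown below (the exact data_func is not reproduced in these listings); it matches this single-experience variant, whereas the next listing expects children that push already-unpacked tensor chunks instead.

import collections
import numpy as np

TotalReward = collections.namedtuple('TotalReward', field_names='reward')

def data_func(net, device, train_queue):
    # Child process: play with the shared network and stream first-last
    # experiences into the queue; finished-episode rewards are wrapped in
    # TotalReward so the parent can tell them apart from training entries.
    envs = [make_env() for _ in range(NUM_ENVS)]
    agent = ptan.agent.PolicyAgent(lambda x: net(x)[0],
                                   device=device, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
    for exp in exp_source:
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            train_queue.put(TotalReward(reward=np.mean(new_rewards)))
        train_queue.put(exp)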
net.share_memory()

optimizer = optim.Adam(net.parameters(),
                       lr=LEARNING_RATE, eps=1e-3)

train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
data_proc_list = []
for _ in range(PROCESSES_COUNT):
    data_proc = mp.Process(target=data_func,
                           args=(net, device, train_queue))
    data_proc.start()
    data_proc_list.append(data_proc)

batch_states = []
batch_actions = []
batch_vals_ref = []
step_idx = 0
batch_size = 0

try:
    with common.RewardTracker(writer, REWARD_BOUND) as tracker:
        with ptan.common.utils.TBMeanTracker(writer, 100) as tb_tracker:
            while True:
                train_entry = train_queue.get()
                if isinstance(train_entry, TotalReward):
                    if tracker.reward(train_entry.reward, step_idx):
                        break
                    continue

                states_t, actions_t, vals_ref_t = train_entry
                batch_states.append(states_t)
                batch_actions.append(actions_t)
                batch_vals_ref.append(vals_ref_t)
                step_idx += states_t.size()[0]
                batch_size += states_t.size()[0]
                if batch_size < BATCH_SIZE:
                    continue
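Here the children push already-unpacked tensor chunks (states, actions, value references), and the parent only trains once BATCH_SIZE samples have accumulated. Below is a hypothetical continuation, not the exact code: it mirrors the A2C update from grads_func above, concatenating the chunks into one batch and closing the try block with child-process cleanup.

                states_v = torch.cat(batch_states)
                actions_t = torch.cat(batch_actions)
                vals_ref_v = torch.cat(batch_vals_ref)
                batch_states.clear()
                batch_actions.clear()
                batch_vals_ref.clear()
                batch_size = 0

                optimizer.zero_grad()
                logits_v, value_v = net(states_v)
                loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                log_prob_v = F.log_softmax(logits_v, dim=1)
                adv_v = vals_ref_v - value_v.detach().squeeze(-1)
                log_p_a = log_prob_v[range(len(vals_ref_v)), actions_t]
                loss_policy_v = -(adv_v * log_p_a).mean()

                prob_v = F.softmax(logits_v, dim=1)
                entropy_loss_v = ENTROPY_BETA * \
                    (prob_v * log_prob_v).sum(dim=1).mean()

                loss_v = entropy_loss_v + loss_value_v + loss_policy_v
                loss_v.backward()
                nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
                optimizer.step()
finally:
    # make sure child processes are stopped even if training is interrupted
    for p in data_proc_list:
        p.terminate()
        p.join()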