Example #1
def grads_func(proc_name, net, device, train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]
    agent = ptan.agent.PolicyAgent(lambda x: net(x)[0],
                                   device=device,
                                   apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
    batch = []
    frame_idx = 0
    writer = SummaryWriter(comment=proc_name)
    with common.RewardTracker(writer, REWARD_BOUND) as tracker:
        with ptan.common.utils.TBMeanTracker(writer, 100) as tb_tracker:
            for exp in exp_source:
                frame_idx += 1
                new_rewards = exp_source.pop_total_rewards()
                if new_rewards and tracker.reward(new_rewards[0], frame_idx):
                    break
                batch.append(exp)
                if len(batch) < GRAD_BATCH:
                    continue
                data = unpack_batch(batch,
                                    net,
                                    device=device,
                                    last_val_gamma=GAMMA**REWARD_STEPS)
                states_v, actions_t, vals_ref_v = data
                batch.clear()
                net.zero_grad()
                logits_v, value_v = net(states_v)
                loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)
                log_prob_v = F.log_softmax(logits_v, dim=1)
                # squeeze the value head to shape (batch,) so the subtraction
                # does not broadcast into a (batch, batch) matrix
                adv_v = vals_ref_v - value_v.squeeze(-1).detach()
                log_p_a = log_prob_v[range(GRAD_BATCH), actions_t]
                log_prob_actions_v = adv_v * log_p_a
                loss_policy_v = -log_prob_actions_v.mean()
                prob_v = F.softmax(logits_v, dim=1)
                ent = (prob_v * log_prob_v).sum(dim=1).mean()
                entropy_loss_v = ENTROPY_BETA * ent
                loss_v = entropy_loss_v + loss_value_v + \
                         loss_policy_v
                loss_v.backward()
                tb_tracker.track("advantage", adv_v, frame_idx)
                tb_tracker.track("values", value_v, frame_idx)
                tb_tracker.track("batch_rewards", vals_ref_v, frame_idx)
                tb_tracker.track("loss_entropy", entropy_loss_v, frame_idx)
                tb_tracker.track("loss_policy", loss_policy_v, frame_idx)
                tb_tracker.track("loss_value", loss_value_v, frame_idx)
                tb_tracker.track("loss_total", loss_v, frame_idx)
                nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
                # collect the gradients as numpy arrays so they can be
                # shipped to the central process through the queue
                grads = [
                    param.grad.data.cpu().numpy()
                    if param.grad is not None else None
                    for param in net.parameters()
                ]
                train_queue.put(grads)
    train_queue.put(None)
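
The worker above only computes and enqueues gradients; a central process has to pull them off train_queue, accumulate them, and apply them to the shared network. Below is a minimal sketch of such a consumer, assuming the net, optimizer, device and train_queue objects from the surrounding examples; TRAIN_BATCH (how many gradient packets to accumulate per optimizer step) is a hypothetical constant, not a name from the source.

import torch

TRAIN_BATCH = 2  # hypothetical: gradient packets to accumulate per step

step_idx = 0
grad_buffer = None
while True:
    train_entry = train_queue.get()
    if train_entry is None:  # a worker reached the reward bound
        break
    step_idx += 1
    if grad_buffer is None:
        grad_buffer = train_entry
    else:
        # sum gradients arriving from different workers
        for tgt_grad, grad in zip(grad_buffer, train_entry):
            if tgt_grad is not None and grad is not None:
                tgt_grad += grad
    if step_idx % TRAIN_BATCH == 0:
        # copy the accumulated gradients into the shared network and step
        for param, grad in zip(net.parameters(), grad_buffer):
            if grad is not None:
                param.grad = torch.FloatTensor(grad).to(device)
        optimizer.step()
        grad_buffer = None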
Example #2
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), selector, device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params['replay_size'])
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    eval_states = None
    prev_save = 0
    save_prefix = None

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)
            epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon):
                    break
            if len(buffer) < params['replay_initial']:
                continue

            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
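
The held-out eval_states sampled above are typically used to track the mean best Q-value during training. Here is a small helper along those lines, assuming net.qvals(...) returns per-action Q-values (as the agent above suggests) and that the network handles any dtype conversion of raw frames; the helper name and exact metric are illustrative, not the repository's code.

import numpy as np
import torch

def mean_best_qval(eval_batch, net, device):
    # eval_batch is a list of experience samples; take their observations
    states = np.array([np.array(e.state, copy=False) for e in eval_batch],
                      copy=False)
    states_v = torch.tensor(states).to(device)
    with torch.no_grad():
        qvals_v = net.qvals(states_v)
        return qvals_v.max(1)[0].mean().item()

# e.g. periodically:
# writer.add_scalar("values_mean",
#                   mean_best_qval(eval_states, net, device), frame_idx)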
Example #3
    writer = SummaryWriter(comment="-pong-a2c_" + args.name)

    net = AtariA2C(envs[0].observation_space.shape,
                   envs[0].action_space.n).to(device)
    print(net)
    agent = ptan.agent.PolicyAgent(lambda x: net(x)[0],
                                   apply_softmax=True,
                                   device=device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
    # use a larger Adam eps: with the default value the updates tend to become
    # very large, which destabilizes this method
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, eps=1e-3)

    batch = []
    with common.RewardTracker(writer, stop_reward=18) as tracker:
        with ptan.common.utils.TBMeanTracker(writer,
                                             batch_size=10) as tb_tracker:
            for step_idx, exp in enumerate(exp_source):
                batch.append(exp)

                # handle new rewards
                new_rewards = exp_source.pop_total_rewards()
                if new_rewards:
                    if tracker.reward(new_rewards[0], step_idx):
                        break

                if len(batch) < BATCH_SIZE:
                    continue

                states_v, actions_t, vals_ref_v = unpack_batch(batch,
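
Examples #1, #3 and #4 all rely on an unpack_batch helper that is not shown here. Below is a minimal sketch of what such a helper does for the actor-critic case, assuming ExperienceFirstLast entries with state, action, reward and last_state fields and a network that returns (logits, value); the body is an assumption, and the real helper in the source repository may differ.

import numpy as np
import torch

def unpack_batch(batch, net, device="cpu",
                 last_val_gamma=GAMMA ** REWARD_STEPS):
    states, actions, rewards = [], [], []
    not_done_idx, last_states = [], []
    for idx, exp in enumerate(batch):
        states.append(np.array(exp.state, copy=False))
        actions.append(int(exp.action))
        rewards.append(exp.reward)
        if exp.last_state is not None:  # episode did not end in this rollout
            not_done_idx.append(idx)
            last_states.append(np.array(exp.last_state, copy=False))
    states_v = torch.FloatTensor(np.array(states, copy=False)).to(device)
    actions_t = torch.LongTensor(actions).to(device)
    rewards_np = np.array(rewards, dtype=np.float32)
    if not_done_idx:
        last_states_v = torch.FloatTensor(
            np.array(last_states, copy=False)).to(device)
        with torch.no_grad():
            last_vals_v = net(last_states_v)[1]  # value head
        # bootstrap with the discounted value of the state after REWARD_STEPS
        rewards_np[not_done_idx] += last_val_gamma * \
            last_vals_v.squeeze(-1).cpu().numpy()
    vals_ref_v = torch.FloatTensor(rewards_np).to(device)
    return states_v, actions_t, vals_ref_v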
Example #4
            env.action_space.n).to(device)
    net.share_memory()

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, eps=1e-3)

    train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
    proc_list = []
    for _ in range(PROCESSES_COUNT):
        proc = mp.Process(target=data_func, args=(net, device, train_queue))
        proc.start()
        proc_list.append(proc)

    batch = []
    step_idx = 0
    try:
        with common.RewardTracker(writer, stop_reward=REWARD_BOUND) as tracker:
            with common.TBMeanTracker(writer, batch_size=100) as tb_tracker:
                while True:
                    train_entry = train_queue.get()
                    if isinstance(train_entry, TotalReward):
                        if tracker.reward(train_entry.reward, step_idx):
                            break
                        continue

                    step_idx += 1
                    batch.append(train_entry)
                    if len(batch) < BATCH_SIZE:
                        continue

                    states_v, actions_v, qvals_v = unpack_batch(
                        batch, net, device)
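
Examples #4 and #5 both spawn data_func child processes, which are not shown. Here is a sketch of what such a worker could look like for Example #4, modeled on grads_func from Example #1: it plays with the shared network and pushes raw experience plus TotalReward markers into the queue (Example #5's variant instead unpacks batches into tensors inside the worker before sending them). make_env, NUM_ENVS, GAMMA and REWARD_STEPS are assumed to be the same module-level names used in Example #1.

import collections
import numpy as np
import ptan

TotalReward = collections.namedtuple('TotalReward', field_names='reward')

def data_func(net, device, train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]
    agent = ptan.agent.PolicyAgent(lambda x: net(x)[0],
                                   device=device, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
    for exp in exp_source:
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            # report finished episodes so the main process can track progress
            train_queue.put(TotalReward(reward=np.mean(new_rewards)))
        train_queue.put(exp)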
Example #5
    net.share_memory()
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, eps=1e-3)

    # start child processes that gather experience and send ready-made
    # training tensors (or TotalReward markers) through the queue
    train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
    data_proc_list = []
    for _ in range(PROCESSES_COUNT):
        data_proc = mp.Process(target=data_func,
                               args=(net, device, train_queue))
        data_proc.start()
        data_proc_list.append(data_proc)

    batch_states = []
    batch_actions = []
    batch_vals_ref = []
    step_idx = 0
    batch_size = 0
    try:
        with common.RewardTracker(writer, REWARD_BOUND) as tracker:
            with ptan.common.utils.TBMeanTracker(writer, 100) as tb_tracker:
                while True:
                    train_entry = train_queue.get()
                    # episode-end markers are only used for reward tracking
                    if isinstance(train_entry, TotalReward):
                        if tracker.reward(train_entry.reward, step_idx):
                            break
                        continue
                    # otherwise the entry is a tuple of training tensors
                    states_t, actions_t, vals_ref_t = train_entry
                    batch_states.append(states_t)
                    batch_actions.append(actions_t)
                    batch_vals_ref.append(vals_ref_t)
                    step_idx += states_t.size()[0]
                    batch_size += states_t.size()[0]
                    if batch_size < BATCH_SIZE:
                        continue
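
Once batch_size reaches BATCH_SIZE, the per-worker tensors are typically concatenated and a standard advantage actor-critic update follows, mirroring the loss structure of Example #1. Below is a sketch of that step (it would sit inside the while True loop, right after the final continue); constants such as ENTROPY_BETA and CLIP_GRAD are assumed to match Example #1, and the exact bookkeeping may differ from the source.

import torch
import torch.nn.functional as F
import torch.nn.utils as nn_utils

states_v = torch.cat(batch_states)
actions_t = torch.cat(batch_actions)
vals_ref_v = torch.cat(batch_vals_ref)
batch_states.clear()
batch_actions.clear()
batch_vals_ref.clear()
batch_size = 0

optimizer.zero_grad()
logits_v, value_v = net(states_v)
# value loss: regress the value head onto the n-step returns
loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

# policy gradient weighted by the advantage
log_prob_v = F.log_softmax(logits_v, dim=1)
adv_v = vals_ref_v - value_v.squeeze(-1).detach()
log_prob_actions_v = adv_v * log_prob_v[range(len(actions_t)), actions_t]
loss_policy_v = -log_prob_actions_v.mean()

# entropy bonus keeps the policy from collapsing too early
prob_v = F.softmax(logits_v, dim=1)
entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()

loss_v = loss_policy_v + entropy_loss_v + loss_value_v
loss_v.backward()
nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
optimizer.step()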