Example #1
def calc_loss(batch,
              batch_weights,
              act_net,
              crt_net,
              tgt_act_net,
              tgt_crt_net,
              device='cpu'):
    states, actions, rewards, dones_mask, last_states = utils.unpack_batch(
        batch, device)
    batch_weights = torch.tensor(batch_weights).to(device)

    # critic loss
    crt_distr = crt_net(states, actions)
    last_act = tgt_act_net.target_model(last_states)
    last_distr = F.softmax(tgt_crt_net.target_model(last_states, last_act),
                           dim=1)
    proj_distr = distr_projection(last_distr,
                                  rewards,
                                  dones_mask,
                                  gamma=GAMMA**REWARD_STEPS,
                                  device=device)
    prob_distr = -F.log_softmax(crt_distr, dim=1) * proj_distr
    critic_loss = prob_distr.sum(dim=1).mean()
    td_errors = prob_distr.sum(dim=1) * batch_weights

    # actor loss
    cur_actions = act_net(states)
    crt_distr = crt_net(states, cur_actions)
    actor_loss = -crt_net.distr_to_q(crt_distr)

    return actor_loss.mean(), critic_loss, td_errors + 1e-5
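
A hedged sketch of how this loss could be driven from a prioritized replay buffer. The buffer API (sample / update_priorities), the two optimizers, BETA and BATCH_SIZE are assumptions layered on top of the example, not part of it:

# hypothetical training step around calc_loss; every name below except calc_loss
# and the networks is an assumption
batch, batch_indices, batch_weights = buffer.sample(BATCH_SIZE, BETA)
actor_loss_v, critic_loss_v, prios_v = calc_loss(
    batch, batch_weights, act_net, crt_net, tgt_act_net, tgt_crt_net, device=device)

crt_opt.zero_grad()
critic_loss_v.backward()
crt_opt.step()

act_opt.zero_grad()
actor_loss_v.backward()
act_opt.step()

# the third return value becomes the new per-sample priorities
buffer.update_priorities(batch_indices, prios_v.data.cpu().numpy())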
Example #2
def calc_loss_prio(batch,
                   batch_weights,
                   _net,
                   _target_net,
                   gamma,
                   _device="cpu"):
    states, actions, rewards, dones, next_states = utils.unpack_batch(batch)

    states_v = torch.tensor(states).to(_device)
    actions_v = torch.tensor(actions).to(_device)
    rewards_v = torch.tensor(rewards).to(_device)
    done_mask = torch.BoolTensor(dones).to(_device)
    batch_weights_v = torch.tensor(batch_weights).to(_device)

    state_action_values = _net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)

    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(_device)
        next_state_values = _target_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0
        expected_state_action_values = next_state_values.detach() * gamma + rewards_v

    losses_v = batch_weights_v * (state_action_values -
                                  expected_state_action_values)**2
    return losses_v.mean(), (losses_v + 1e-5).data.cpu().numpy()
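
For reference, the value computed above is the importance-weighted squared one-step TD error, and the second return value becomes the new sample priorities (the 1e-5 keeps them strictly positive). In LaTeX form:

L_i = w_i \left( Q_\theta(s_i, a_i) - \big( r_i + \gamma \, (1 - d_i) \max_{a'} Q_{\bar{\theta}}(s'_i, a') \big) \right)^2,
\qquad p_i = L_i + 10^{-5}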
Example #3
def data_func(_net, _device, _train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]
    agent = ptan.agent.PolicyAgent(lambda x: _net(x)[0],
                                   device=_device,
                                   apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
    micro_batch = []

    for exp in exp_source:
        new_rewards = exp_source.pop_total_rewards()

        if new_rewards:
            data = TotalReward(reward=np.mean(new_rewards))
            _train_queue.put(data)

        micro_batch.append(exp)
        if len(micro_batch) < MICRO_BATCH_SIZE:
            continue

        data = utils.unpack_batch(micro_batch,
                                  _net,
                                  _device=_device,
                                  last_val_gamma=GAMMA**REWARD_STEPS)
        _train_queue.put(data)
        micro_batch.clear()
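
The actor-critic examples call a utils.unpack_batch variant that also takes the network and a last_val_gamma; its body is not shown in this listing. A minimal sketch of what such a helper typically does in the discrete-action case (the name, and the assumption that the net returns a (logits, value) pair, are mine):

import numpy as np
import torch


def unpack_batch_a2c_sketch(batch, net, last_val_gamma, device='cpu'):
    # Turn ExperienceFirstLast transitions into tensors and bootstrap the
    # non-terminal tails with the critic's value of the last state.
    states, actions, rewards = [], [], []
    not_done_idx, last_states = [], []
    for idx, exp in enumerate(batch):
        states.append(np.asarray(exp.state))
        actions.append(int(exp.action))
        rewards.append(exp.reward)
        if exp.last_state is not None:
            not_done_idx.append(idx)
            last_states.append(np.asarray(exp.last_state))
    states_v = torch.FloatTensor(np.asarray(states)).to(device)
    actions_t = torch.LongTensor(actions).to(device)
    rewards_np = np.array(rewards, dtype=np.float32)
    if not_done_idx:
        last_states_v = torch.FloatTensor(np.asarray(last_states)).to(device)
        last_vals_v = net(last_states_v)[1]  # assumes net(x) -> (logits, value)
        rewards_np[not_done_idx] += last_val_gamma * last_vals_v.data.cpu().numpy()[:, 0]
    ref_vals_v = torch.FloatTensor(rewards_np).to(device)
    return states_v, actions_t, ref_vals_v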
Example #4
def calc_loss(batch, _net, _target_net, gamma, _device="cpu"):
    states, actions, rewards, dones, next_states = utils.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(_device)
    actions_v = torch.tensor(actions).to(_device)
    next_states_v = torch.tensor(next_states).to(_device)

    next_distr_v, next_qvals_v = _target_net.both(next_states_v)
    next_acts = next_qvals_v.max(1)[1].data.cpu().numpy()
    next_distr = _target_net.apply_softmax(next_distr_v)
    next_distr = next_distr.data.cpu().numpy()

    next_best_distr = next_distr[range(batch_size), next_acts]
    dones = dones.astype(bool)

    proj_distr = dqn_extra.distr_projection(next_best_distr, rewards, dones,
                                            gamma)

    distr_v = _net(states_v)
    sa_vals = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(sa_vals, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(_device)

    loss_v = -state_log_sm_v * proj_distr_v
    return loss_v.sum(dim=1).mean()
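
dqn_extra.distr_projection itself is not shown above. A compact numpy sketch of the categorical (C51) Bellman projection such a helper is expected to perform might look like this; the support bounds and atom count are assumptions:

import numpy as np


def distr_projection_sketch(next_distr, rewards, dones, gamma,
                            v_min=-10.0, v_max=10.0, n_atoms=51):
    # Shift and discount every atom of the support, clip it back into
    # [v_min, v_max] and spread its probability over the two nearest atoms.
    batch_size = len(rewards)
    delta_z = (v_max - v_min) / (n_atoms - 1)
    proj_distr = np.zeros((batch_size, n_atoms), dtype=np.float32)
    for atom in range(n_atoms):
        # terminal transitions collapse the whole distribution onto the reward
        tz_j = np.clip(rewards + (1.0 - dones) * gamma * (v_min + atom * delta_z),
                       v_min, v_max)
        b_j = (tz_j - v_min) / delta_z
        lo = np.floor(b_j).astype(np.int64)
        hi = np.ceil(b_j).astype(np.int64)
        eq_mask = (hi == lo)
        proj_distr[eq_mask, lo[eq_mask]] += next_distr[eq_mask, atom]
        ne_mask = ~eq_mask
        proj_distr[ne_mask, lo[ne_mask]] += next_distr[ne_mask, atom] * (hi - b_j)[ne_mask]
        proj_distr[ne_mask, hi[ne_mask]] += next_distr[ne_mask, atom] * (b_j - lo)[ne_mask]
    return proj_distr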
Example #5
def calc_loss_double_dqn(batch,
                         _net,
                         _target_net,
                         gamma,
                         _device="cpu",
                         double=True):
    states, actions, rewards, dones, next_states = utils.unpack_batch(batch)

    states_v = torch.tensor(states).to(_device)
    actions_v = torch.tensor(actions).to(_device)
    rewards_v = torch.tensor(rewards).to(_device)
    done_mask = torch.BoolTensor(dones).to(_device)

    actions_v = actions_v.unsqueeze(-1)
    state_action_vals = _net(states_v).gather(1, actions_v)
    state_action_vals = state_action_vals.squeeze(-1)

    with torch.no_grad():
        next_states_v = torch.tensor(next_states).to(_device)

        if double:
            next_state_acts = _net(next_states_v).max(1)[1]
            next_state_acts = next_state_acts.unsqueeze(-1)
            next_state_vals = _target_net(next_states_v).gather(
                1, next_state_acts).squeeze(-1)
        else:
            next_state_vals = _target_net(next_states_v).max(1)[0]

        next_state_vals[done_mask] = 0.0
        exp_sa_vals = next_state_vals.detach() * gamma + rewards_v

    return nn.MSELoss()(state_action_vals, exp_sa_vals)
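
The only thing the double flag changes is the bootstrap target: with double=True the online net selects the next action and the target net evaluates it; otherwise the target net does both, as in standard DQN. In LaTeX form:

y_i^{\mathrm{DQN}} = r_i + \gamma \, (1 - d_i) \max_{a'} Q_{\bar{\theta}}(s'_i, a'),
\qquad
y_i^{\mathrm{Double}} = r_i + \gamma \, (1 - d_i) \, Q_{\bar{\theta}}\big(s'_i, \arg\max_{a'} Q_{\theta}(s'_i, a')\big)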
Example #6
def calc_double_dqn_loss(batch, net, tgt_net, gamma, device='cpu'):
    """
    Loss function implementation of DeepMind paper.
    
    [Deep Reinforcement Learning with Double Q-Learning
    ([3] van Hasselt, Guez, and Silver, 2015)]
    """
    states, actions, rewards, dones, last_states = utils.unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    rewards_v = torch.tensor(rewards).to(device)

    q_state_action_v = net(states_v)[range(len(actions)), actions]
    with torch.no_grad():
        last_states_v = torch.tensor(last_states).to(device)
        next_actions_v = net(last_states_v).argmax(dim=1)
        next_q_state_action_v = tgt_net.target_model(last_states_v)[
            range(len(actions)), next_actions_v]
        next_q_state_action_v[dones] = 0.0
    exp_state_action_v = rewards_v + gamma * next_q_state_action_v
    return F.mse_loss(q_state_action_v, exp_state_action_v)
Example #7
def main():
    # some setup
    mp.set_start_method('spawn')
    # gym.logger.set_level(40)

    # writer
    timestr = time.strftime("%Y%m%d-%H%M%S")
    if LOAD_MODEL:
        name = f'runs/{NAME}_a3c_continued_{timestr}'
    else:
        name = f'runs/{NAME}_a3c_{timestr}'
    writer = SummaryWriter(name)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Using:', device)

    env = make_env()
    obs_shape = env.observation_space.shape
    print('Observation shape:', obs_shape)
    act_space = env.action_space.n
    print('Action space:', act_space)

    if LOAD_MODEL:
        net = torch.load(LOAD_MODEL)
        print('Model loaded from:', LOAD_MODEL)
    else:
        net = ModelA3C(obs_shape, act_space)
    net = net.to(device)
    env.close()  # our env creates new actors that we don't need, we erase them here
    net.share_memory()  # enabled by default for CUDA, but needs to be enabled explicitly for CPU

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, eps=1e-3)

    train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
    data_proc_list = []

    for _ in range(PROCESSES_COUNT):
        data_proc = mp.Process(target=data_func,
                               args=(net, device, train_queue))
        data_proc.start()
        data_proc_list.append(data_proc)

    batch = []
    time_step = 0

    # add current hyperparameters to TensorBoard
    hparams = {
        'gamma': GAMMA,
        'lr': LEARNING_RATE,
        'entropy_beta': ENTROPY_BETA,
        'batch_size': BATCH_SIZE,
        'steps_count': STEPS_COUNT
    }
    if DO_CLIP_GRAD:
        hparams['clip_grad_threshold'] = CLIP_GRAD
    writer.add_hparams(hparams, {})

    try:
        start_time = time.time()
        print(f'Training Started - {datetime.datetime.now()}')
        with tracking.RewardTracker(writer,
                                    stop_reward=REWARD_BOUNDRY) as tracker:
            with tracking.TBMeanTracker(writer, batch_size=100) as tb_tracker:
                while True:
                    train_entry = train_queue.get()
                    if isinstance(train_entry, TotalReward):
                        if tracker.reward(train_entry.reward, time_step):
                            break
                        continue

                    time_step += 1
                    batch.append(train_entry)
                    if len(batch) < BATCH_SIZE:
                        continue

                    states_v, actions_t, vals_ref_v = unpack_batch(
                        batch,
                        net,
                        last_val_gamma=GAMMA**STEPS_COUNT,
                        device=device)
                    batch.clear()

                    optimizer.zero_grad()
                    logits_v, value_v = net(states_v)

                    loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                    log_prob_v = F.log_softmax(logits_v, dim=1)
                    adv_v = vals_ref_v - value_v.detach()
                    log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE),
                                                            actions_t]

                    loss_policy_v = -log_prob_actions_v.mean()
                    prob_v = F.softmax(logits_v, dim=1)
                    entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(
                        dim=1).mean()

                    loss_v = entropy_loss_v + loss_value_v + loss_policy_v
                    loss_v.backward()
                    nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
                    optimizer.step()

                    tb_tracker.track("advantage", adv_v, time_step)
                    tb_tracker.track("values", value_v, time_step)
                    tb_tracker.track("batch_rewards", vals_ref_v, time_step)
                    tb_tracker.track("loss_entropy", entropy_loss_v, time_step)
                    tb_tracker.track("loss_policy", loss_policy_v, time_step)
                    tb_tracker.track("loss_value", loss_value_v, time_step)
                    tb_tracker.track("loss_total", loss_v, time_step)

        # save model when training ends
        print(
            f'\nConvergence reached! Solved in {round(time.time() - start_time, 3)} seconds'
        )
        save_path = f'models/model_a3c_{timestr}.pt'
        torch.save(net.cpu(), save_path)
        print('Saved model to:', save_path)

    except KeyboardInterrupt:
        print('Stopped by the user')
        save_path = f'models/model_a3c_stopped_{timestr}.pt'
        torch.save(net.cpu(), save_path)
        print('Saved model to:', save_path)

    except Exception as e:
        print('Training Crashed:')
        traceback.print_exc()
        save_path = f'models/model_a3c_error_{timestr}.pt'
        torch.save(net.cpu(), save_path)
        print('Saved model to:', save_path)

    finally:
        # writer.flush()
        for p in data_proc_list:
            p.terminate()
            p.join()

        torch.cuda.empty_cache()
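
The loop above filters TotalReward messages that the workers (see data_func in Example #3) push through the same queue as training data. Its definition is not shown; under the assumption that it is a one-field namedtuple shared by both sides, it would look like:

import collections

# assumed shared message type: workers report finished-episode rewards through
# the training queue, and main() routes them to the reward tracker by isinstance()
TotalReward = collections.namedtuple('TotalReward', field_names='reward')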
Example #8
def grads_func(_proc_name, _net, _device, _train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]

    agent = ptan.agent.PolicyAgent(lambda x: _net(x)[0], device=_device, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    _batch = []
    frame_idx = 0
    writer = SummaryWriter(comment=_proc_name)

    with utils.RewardTracker(writer, REWARD_BOUND) as tracker:
        with ptan.common.utils.TBMeanTracker(writer, 100) as tb_tracker:
            for exp in exp_source:
                frame_idx += 1
                new_rewards = exp_source.pop_total_rewards()

                if new_rewards and tracker.reward(new_rewards[0], frame_idx):
                    break

                _batch.append(exp)
                if len(_batch) < GRAD_BATCH:
                    continue

                data = utils.unpack_batch(_batch, _net, device=_device, last_val_gamma=GAMMA ** REWARD_STEPS)
                states_v, actions_t, vals_ref_v = data

                _batch.clear()

                _net.zero_grad()
                logits_v, value_v = _net(states_v)
                loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                log_prob_v = F.log_softmax(logits_v, dim=1)
                adv_v = vals_ref_v - value_v.detach()
                log_p_a = log_prob_v[range(GRAD_BATCH), actions_t]
                log_prob_actions_v = adv_v * log_p_a
                loss_policy_v = -log_prob_actions_v.mean()

                prob_v = F.softmax(logits_v, dim=1)
                ent = (prob_v * log_prob_v).sum(dim=1).mean()
                entropy_loss_v = ENTROPY_BETA * ent

                loss_v = entropy_loss_v + loss_value_v + loss_policy_v
                loss_v.backward()

                tb_tracker.track("advantage", adv_v, frame_idx)
                tb_tracker.track("values", value_v, frame_idx)
                tb_tracker.track("batch_rewards", vals_ref_v, frame_idx)
                tb_tracker.track("loss_entropy", entropy_loss_v, frame_idx)
                tb_tracker.track("loss_policy", loss_policy_v, frame_idx)
                tb_tracker.track("loss_value", loss_value_v, frame_idx)
                tb_tracker.track("loss_total", loss_v, frame_idx)

                nn_utils.clip_grad_norm_(_net.parameters(), CLIP_GRAD)
                grads = [
                    param.grad.data.cpu().numpy()
                    if param.grad is not None else None
                    for param in _net.parameters()
                ]
                _train_queue.put(grads)

    _train_queue.put(None)
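
grads_func only produces gradients; the consuming side is not shown in this listing. A hedged sketch of how a central process might apply those gradient lists to the shared net (train_queue, net, optimizer and device are assumed to be set up as in the surrounding examples):

# hypothetical consumer loop for the gradient lists pushed by grads_func
while True:
    train_entry = train_queue.get()
    if train_entry is None:  # a worker reached the reward bound and quit
        break
    # copy the worker's gradients into the shared network, then take a step
    for param, grad in zip(net.parameters(), train_entry):
        if grad is not None:
            param.grad = torch.tensor(grad).to(device)
    optimizer.step()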
Example #9
def main():
    # some setup
    mp.set_start_method('spawn')
    gym.logger.set_level(40)

    # writer
    timestr = time.strftime("%Y%m%d-%H%M%S")
    if LOAD_MODEL:
        name = f'runs/{NAME}_a3c_continued_{timestr}'
    else:
        name = f'runs/{NAME}_a3c_{timestr}'
    writer = SummaryWriter(name)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Using:', device)
    
    env = make_env()
    obs_shape = env.observation_space.shape
    print('Observation shape:', obs_shape)
    act_space = env.action_space.shape
    print('Action space:', act_space)

    if LOAD_MODEL:
        net = torch.load(LOAD_MODEL)
        print('Model loaded from:', LOAD_MODEL)
    else:
        net = ModelA3C(obs_shape[0], act_space[0])

    net = net.to(device)
    env.close()  # our env creates new actors that we don't need, we erase them here
    test_env = make_env()  # env to pass to the testing function
    net.share_memory()  # enabled by default for CUDA, but needs to be enabled explicitly for CPU

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
    data_proc_list = []

    for _ in range(PROCESSES_COUNT):
        data_proc = mp.Process(target=data_func, args=(net, device, train_queue))
        data_proc.start()
        data_proc_list.append(data_proc)
    
    batch = []
    best_reward = None
    time_step = 0

    # add current hyperparameters to TensorBoard
    hparams = {
        'gamma': GAMMA,
        'lr': LEARNING_RATE,
        'entropy_beta': ENTROPY_BETA,
        'batch_size': BATCH_SIZE,
        'steps_count': STEPS_COUNT
    }
    writer.add_hparams(hparams, {})
    
    try:
        start_time = time.time()
        print(f'Training Started - {datetime.datetime.now()}')
        with tracking.RewardTracker(writer) as tracker:
            with tracking.TBMeanTracker(writer, batch_size=10) as tb_tracker:
                while True:
                    # Tracking
                    train_entry = train_queue.get()
                    if isinstance(train_entry, RewardSteps):
                        rewards_steps = train_entry.reward
                        rewards, steps = zip(*rewards_steps)
                        tb_tracker.track('episode_steps', steps[0], time_step)
                        tracker.reward(rewards[0], time_step)
                        continue  # wrong type, we don't want total rewards in our batch

                    time_step += 1
                    
                    # Testing and updating the best model
                    if time_step % TEST_ITERS == 0:
                        ts = time.time()
                        rewards, steps = test_net(net, test_env, device=device)
                        msg_str = "Test done in %.2f sec, reward %.3f, steps %d" % (time.time() - ts, rewards, steps)
                        if best_reward is not None:
                            msg_str += f' Current Best {round(best_reward, 3)}'
                        print(msg_str)
                        writer.add_scalar("test_reward", rewards, time_step)
                        writer.add_scalar("test_steps", steps, time_step)
                        if best_reward is None or best_reward < rewards:
                            if best_reward is not None:
                                print("Best reward updated: %.3f -> %.3f" % (best_reward, rewards))
                                # name = "best_%+.3f_%d.dat" % (rewards, time_step)
                                save_path = f'models/best_model_a3c_{timestr}.pt'
                                # fname = os.path.join(save_path, name)
                                torch.save(net, save_path)
                            best_reward = rewards

                    
                    batch.append(train_entry)
                    if len(batch) < BATCH_SIZE:
                        continue
                    
                    # Training
                    states_v, actions_v, vals_ref_v = unpack_batch(batch, net, last_val_gamma=GAMMA**STEPS_COUNT, device=device)
                    batch.clear()
                    # print('batch', states_v.shape, actions_v.shape, vals_ref_v.shape)

                    optimizer.zero_grad()
                    mu_v, var_v, value_v = net(states_v)
                    # print('net', mu_v.shape, var_v.shape, value_v.shape)
                    loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                    adv_v = vals_ref_v.unsqueeze(dim=-1) - value_v.detach()
                    log_prob_v = adv_v * calc_logprob(mu_v, var_v, actions_v)  # .unsqueeze(-1))
                    loss_policy_v = -log_prob_v.mean()
                    entropy_loss_v = ENTROPY_BETA * (-(torch.log(2*math.pi*var_v) + 1)/2).mean()

                    loss_v = loss_policy_v + entropy_loss_v + loss_value_v
                    loss_v.backward()
                    optimizer.step()

                    tb_tracker.track("advantage", adv_v, time_step)
                    tb_tracker.track("values", value_v, time_step)
                    tb_tracker.track("batch_rewards", vals_ref_v, time_step)
                    tb_tracker.track("loss_entropy", entropy_loss_v, time_step)
                    tb_tracker.track("loss_policy", loss_policy_v, time_step)
                    tb_tracker.track("loss_value", loss_value_v, time_step)
                    tb_tracker.track("loss_total", loss_v, time_step)
                    
        # save model when training ends
        print(f'\nConvergence reached! Solved in {round(time.time() - start_time, 3)} seconds')
        save_path = f'models/model_a3c_{timestr}.pt'
        torch.save(net.cpu(), save_path)
        print('Saved model to:', save_path)
    
    except KeyboardInterrupt:
        print('Stopped by the user')
        save_path = f'models/model_a3c_stopped_{timestr}.pt'
        torch.save(net.cpu(), save_path)
        print('Saved model to:', save_path)

    except Exception as e:
        print('Training Crashed:')
        traceback.print_exc()
        save_path = f'models/model_a3c_error_{timestr}.pt'
        torch.save(net.cpu(), save_path)
        print('Saved model to:', save_path)

    finally:
        # writer.flush()
        for p in data_proc_list:
            p.terminate()
            p.join()
        
        torch.cuda.empty_cache()
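
calc_logprob used in the policy loss above is not defined in this listing. A minimal Gaussian log-density helper consistent with how it is called (mean, variance, actions) might look like this; the clamp floor is an assumption for numerical safety:

import math

import torch


def calc_logprob(mu_v, var_v, actions_v):
    # element-wise log of a Gaussian density N(action | mu, var)
    p1 = -((mu_v - actions_v) ** 2) / (2 * var_v.clamp(min=1e-3))
    p2 = -torch.log(torch.sqrt(2 * math.pi * var_v))
    return p1 + p2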