Code example #1
File: ppo.py  Project: frangipane/dm_construction
def ppo(task,
        actor_critic=model.ActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        lr=3e-4,
        v_loss_coeff=0.5,
        train_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10,
        wrapper_type="continuous_absolute",
        log_wandb=False):
    """
    Proximal Policy Optimization (by clipping), 

    with early stopping based on approximate KL

    Args:
        task (str): Name of the dm_construction task, passed to
            ``dm_construction.get_environment`` to build the environment.

        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================


        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 

        lr (float): Learning rate for the shared Adam optimizer over the
            actor-critic parameters.

        v_loss_coeff (float): Coefficient on the value-function loss in the
            combined objective ``loss_pi + v_loss_coeff * loss_v``.

        train_iters (int): Maximum number of gradient descent steps to take
            on the combined loss per epoch. (Early stopping based on KL may
            cause fewer steps to be taken.)

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        wrapper_type (str): dm_construction environment wrapper to use; only
            ``"continuous_absolute"`` is currently supported.

        log_wandb (bool): Whether to log metrics and model checkpoints to
            Weights & Biases.

    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = dm_construction.get_environment(task, wrapper_type=wrapper_type)
    obs_dim = env.observation_spec().shape
    if wrapper_type == "continuous_absolute":
        act_dim = 4  # for continuous absolute action space
    else:
        raise NotImplementedError

    # Create actor-critic module
    ac = actor_critic(env.observation_spec(), env.action_spec(), **ac_kwargs)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = count_vars(ac.ac)
    logger.log(f"\nNumber of parameters: \t {var_counts}")

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    def compute_loss(data):
        obs, act, adv, logp_old, ret = (data['obs'], data['act'], data['adv'],
                                        data['logp'], data['ret'])
        pi, v, logp = ac.ac(obs, act)

        # value loss (just MSE)
        loss_v = ((v - ret)**2).mean()

        # policy loss
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # useful extra info re: policy
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_v, loss_pi, pi_info

    # Set up optimizers for policy and value function
    optimizer = Adam(ac.ac.parameters(), lr=lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        v_l_old, pi_l_old, pi_info_old = compute_loss(data)
        pi_l_old = pi_l_old.item()
        v_l_old = v_l_old.item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_iters):
            optimizer.zero_grad()
            loss_v, loss_pi, pi_info = compute_loss(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    f'Early stopping at step {i} due to reaching max kl.')
                break

            loss = loss_pi + loss_v * v_loss_coeff
            loss.backward()
            mpi_avg_grads(ac.ac)  # average grads across MPI processes
            optimizer.step()

        logger.store(StopIter=i)

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    timestep, ep_ret, ep_len = env.reset(difficulty=0), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        encountered_terminal = False
        for t in range(local_steps_per_epoch):
            # assumes obs is an rgb array: rescale to [0, 1]
            o = timestep.observation / 255.0

            a, v, logp = ac.step(o)

            next_timestep = env.step(ac.action_to_dict(a, rescale=True))
            # In dm_env, the reward and termination signal for the action just
            # taken are carried by the timestep returned from env.step().
            r = next_timestep.reward
            d = next_timestep.last()
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # TODO debugging
            logger.store(AHor=a[0])
            logger.store(AVer=a[1])
            logger.store(ASel=a[3])

            # Update obs (critical!)
            timestep = next_timestep

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print(
                        f'Warning: trajectory cut off by epoch at {ep_len} steps.',
                        flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(timestep.observation / 255.0)
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished.
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    encountered_terminal = True
                timestep, ep_ret, ep_len = env.reset(difficulty=0), 0, 0

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        if encountered_terminal:
            # Note, if local_steps_per_epoch is too small so no terminal state
            # has been encountered, then ep_ret and ep_len will not
            # be stored before call to log_tabular, resulting in error.
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)

        # TODO debugging
        logger.log_tabular('AHor', with_min_and_max=True)
        logger.log_tabular('AVer', with_min_and_max=True)
        logger.log_tabular('ASel', with_min_and_max=True)

        # Save model
        if (epoch % save_freq == 0 and epoch > 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

            if proc_id() == 0 and log_wandb:
                # Save the model parameters to wandb every save_freq epoch
                # instead of waiting till the end
                state = {
                    'epoch': epoch,
                    'ac_state_dict': ac.ac.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }
                # output the model in the wandb.run.dir to avoid problems
                # syncing the model in the cloud with wandb's files
                state_fname = os.path.join(wandb.run.dir, "state_dict.pt")
                torch.save(state, state_fname)

        if proc_id() == 0 and log_wandb:
            wandb.log(logger.log_current_row, step=epoch)
        logger.dump_tabular()
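
The PPO loop above stores transitions in a `PPOBuffer` and calls `buf.finish_path(v)` and `buf.get()`, but the buffer itself is imported from the project and not shown in this listing. The following is a minimal sketch of what such a GAE-lambda buffer typically looks like; the class name matches the call sites above, but the body is an assumption, not the project's actual implementation.

import numpy as np
import scipy.signal
import torch


def discount_cumsum(x, discount):
    # Right-to-left cumulative discounted sum:
    # out[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]


class PPOBuffer:
    """Minimal GAE-lambda rollout buffer sketch (assumed, not the project's code)."""

    def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.97):
        self.obs_buf = np.zeros((size, *obs_dim), dtype=np.float32)
        self.act_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.logp_buf = np.zeros(size, dtype=np.float32)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size

    def store(self, obs, act, rew, val, logp):
        assert self.ptr < self.max_size
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.val_buf[self.ptr] = val
        self.logp_buf[self.ptr] = logp
        self.ptr += 1

    def finish_path(self, last_val=0):
        # last_val bootstraps the value target when a trajectory is cut off.
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)

        # GAE-lambda advantage estimates.
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam)

        # Rewards-to-go, used as targets for the value function.
        self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1]
        self.path_start_idx = self.ptr

    def get(self):
        # Advantage normalisation is a common stabilisation choice.
        assert self.ptr == self.max_size
        self.ptr, self.path_start_idx = 0, 0
        adv_mean, adv_std = self.adv_buf.mean(), self.adv_buf.std()
        self.adv_buf = (self.adv_buf - adv_mean) / (adv_std + 1e-8)
        data = dict(obs=self.obs_buf, act=self.act_buf, ret=self.ret_buf,
                    adv=self.adv_buf, logp=self.logp_buf)
        return {k: torch.as_tensor(v, dtype=torch.float32) for k, v in data.items()}

The project's buffer may differ in detail, for example by averaging the advantage statistics across MPI processes before normalising.
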
Code example #2
def td3(env_fn: Callable,
        actor_critic: torch.nn.Module = core.MLPActorCritic,
        ac_kwargs: Dict = None,
        seed: int = 0,
        steps_per_epoch: int = 4000,
        epochs: int = 2000,
        replay_size: int = int(1e6),
        gamma: float = 0.99,
        polyak: float = 0.995,
        pi_lr: Union[Callable, float] = 1e-3,
        q_lr: Union[Callable, float] = 1e-3,
        batch_size: int = 100,
        start_steps: int = 10000,
        update_after: int = 1000,
        update_every: int = 100,
        act_noise: Union[Callable, float] = 0.1,
        target_noise: float = 0.2,
        noise_clip: float = 0.5,
        policy_delay: int = 2,
        num_test_episodes: int = 3,
        max_ep_len: int = 1000,
        logger_kwargs: Dict = None,
        save_freq: int = 1,
        random_exploration: Union[Callable, float] = 0.0,
        save_checkpoint_path: str = None,
        load_checkpoint_path: str = None,
        load_model_file: str = None):
    """
    Twin Delayed Deep Deterministic Policy Gradient (TD3)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float or callable): Learning rate for policy.

        q_lr (float or callable): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float or callable): Stddev for Gaussian exploration noise added to
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        random_exploration (float or callable): Probability to randomly select
            an action instead of selecting from policy.

        save_checkpoint_path (str): Path to save training checkpoints. If not
            set, no checkpoints will be saved.

        load_checkpoint_path (str): Path to resume training from a checkpoint.
            Cannot be set together with ``save_checkpoint_path``.

        load_model_file (str): Path to a saved actor-critic model to initialise
            from. Cannot be set together with ``load_checkpoint_path``.
    """
    if logger_kwargs is None:
        logger_kwargs = dict()
    if ac_kwargs is None:
        ac_kwargs = dict()

    if save_checkpoint_path is not None:
        assert load_checkpoint_path is None, "load_checkpoint_path cannot be set when save_checkpoint_path is already set"
        if not os.path.exists(save_checkpoint_path):
            print(f"Folder {save_checkpoint_path} does not exist, creating...")
            os.makedirs(save_checkpoint_path)

    if load_checkpoint_path is not None:
        assert load_model_file is None, "load_checkpoint_path cannot be set when load_model_file is already set"
    # ------------ Initialisation begin ------------
    loaded_state_dict = None
    if load_checkpoint_path is not None:
        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())
        loaded_state_dict = load_latest_state_dict(load_checkpoint_path)

        logger.epoch_dict = loaded_state_dict['logger_epoch_dict']
        q_learning_rate_fn = loaded_state_dict['q_learning_rate_fn']
        pi_learning_rate_fn = loaded_state_dict['pi_learning_rate_fn']
        epsilon_fn = loaded_state_dict['epsilon_fn']
        act_noise_fn = loaded_state_dict['act_noise_fn']
        replay_buffer = loaded_state_dict['replay_buffer']
        env, test_env = loaded_state_dict['env'], loaded_state_dict['test_env']
        ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
        ac_targ = deepcopy(ac)
        ac.load_state_dict(loaded_state_dict['ac'])
        ac_targ.load_state_dict(loaded_state_dict['ac_targ'])
        obs_dim = env.observation_space.shape
        act_dim = env.action_space.shape[0]
        env.action_space.np_random.set_state(
            loaded_state_dict['action_space_state'])

        # List of parameters for both Q-networks (save this for convenience)
        q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())
        t_ori = loaded_state_dict['t']
        pi_optimizer = Adam(ac.pi.parameters(), lr=pi_learning_rate_fn(t_ori))
        pi_optimizer.load_state_dict(loaded_state_dict['pi_optimizer'])
        q_optimizer = Adam(q_params, lr=q_learning_rate_fn(t_ori))
        q_optimizer.load_state_dict(loaded_state_dict['q_optimizer'])
        np.random.set_state(loaded_state_dict['np_rng_state'])
        torch.set_rng_state(loaded_state_dict['torch_rng_state'])

    else:
        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())

        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)

        q_learning_rate_fn = get_schedule_fn(q_lr)
        pi_learning_rate_fn = get_schedule_fn(pi_lr)
        act_noise_fn = get_schedule_fn(act_noise)
        epsilon_fn = get_schedule_fn(random_exploration)

        env, test_env = env_fn(), env_fn()
        obs_dim = env.observation_space.shape
        act_dim = env.action_space.shape[0]

        env.action_space.seed(seed)

        # Experience buffer
        replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                     act_dim=act_dim,
                                     size=replay_size)

        # Create actor-critic module and target networks
        if load_model_file is not None:
            assert os.path.exists(
                load_model_file
            ), f"Model file path does not exist: {load_model_file}"
            ac = torch.load(load_model_file)
        else:
            ac = actor_critic(env.observation_space, env.action_space,
                              **ac_kwargs)
        ac_targ = deepcopy(ac)

        # List of parameters for both Q-networks (save this for convenience)
        q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

        # Set up optimizers for policy and q-function
        pi_optimizer = Adam(ac.pi.parameters(), lr=pi_learning_rate_fn(0))
        q_optimizer = Adam(q_params, lr=q_learning_rate_fn(0))
        t_ori = 0

    act_limit = 1.0

    # ------------ Initialisation end ------------

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
               var_counts)

    torch.set_printoptions(profile="default")

    # Set up function for computing TD3 Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = (data['obs'], data['act'], data['rew'], data['obs2'],
                          data['done'])

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            pi_targ = ac_targ.pi(o2)
            # Target policy smoothing
            epsilon = torch.randn_like(pi_targ) * target_noise
            epsilon = torch.clamp(epsilon, -noise_clip, noise_clip)
            a2 = pi_targ + epsilon
            a2 = torch.clamp(a2, -act_limit, act_limit)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        loss_info = dict(Q1Vals=q1.detach().numpy(),
                         Q2Vals=q2.detach().numpy())

        return loss_q, loss_info

    # Set up function for computing TD3 pi loss
    def compute_loss_pi(data):
        o = data['obs']
        q1_pi = ac.q1(o, ac.pi(o))
        return -q1_pi.mean()

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, timer):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **loss_info)

        # Possibly update pi and target networks
        if timer % policy_delay == 0:

            # Freeze Q-networks so you don't waste computational effort
            # computing gradients for them during the policy learning step.
            for p in q_params:
                p.requires_grad = False

            # Next run one gradient descent step for pi.
            pi_optimizer.zero_grad()
            loss_pi = compute_loss_pi(data)
            loss_pi.backward()
            pi_optimizer.step()

            # Unfreeze Q-networks so you can optimize it at next DDPG step.
            for p in q_params:
                p.requires_grad = True

            # Record things
            logger.store(LossPi=loss_pi.item())

            # Finally, update target networks by polyak averaging.
            with torch.no_grad():
                for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                    # NB: We use an in-place operations "mul_", "add_" to update target
                    # params, as opposed to "mul" and "add", which would make new tensors.
                    p_targ.data.mul_(polyak)
                    p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, noise_scale):
        a = ac.act(torch.as_tensor(o, dtype=torch.float32))
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for _ in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                scaled_action = get_action(o, 0)
                o, r, d, _ = test_env.step(
                    unscale_action(env.action_space, scaled_action))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    if loaded_state_dict is not None:
        o = loaded_state_dict['o']
        ep_ret = loaded_state_dict['ep_ret']
        ep_len = loaded_state_dict['ep_len']
    else:
        o, ep_ret, ep_len = env.reset(), 0, 0
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        t += t_ori
        # printMemUsage(f"start of step {t}")
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy (with some noise, via act_noise).
        if t > start_steps and np.random.rand() > epsilon_fn(t):
            a = get_action(o, act_noise_fn(t))
            unscaled_action = unscale_action(env.action_space, a)
        else:
            unscaled_action = env.action_space.sample()
            a = scale_action(env.action_space, unscaled_action)
        # Step the env
        o2, r, d, _ = env.step(unscaled_action)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch, timer=j)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            # Perform LR decay
            update_learning_rate(q_optimizer, q_learning_rate_fn(t))
            update_learning_rate(pi_optimizer, pi_learning_rate_fn(t))
            epoch = (t + 1) // steps_per_epoch

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            # Save model and checkpoint
            save_checkpoint = False
            checkpoint_path = ""
            if save_checkpoint_path is not None:
                save_checkpoint = True
                checkpoint_path = save_checkpoint_path
            if load_checkpoint_path is not None:
                save_checkpoint = True
                checkpoint_path = load_checkpoint_path
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({}, None)

                if save_checkpoint:
                    checkpoint_file = os.path.join(checkpoint_path,
                                                   f'save_{epoch}.pt')
                    torch.save(
                        {
                            'ac': ac.state_dict(),
                            'ac_targ': ac_targ.state_dict(),
                            'replay_buffer': replay_buffer,
                            'pi_optimizer': pi_optimizer.state_dict(),
                            'q_optimizer': q_optimizer.state_dict(),
                            'logger_epoch_dict': logger.epoch_dict,
                            'q_learning_rate_fn': q_learning_rate_fn,
                            'pi_learning_rate_fn': pi_learning_rate_fn,
                            'epsilon_fn': epsilon_fn,
                            'act_noise_fn': act_noise_fn,
                            'torch_rng_state': torch.get_rng_state(),
                            'np_rng_state': np.random.get_state(),
                            'action_space_state':
                                env.action_space.np_random.get_state(),
                            'env': env,
                            'test_env': test_env,
                            'ep_ret': ep_ret,
                            'ep_len': ep_len,
                            'o': o,
                            't': t + 1
                        }, checkpoint_file)
                    delete_old_files(checkpoint_path, 10)
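
This TD3 variant accepts either constants or callables for `pi_lr`, `q_lr`, `act_noise`, and `random_exploration`, and resolves them through `get_schedule_fn` and `update_learning_rate`, which are imported from elsewhere in the project. A minimal sketch of helpers compatible with the call sites above follows; the signatures match how they are used here, but the bodies are assumptions rather than the project's actual implementations.

from typing import Callable, Union

import torch


def get_schedule_fn(value: Union[Callable, float]) -> Callable[[int], float]:
    # Wrap a constant in a callable so schedules and constants can be used
    # interchangeably; callables are passed through unchanged.
    if callable(value):
        return value
    return lambda t: float(value)


def update_learning_rate(optimizer: torch.optim.Optimizer, lr: float) -> None:
    # Set the learning rate of every parameter group in-place.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


# Example: decay the Q-network learning rate linearly over 1M steps.
q_lr_fn = get_schedule_fn(lambda t: 1e-3 * max(0.0, 1.0 - t / 1_000_000))
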
Code example #3
File: egl.py  Project: eladsar/spinningup
def egl(env_fn,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=256,
        start_steps=10000,
        update_after=1000,
        update_every=50,
        num_test_episodes=10,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1,
        eps=0.4,
        n_explore=32,
        device='cuda',
        architecture='mlp',
        sample='on_policy'):
    """
    Soft Actor-Critic (SAC) variant with a learned mean-gradient network (EGL)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic (selected via ``architecture``): The constructor method for
            a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1``
            module, a ``q2`` module, and a ``geps`` (mean-gradient) module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            ``act``, ``q1``, and ``q2`` should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

            Calling ``pi`` should return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                           | actions in ``a``. Importantly: gradients
                                           | should be able to flow back into ``a``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to 
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        eps (float): Radius of the exploration ball around replayed actions
            used when training the mean-gradient network ``geps``.

        n_explore (int): Number of perturbed actions sampled per transition
            for the mean-gradient loss.

        device (str): Torch device to run on, e.g. ``'cuda'`` or ``'cpu'``.

        architecture (str): Actor-critic architecture, either ``'mlp'`` or
            ``'spline'``.

        sample (str): Action-selection strategy during training, either
            ``'on_policy'`` or ``'rbi'`` (rank-based reroute via
            ``max_reroute``).

    """

    if architecture == 'mlp':
        actor_critic = core.MLPActorCritic
    elif architecture == 'spline':
        actor_critic = core.SplineActorCritic
    else:
        raise NotImplementedError

    device = torch.device(device)
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space,
                      **ac_kwargs).to(device)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size,
                                 device=device)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2, ac.geps])
    logger.log(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t geps: %d\n'
        % var_counts)

    n_samples = 100
    cmin = 0.25
    cmax = 1.75
    greed = 0.01
    rand = 0.01

    def max_reroute(o):

        b, _ = o.shape
        o = repeat_and_reshape(o, n_samples)
        with torch.no_grad():
            ai, _ = ac.pi(o)

            q1 = ac.q1(o, ai)
            q2 = ac.q2(o, ai)
            qi = torch.min(q1, q2).unsqueeze(-1)

        qi = qi.view(n_samples, b, 1)
        ai = ai.view(n_samples, b, act_dim)
        rank = torch.argsort(torch.argsort(qi, dim=0, descending=True),
                             dim=0,
                             descending=False)
        w = cmin * torch.ones_like(ai)
        m = int((1 - cmin) * n_samples / (cmax - cmin))

        w += (cmax - cmin) * (rank < m).float()
        w += ((1 - cmin) * n_samples - m * (cmax - cmin)) * (rank == m).float()

        w -= greed
        w += greed * n_samples * (rank == 0).float()

        w = w * (1 - rand) + rand

        w = w / w.sum(dim=0, keepdim=True)

        prob = torch.distributions.Categorical(probs=w.permute(1, 2, 0))

        a = torch.gather(ai.permute(1, 2, 0), 2,
                         prob.sample().unsqueeze(2)).squeeze(2)

        return a, (ai, w.mean(-1))

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = (data['obs'], data['act'], data['rew'], data['obs2'],
                          data['done'])

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                      Q2Vals=q2.detach().cpu().numpy())

        return loss_q, q_info

    # # Set up function for computing EGL mean-gradient-losses
    # def compute_loss_g(data):
    #
    #     o, a1, r, o_tag, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']
    #
    #     a2 = ball_explore(a1, n_explore, eps)
    #
    #     a2 = a2.view(n_explore * len(r), act_dim)
    #     o_expand = repeat_and_reshape(o, n_explore)
    #
    #     # Bellman backup for Q functions
    #     with torch.no_grad():
    #
    #         q1 = ac.q1(o_expand, a2)
    #         q2 = ac.q2(o_expand, a2)
    #         q_dither = torch.min(q1, q2)
    #
    #         # Target actions come from *current* policy
    #         a_tag, logp_a_tag = ac.pi(o_tag)
    #
    #         # Target Q-values
    #         q1_pi_targ = ac_targ.q1(o_tag, a_tag)
    #         q2_pi_targ = ac_targ.q2(o_tag, a_tag)
    #         q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
    #         q_anchor = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a_tag)
    #
    #         q_anchor = repeat_and_reshape(q_anchor, n_explore).squeeze(-1)
    #
    #     geps = ac.geps(o, a1)
    #     geps = repeat_and_reshape(geps, n_explore)
    #     a1 = repeat_and_reshape(a1, n_explore)
    #
    #     geps = (geps * (a2 - a1)).sum(-1)
    #     # l1 loss against Bellman backup
    #
    #     loss_g = F.smooth_l1_loss(geps, q_dither - q_anchor)
    #
    #     # Useful info for logging
    #     g_info = dict(GVals=geps.flatten().detach().cpu().numpy())
    #
    #     return loss_g, g_info

    # Set up function for computing EGL mean-gradient-losses
    def compute_loss_g(data):
        o, a1, r, o_tag, d = (data['obs'], data['act'], data['rew'],
                              data['obs2'], data['done'])

        a2 = ball_explore(a1, n_explore, eps)

        a2 = a2.view(n_explore * len(r), act_dim)
        o_expand = repeat_and_reshape(o, n_explore)

        # Bellman backup for Q functions
        with torch.no_grad():
            q1 = ac.q1(o_expand, a2)
            q2 = ac.q2(o_expand, a2)
            q_dither = torch.min(q1, q2)

            # Target actions come from *current* policy

            # Target Q-values
            q1 = ac.q1(o, a1)
            q2 = ac.q2(o, a1)
            q_anchor = torch.min(q1, q2)

            q_anchor = repeat_and_reshape(q_anchor, n_explore).squeeze(-1)

        geps = ac.geps(o, a1)
        geps = repeat_and_reshape(geps, n_explore)
        a1 = repeat_and_reshape(a1, n_explore)

        geps = (geps * (a2 - a1)).sum(-1)
        # l1 loss against Bellman backup

        loss_g = F.smooth_l1_loss(geps, q_dither - q_anchor)

        # Useful info for logging
        g_info = dict(GVals=geps.flatten().detach().cpu().numpy())

        return loss_g, g_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        geps_pi = ac.geps(o, pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - (geps_pi * pi).sum(-1)).mean()

        beta = autograd.Variable(pi.detach().clone(), requires_grad=True)
        q1_pi = ac.q1(o, beta)
        q2_pi = ac.q2(o, beta)
        qa = torch.min(q1_pi, q2_pi).unsqueeze(-1)

        grad_q = autograd.grad(outputs=qa,
                               inputs=beta,
                               # ones_like keeps this device-agnostic (works on
                               # CPU as well as CUDA)
                               grad_outputs=torch.ones_like(qa),
                               create_graph=False,
                               retain_graph=False,
                               only_inputs=True)[0]

        # Useful info for logging
        pi_info = dict(
            LogPi=logp_pi.detach().cpu().numpy(),
            GradGAmp=torch.norm(geps_pi, dim=-1).detach().cpu().numpy(),
            GradQAmp=torch.norm(grad_q, dim=-1).detach().cpu().numpy(),
            GradDelta=torch.norm(geps_pi - grad_q,
                                 dim=-1).detach().cpu().numpy(),
            GradSim=F.cosine_similarity(geps_pi, grad_q,
                                        dim=-1).detach().cpu().numpy(),
        )

        return loss_pi, pi_info

    if architecture == 'mlp':
        # Set up optimizers for policy and q-function
        pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
        q_optimizer = Adam(q_params, lr=lr)
        g_optimizer = Adam(ac.geps.parameters(), lr=lr)
    elif architecture == 'spline':
        # Set up optimizers for policy and q-function
        pi_optimizer = SparseDenseAdamOptimizer(ac.pi,
                                                dense_args={'lr': lr},
                                                sparse_args={'lr': 10 * lr})
        q_optimizer = SparseDenseAdamOptimizer([ac.q1, ac.q2],
                                               dense_args={'lr': lr},
                                               sparse_args={'lr': 10 * lr})
        g_optimizer = SparseDenseAdamOptimizer(ac.geps,
                                               dense_args={'lr': lr},
                                               sparse_args={'lr': 10 * lr})
    else:
        raise NotImplementedError

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):

        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Next run one gradient descent step for the mean-gradient
        g_optimizer.zero_grad()
        loss_g, g_info = compute_loss_g(data)
        loss_g.backward()
        g_optimizer.step()

        # Record things
        logger.store(LossG=loss_g.item(), **g_info)

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in ac.geps.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize it at next DDPG step.
        for p in ac.geps.parameters():
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action_on_policy(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device),
                      deterministic)

    def get_action_rbi(o, deterministic=False):
        o = torch.as_tensor(o, dtype=torch.float32, device=device)
        if deterministic:
            a = ac.act(o, deterministic)
        else:
            o = o.unsqueeze(0)
            a, _ = max_reroute(o)
            a = a.flatten().cpu().numpy()
        return a

    if sample == 'on_policy':
        get_action = get_action_on_policy
    elif sample == 'rbi':
        get_action = get_action_rbi
    else:
        raise NotImplementedError

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in tqdm(range(total_steps)):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)

            logger.log_tabular('GVals', with_min_and_max=True)
            logger.log_tabular('LossG', with_min_and_max=True)
            logger.log_tabular('GradGAmp', with_min_and_max=True)
            logger.log_tabular('GradQAmp', with_min_and_max=True)
            logger.log_tabular('GradDelta', with_min_and_max=True)
            logger.log_tabular('GradSim', with_min_and_max=True)

            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
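
The EGL example above leans on two small helpers, `repeat_and_reshape` and `ball_explore`, that are imported from the project rather than shown. Judging only from how they are called (shapes recovered from `max_reroute` and `compute_loss_g`), they could look roughly like the following sketch; the uniform noise inside the exploration ball and the [-1, 1] clamp are assumptions.

import torch


def repeat_and_reshape(x: torch.Tensor, n: int) -> torch.Tensor:
    # Tile a batch n times along a new leading dimension, then fold that
    # dimension back into the batch: (b, ...) -> (n * b, ...), so that a later
    # .view(n, b, ...) recovers the per-sample grouping.
    return x.unsqueeze(0).expand(n, *x.shape).reshape(n * x.shape[0], *x.shape[1:])


def ball_explore(a: torch.Tensor, n: int, eps: float) -> torch.Tensor:
    # Sample n perturbed copies of each action inside an eps-ball around it,
    # keeping the result inside the normalised action bounds.
    b, act_dim = a.shape
    a_rep = a.unsqueeze(0).expand(n, b, act_dim)
    noise = eps * (2.0 * torch.rand_like(a_rep) - 1.0)  # uniform in [-eps, eps]
    return torch.clamp(a_rep + noise, -1.0, 1.0)

With these shapes, `a2.view(n_explore * len(r), act_dim)` in `compute_loss_g` and the `qi.view(n_samples, b, 1)` reshape in `max_reroute` line up with how the observations were tiled.
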
Code example #4
def td3(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 
        steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, 
        polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, 
        update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, 
        noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, 
        logger_kwargs=dict(), save_freq=1):
    """
    Twin Delayed Deep Deterministic Policy Gradient (TD3)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False
        
    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n'%var_counts)


    #=========================================================================#
    #                                                                         #
    #           All of your code goes in the space below.                     #
    #                                                                         #
    #=========================================================================#

    # Set up function for computing TD3 Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Compute targets (no gradients flow through the target networks)
        with torch.no_grad():
            # Target policy smoothing: clipped noise drawn independently per sample
            a_next = ac_targ.pi(o2)
            noise = torch.clamp(target_noise * torch.randn_like(a_next), -noise_clip, noise_clip)
            a_next = torch.clamp(a_next + noise, -act_limit, act_limit)

            # Clipped double-Q target
            q1_targ = ac_targ.q1(o2, a_next)
            q2_targ = ac_targ.q2(o2, a_next)
            y = r + gamma * (1 - d) * torch.min(q1_targ, q2_targ)

        # MSE loss against the Bellman backup
        loss_q1 = ((q1 - y) ** 2).mean()
        loss_q2 = ((q2 - y) ** 2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        loss_info = dict(Q1Vals=q1.detach().numpy(),
                         Q2Vals=q2.detach().numpy())

        return loss_q, loss_info

    # Set up function for computing TD3 pi loss
    def compute_loss_pi(data):
        o = torch.as_tensor(data['obs'], dtype=torch.float32)
        loss_pi = -ac.q1(o, ac.pi(o)).mean()  # maximize Q1(o, pi(o)) by minimizing its negative
        return loss_pi

    #=========================================================================#
    #                                                                         #
    #           All of your code goes in the space above.                     #
    #                                                                         #
    #=========================================================================#

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(q_params, lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, timer):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **loss_info)

        # Possibly update pi and target networks
        if timer % policy_delay == 0:

            # Freeze Q-networks so you don't waste computational effort 
            # computing gradients for them during the policy learning step.
            for p in q_params:
                p.requires_grad = False

            # Next run one gradient descent step for pi.
            pi_optimizer.zero_grad()
            loss_pi = compute_loss_pi(data)
            loss_pi.backward()
            pi_optimizer.step()

            # Unfreeze Q-networks so you can optimize them at the next TD3 step.
            for p in q_params:
                p.requires_grad = True

            # Record things
            logger.store(LossPi=loss_pi.item())

            # Finally, update target networks by polyak averaging.
            with torch.no_grad():
                for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                    # NB: We use in-place operations "mul_" and "add_" to update target
                    # params, as opposed to "mul" and "add", which would create new tensors.
                    p_targ.data.mul_(polyak)
                    p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, noise_scale):
        a = ac.act(torch.as_tensor(o, dtype=torch.float32))
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards, 
        # use the learned policy (with some noise, via act_noise). 
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch, timer=j)

        # End of epoch handling
        if (t+1) % steps_per_epoch == 0:
            epoch = (t+1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
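
For reference, here is a minimal sketch of an actor-critic module exposing the ``act``/``pi``/``q1``/``q2`` interface described in the docstring above. The class and helper names are illustrative assumptions, not the actual module this example imports.

import torch
import torch.nn as nn


def _mlp(sizes, activation=nn.ReLU, output_activation=nn.Identity):
    # Simple fully-connected stack used by the sketch below.
    layers = []
    for i in range(len(sizes) - 1):
        act = activation if i < len(sizes) - 2 else output_activation
        layers += [nn.Linear(sizes[i], sizes[i + 1]), act()]
    return nn.Sequential(*layers)


class SketchQFunction(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden=(256, 256)):
        super().__init__()
        self.q = _mlp([obs_dim + act_dim] + list(hidden) + [1])

    def forward(self, obs, act):
        # Critical: return shape (batch,), not (batch, 1).
        return self.q(torch.cat([obs, act], dim=-1)).squeeze(-1)


class SketchPolicy(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit, hidden=(256, 256)):
        super().__init__()
        self.net = _mlp([obs_dim] + list(hidden) + [act_dim], output_activation=nn.Tanh)
        self.act_limit = act_limit

    def forward(self, obs):
        # Deterministic action scaled into [-act_limit, act_limit].
        return self.act_limit * self.net(obs)


class SketchTD3ActorCritic(nn.Module):
    def __init__(self, observation_space, action_space, hidden=(256, 256)):
        super().__init__()
        obs_dim = observation_space.shape[0]
        act_dim = action_space.shape[0]
        act_limit = float(action_space.high[0])
        # Submodules, so ac.pi / ac.q1 / ac.q2 each expose .parameters().
        self.pi = SketchPolicy(obs_dim, act_dim, act_limit, hidden)
        self.q1 = SketchQFunction(obs_dim, act_dim, hidden)
        self.q2 = SketchQFunction(obs_dim, act_dim, hidden)

    def act(self, obs):
        # Numpy actions for interaction with the environment.
        with torch.no_grad():
            return self.pi(obs).numpy()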
コード例 #5
0
def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=1,
         steps_per_epoch=2000, epochs=10000, replay_size=int(1e5), gamma=0.99,
         polyak=0.995, pi_lr=1e-4, q_lr=1e-4, batch_size=128, start_steps=2000,
         update_after=1000, update_every=1000, act_noise=0.05, num_test_episodes=1,
         max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """
    Deep Deterministic Policy Gradient (DDPG)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, and a ``q`` module. The ``act`` method and
            ``pi`` module should accept batches of observations as inputs,
            and ``q`` should accept a batch of observations and a batch of 
            actions as inputs. When called, these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q``        (batch,)          | Tensor containing the current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    rospy.init_node('DDPG_Train')
    env = env_fn()

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]
    print(f"[DDPG] obs dim: {obs_dim} action dim: {act_dim}")

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    # ac.apply(init_weights)
    ac_targ = deepcopy(ac)
    ac.eval()  # disable training-mode behavior (e.g. BatchNorm) outside of updates
    print(f"[MODEL] Actor_Critic: {ac}")

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q])
    logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n'%var_counts)

    # Set up function for computing DDPG Q-loss
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q = ac.q(o, a)

        # Bellman backup for Q function
        with torch.no_grad():
            q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2))
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q = ((q - backup)**2).mean()

        # Useful info for logging
        loss_info = dict(QVals=q.cpu().detach().numpy())

        return loss_q, loss_info

    # Set up function for computing DDPG pi loss
    def compute_loss_pi(data):
        o = data['obs']
        q_pi = ac.q(o, ac.pi(o))
        return -q_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(ac.q.parameters(), lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q.
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Freeze Q-network so you don't waste computational effort 
        # computing gradients for it during the policy learning step.
        for p in ac.q.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-network so you can optimize it at next DDPG step.
        for p in ac.q.parameters():
            p.requires_grad = True

        # Record things
        logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info)

    def soft_target_update():
        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would create new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, noise_scale):
        o = torch.as_tensor(o, dtype=torch.float32)
        if o.dim() == 1:
            o = o.unsqueeze(0)
        a = ac.act(o)[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, env.act_limit_min, env.act_limit_max)

    def test_agent():
        print("[DDPG] eval......")
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = env.reset(), False, 0, 0
            while not (d or (ep_len == 100)):  # evaluation rollouts are capped at 100 steps here
                # Take deterministic actions at test time (noise_scale=0)
                a = get_action(o, 0)
                print(f"[Eval] a: {a}")
                o, r, d, _ = env.step(a)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards, 
        # use the learned policy (with some noise, via act_noise).

        print(f"O {o[-4]:.3f} {o[-3]:.3f} {o[-2]:.3f} {o[-1]:.3f} ")
        if t > start_steps:
            # if np.random.rand() > 0.3:
            a = get_action(o, act_noise)
            # else:
            # a = env.action_space.sample()
        else:
            a = env.action_space.sample()
        print(f't {t:7.0f} | a [{a[0]:.3f},{a[1]:.3f}]')

        # Step the env
        o2, r, d, info = env.step(a)
        # print(f"O {o[-4:]} |A {a} |O2 {o2[-4:]} |R {r} |D {d} |Info {info}")
        print(f"          ------------------> R: {r:.3f}")
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            env.pause_pedsim()
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0
            env.unpause_pedsim()


        # Update handling
        if t >= update_after and t % update_every == 0:
            env.pause_pedsim()
            ac.train()  # enable training-mode behavior (e.g. BatchNorm) for updates
            ac_targ.train()
            if torch.cuda.is_available():
                ac.cuda()
                ac_targ.cuda()
            for _ in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                if torch.cuda.is_available():
                    for key, value in batch.items():
                        batch[key] = value.cuda()
                update(data=batch)
                soft_target_update()
            ac.eval()
            ac_targ.eval()
            if torch.cuda.is_available():
                ac.cpu()
                ac_targ.cpu()
            env.unpause_pedsim()

        # End of epoch handling
        if (t+1) % steps_per_epoch == 0:
            epoch = (t+1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()
            o, d, ep_ret, ep_len = env.reset(), False, 0, 0

            sec = time.time() - start_time
            elapsed_time = str(datetime.timedelta(seconds=sec)).split('.')[0]


            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', elapsed_time)
            logger.dump_tabular()
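
Both this example and the TD3 one above assume a ``ReplayBuffer`` with ``store(o, a, r, o2, d)`` and ``sample_batch(batch_size)`` returning a dict of float tensors keyed ``obs``/``act``/``rew``/``obs2``/``done``. A minimal sketch of such a buffer follows; it is an assumption for illustration, not the buffer class actually imported here.

import numpy as np
import torch


class SketchReplayBuffer:
    """A simple FIFO replay buffer matching the store/sample_batch usage above.

    ``obs_dim`` is expected to be the observation shape tuple and ``act_dim``
    the (integer) action dimension, as in the examples above.
    """

    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros((size, *obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, *obs_dim), dtype=np.float32)
        self.act_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size  # overwrite oldest entries
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs],
                     rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        return {k: torch.as_tensor(v, dtype=torch.float32) for k, v in batch.items()}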
コード例 #6
0
def a2c(env_fn,
        agent: Agent,
        seed=0,
        num_cpu=1,
        device=torch.device("cpu"),
        epochs=1000,
        steps_per_epoch=100,
        gamma=0.99,
        use_gae=True,
        tau=0.95,
        max_grad_norm=0.5,
        polyak=0.995,
        learning_rate=1e-3,
        value_loss_coef=0.5,
        policy_loss_coef=1,
        entropy_loss_coef=0.1,
        grid_layer_weight_reg_loss_coef=1e-4,
        save_every=100,
        log_every=10,
        logger_kwargs=dict(),
        test_every=100,
        num_test_episodes=5,
        deterministic=False,
        save_freq=1,
        solved_score=None,
        render=False,
        ):
    use_MPI = num_cpu > 1

    if use_MPI:
        # Special function to avoid certain slowdowns from PyTorch + MPI combo.
        mpi_pytorch.setup_pytorch_for_mpi()
    else:
        torch.set_num_threads(torch.get_num_threads())

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    config = locals()
    del config['env_fn']
    del config['agent']
    del config['logger']
    logger.save_config(config)

    test_logger_kwargs = deepcopy(logger_kwargs)
    test_logger_kwargs['output_dir'] = pathlib.Path(test_logger_kwargs['output_dir']) / 'evaluation'
    test_logger = EpochLogger(**test_logger_kwargs)

    # Random seed
    if use_MPI:
        seed += 10000 * mpi_tools.proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()

    assert env.max_episode_steps > 0

    obs_shape = env.observation_space.shape
    act_dim = env.action_space.n

    # training model and target model
    target_agent = deepcopy(agent)
    if use_MPI:
        # Sync params across processes
        mpi_pytorch.sync_params(agent)
        mpi_pytorch.sync_params(target_agent)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in target_agent.parameters():
        p.requires_grad = False

    # Utilize GPU
    agent.to(device)
    target_agent.to(device)

    # Set up optimizers for policy and q-function
    optimizer = Adam(agent.parameters(), lr=learning_rate)

    # Set up model saving
    logger.setup_pytorch_saver(agent, name='model')

    def update(episode_buffer):
        # Update
        if episode_buffer.dones[-1]:
            next_value = 0.0
        else:
            last_obs = episode_buffer.next_observations[-1]
            previous_reward = episode_buffer.rewards[-1]
            last_obs_tensor = torch.tensor(last_obs, dtype=torch.float32).unsqueeze(0)
            previous_reward_tensor = torch.tensor([previous_reward], dtype=torch.float32).unsqueeze(0)
            context = agent.get_context()
            next_value = target_agent.predict_value(obs_tensor=last_obs_tensor,
                                                    previous_reward_tensor=previous_reward_tensor,
                                                    goal_grid_code_tensor=goal_grid_code_tensor,
                                                    context=context).cpu().item()

        # Super critical!!
        optimizer.zero_grad()

        # Compute value and policy losses
        loss, info = agent.compute_loss(rewards=np.array(episode_buffer.rewards),
                                        dones=np.array(episode_buffer.dones),
                                        next_value=next_value,
                                        discount_factor=gamma,
                                        use_gae=use_gae,
                                        tau=tau,
                                        value_loss_coef=value_loss_coef,
                                        policy_loss_coef=policy_loss_coef,
                                        entropy_reg_coef=entropy_loss_coef,
                                        grid_layer_wreg_loss_coef=grid_layer_weight_reg_loss_coef)
        loss.backward()
        if use_MPI:
            mpi_pytorch.mpi_avg_grads(agent)

        # Optimize
        if max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
        optimizer.step()

        # Log losses and info
        logger.store(**info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(agent.parameters(), target_agent.parameters()):
                # NB: We use in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would create new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)
        if use_MPI:
            mpi_pytorch.sync_params(target_agent)

    # Prepare for interaction with environment
    start_time = time.time()

    # Main loop: collect experience in env and update/log each epoch
    total_steps = 0

    # Reset env
    obs = env.reset()
    reward = 0
    goal_grid_code_tensor = None

    # Reset episode stats
    episode_return = 0
    episode_length = 0

    for epoch in range(1, epochs + 1):
        agent.reset_for_training()
        epoch_history = EpisodeHistory()
        for t in range(steps_per_epoch):
            total_steps += 1

            # Get action from the model
            obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
            previous_reward_tensor = torch.tensor([reward], dtype=torch.float32).unsqueeze(0)
            action = agent.step(obs_tensor, previous_reward_tensor, goal_grid_code_tensor).squeeze(0)

            # Step the env
            obs2, reward, done, _ = env.step(action.detach().cpu().item())
            if render and mpi_tools.proc_id() == 0:
                env.render('human', view='top')
                time.sleep(1e-3)
            episode_return += reward
            episode_length += 1

            # Store transition to history
            epoch_history.store(observation=None, action=None, reward=reward, done=done, next_observation=obs2)

            # Super critical, easy to overlook step: make sure to update
            # most recent observation!
            obs = obs2

            # End of trajectory handling
            if done:
                if reward > 0:
                    goal_grid_code_tensor = agent.current_grid_code.detach()
                break

        update(epoch_history)

        # if done
        if epoch_history.dones[-1]:
            logger.store(EpRet=episode_return, EpLen=episode_length)
            # Reset env
            obs = env.reset()
            agent.reset()
            # Reset episode stats
            episode_return = 0
            episode_length = 0

        # End of epoch handling
        if epoch % log_every == 0:
            total_interactions = mpi_tools.mpi_sum(total_steps) if use_MPI else total_steps

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('Value', average_only=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossEntropy', average_only=True)
            logger.log_tabular('LossGridL2', average_only=True)
            logger.log_tabular('LossPIM', average_only=True)
            logger.log_tabular('TotalEnvInteracts', total_interactions)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

        # Test agent
        solved = False
        if epoch % test_every == 0:
            video_dir = pathlib.Path(logger.output_dir) / 'test_videos' / f'epoch-{epoch:d}'
            test_env_fn = lambda: Monitor(env_fn(), directory=video_dir)
            # Test the performance of the deterministic version of the agent.
            context = agent.get_context()
            agent.eval()
            episode_info = evaluate_agent(env_fn=test_env_fn,
                                          agent=agent,
                                          deterministic=deterministic,
                                          num_episodes=num_test_episodes,
                                          render=False,
                                          logger=test_logger)
            agent.train()
            agent.set_context(context)
            if solved_score is not None:
                solved = all(r >= solved_score for (t, r) in episode_info)

        # Save model
        if (epoch % save_every == 0) or (epoch == epochs) or solved:
            logger.save_state({'env': env})

        # Check environment is solved
        if solved:
            plog = lambda msg: logger.log(msg, color='green')
            plog("=" * 40)
            plog(f"ENVIRONMENT SOLVED!")
            plog("=" * 40)
            plog(f'    TotalEnvInteracts {total_steps}')
            plog(f'    Time {time.time() - start_time}')
            plog(f'    Epoch {epoch}')
            break

    torch.save(agent, str(logger.output_dir / 'agent.pt'))
    env.close()
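
The ``use_gae`` and ``tau`` arguments above are consumed inside ``agent.compute_loss``, whose internals are not shown. As a rough sketch of what GAE(λ) advantage estimation computes from a rollout's rewards, dones, value estimates, and the bootstrap ``next_value`` (with ``tau`` playing the role of λ; this is an assumption about the agent's internals, not its actual code):

import numpy as np


def sketch_gae(rewards, dones, values, next_value, gamma=0.99, tau=0.95):
    # Backward pass over one rollout; `tau` is the GAE lambda parameter.
    T = len(rewards)
    advantages = np.zeros(T, dtype=np.float32)
    gae = 0.0
    for t in reversed(range(T)):
        v_next = next_value if t == T - 1 else values[t + 1]
        nonterminal = 1.0 - float(dones[t])
        # One-step TD residual, zeroed across episode boundaries.
        delta = rewards[t] + gamma * v_next * nonterminal - values[t]
        gae = delta + gamma * tau * nonterminal * gae
        advantages[t] = gae
    returns = advantages + np.asarray(values, dtype=np.float32)
    return advantages, returns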
コード例 #7
0
def vpg(env,
        actor_critic=MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_v_iters=80,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Vanilla Policy Gradient 
    (with GAE 0 for advantage estimation)
    Args:
        env : An environment that satisfies the OpenAI Gym API.
        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:
            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================
            The ``act`` method behaves the same as ``step`` but only returns ``a``.
            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:
            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================
            The ``v`` module's forward call should accept a batch of observations
            and return:
            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================
        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to VPG.
        seed (int): Seed for random number generators.
        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.
        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.
        gamma (float): Discount factor. (Always between 0 and 1.)
        pi_lr (float): Learning rate for policy optimizer.
        vf_lr (float): Learning rate for value function optimizer.
        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.
        max_ep_len (int): Maximum length of trajectory / episode / rollout.
        logger_kwargs (dict): Keyword args for EpochLogger.
        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.n  # assumes Discrete space

    ac = actor_critic(env.observation_space, env.action_space)
    ac.to(device)

    # buffer size equals number of steps in an epoch
    buff = VPGBuffer(steps_per_epoch, gamma, obs_dim, act_dim)

    def compute_loss_pi(data):
        obs = torch.as_tensor(data.obs_buf, dtype=torch.float32, device=device)
        act = torch.as_tensor(data.act_buf, dtype=torch.int32, device=device)
        adv = torch.as_tensor(data.advantage_buf,
                              dtype=torch.float32,
                              device=device)
        logpa = ac.pi(obs, act)
        return -1 * (logpa * adv).mean()

    def compute_loss_v(data):
        obs = torch.as_tensor(data.obs_buf, dtype=torch.float32, device=device)
        rew2go = torch.as_tensor(data.rew2go_buf,
                                 dtype=torch.float32,
                                 device=device)
        values = ac.v(obs)
        return F.mse_loss(values, rew2go)

    pi_optimizer = torch.optim.Adam(ac.pi.parameters(), lr=pi_lr)
    v_optimizer = torch.optim.Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update_pi(data):
        pi_optimizer.zero_grad()
        pi_loss = compute_loss_pi(data)
        pi_loss.backward()
        pi_optimizer.step()

        logger.store(LossPi=pi_loss.item())
        #TODO: log policy entropy

    def update_v(data):
        for s in range(train_v_iters):
            v_optimizer.zero_grad()
            v_loss = compute_loss_v(data)
            v_loss.backward()
            v_optimizer.step()

            logger.store(LossV=v_loss.item())

    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    t = 0  # total environment interactions

    # Update policy once per epoch
    for epoch in range(epochs):
        for t_epoch in range(steps_per_epoch):
            t += 1
            a, v, logpa = ac.step(
                torch.as_tensor(o, dtype=torch.float32, device=device))
            o2, r, d, info = env.step(a.cpu().numpy())
            buff.store(o, a, v, r, logpa)

            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            d = False if ep_len == max_ep_len else d

            o = o2

            # If trajectory is finished, calculate rewards to go,
            # then calculate the Advantage.
            if d or (ep_len == max_ep_len) or (t_epoch + 1 == steps_per_epoch):
                buff.finish_trajectory()
                logger.store(
                    EpRet=ep_ret,
                    EpLen=ep_len,
                )

                o, ep_ret, ep_len = env.reset(), 0, 0

            # Calculate policy gradient when we've collected t_epoch time steps.
            if t_epoch + 1 == steps_per_epoch:
                pylogger.debug('*** epoch *** %s', epoch)
                pylogger.debug('*** t_epoch *** %s', t_epoch)
                pylogger.debug('values %s', buff.val_buf)
                pylogger.debug('rewards %s', buff.rew_buf)
                pylogger.debug('rew2go %s', buff.rew2go_buf)
                pylogger.debug('advantage %s', buff.advantage_buf)

                # Update the policy using policy gradient
                update_pi(buff)

                # Re-fit the value function on the MSE.  Note, this is
                # gradient descent starting from the previous parameters.
                update_v(buff)

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs):
            logger.save_state({'env': env},
                              None)  # note, this includes full model pickle

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('TotalEnvInteracts', t)
        logger.log_tabular('Time', time.time() - start_time)
        if hasattr(env, 'episode_id'):
            logger.log_tabular('EpisodeId', env.episode_id)

        # If a quantity has not been calculated/stored yet, do not log it.  This can
        # happen, e.g. if NN update length or episode length exceeds num steps in epoch.
        to_log = [{
            'key': 'LossV',
            'average_only': True
        }, {
            'key': 'LossPi',
            'average_only': True
        }, {
            'key': 'EpRet',
            'with_min_and_max': True
        }, {
            'key': 'EpLen',
            'average_only': True
        }, {
            'key': 'RawRet',
            'with_min_and_max': True
        }, {
            'key': 'RawLen',
            'average_only': True
        }]

        for log_tabular_kwargs in to_log:
            key = log_tabular_kwargs['key']
            if key in logger.epoch_dict and len(logger.epoch_dict[key]) > 0:
                logger.log_tabular(**log_tabular_kwargs)

        wandb.log(logger.log_current_row, step=epoch)
        logger.dump_tabular()

        # reset buffer
        buff = VPGBuffer(steps_per_epoch, gamma, obs_dim, act_dim)

    # Save final model as a state dict
    state = {
        'epoch': epoch,
        'pi_state_dict': ac.pi.state_dict(),
        'v_state_dict': ac.v.state_dict(),
        'pi_optimizer': pi_optimizer.state_dict(),
        'v_optimizer': v_optimizer.state_dict(),
    }
    # hack for wandb: should output the model in the wandb.run.dir to avoid
    # problems syncing the model in the cloud with wandb's files
    state_fname = os.path.join(logger_kwargs['output_dir'], f"state_dict.pt")
    torch.save(state, state_fname)
    wandb.save(state_fname)
    pylogger.info(f"Saved state dict to {state_fname}")
    env.close()
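
``VPGBuffer`` is not shown; the value loss above regresses onto ``rew2go_buf``, i.e. the discounted reward-to-go along each trajectory. A sketch of that computation (illustrative only; the real buffer may differ):

import numpy as np


def sketch_reward_to_go(rewards, gamma=0.99):
    # R_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...
    rtg = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        rtg[t] = running
    return rtg

With GAE(0), as the docstring notes, the advantage reduces to the one-step TD residual r_t + gamma * V(s_{t+1}) - V(s_t), so the reward-to-go presumably serves only as the regression target for the value function.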
コード例 #8
0
class VPG:
    """
	VPG w/ GAE-Lambda
	"""
    def __init__(self,
                 env_maker: Callable,
                 ac_maker=core.MLPActorCritic,
                 ac_kwargs={},
                 seed: int = 0,
                 epochs: int = 50,
                 steps_per_epoch: int = 4000,
                 gamma: float = 0.99,
                 actor_lr: float = 3e-4,
                 critic_lr: float = 1e-3,
                 num_iter_train_critic: int = 80,
                 lam: float = 0.97,
                 max_episode_len: int = 1000,
                 logger_kwargs=dict(),
                 save_freq: int = 10):
        # Special function to avoid certain slowdowns from PyTorch + MPI combo.
        setup_pytorch_for_mpi()
        # Set up logger and save configuration
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())
        # Random seed
        seed += 10000 * proc_id()
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.epochs = epochs
        self.steps_per_epoch = steps_per_epoch
        self.num_iter_train_critic = num_iter_train_critic
        self.max_episode_len = max_episode_len
        self.save_freq = save_freq

        # make env
        self.env = env_maker()
        self.obs_dim = self.env.observation_space.shape
        self.act_dim = self.env.action_space.shape

        # make actor-critic
        self.ac = ac_maker(self.env.observation_space, self.env.action_space,
                           **ac_kwargs)

        # make buffer
        self.local_steps_per_epoch = int(steps_per_epoch / num_procs())
        self.buffer = Buffer(self.obs_dim, self.act_dim,
                             self.local_steps_per_epoch, gamma, lam)

        # make optimizers
        self.actor_optimizer = Adam(self.ac.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = Adam(self.ac.critic.parameters(), lr=critic_lr)

        # Sync params across processes
        sync_params(self.ac)
        # Count variables
        var_counts = tuple(
            core.count_vars(module)
            for module in [self.ac.actor, self.ac.critic])
        self.logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' %
                        var_counts)
        # Set up model saving
        self.logger.setup_pytorch_saver(self.ac)

    def compute_actor_loss(self, data):
        obs, act, adv, logprob_old = data['obs'], data['act'], data[
            'adv'], data['logprob']

        # policy loss
        pi, logprob = self.ac.actor(obs, act)
        loss_actor = -(logprob * adv).mean()

        # extra info
        approx_kl = (logprob_old - logprob).mean().item()
        entropy = pi.entropy().mean().item()
        pi_info = dict(kl=approx_kl, entropy=entropy)

        return loss_actor, pi_info

    def compute_critic_loss(self, data):
        obs, ret = data['obs'], data['ret']
        return ((self.ac.critic(obs) - ret)**2).mean()

    def update(self):
        data = self.buffer.get()

        actor_loss_old, actor_info_old = self.compute_actor_loss(data)
        actor_loss_old = actor_loss_old.item()
        critic_loss_old = self.compute_critic_loss(data).item()

        # train policy
        self.actor_optimizer.zero_grad()
        actor_loss, actor_info = self.compute_actor_loss(data)
        actor_loss.backward()
        mpi_avg_grads(self.ac.actor)
        self.actor_optimizer.step()

        # train critic
        for i in range(self.num_iter_train_critic):
            self.critic_optimizer.zero_grad()
            critic_loss = self.compute_critic_loss(data)
            critic_loss.backward()
            mpi_avg_grads(self.ac.critic)
            self.critic_optimizer.step()

        #log
        kl, entropy = actor_info['kl'], actor_info['entropy']
        self.logger.store(LossPi=actor_loss_old,
                          LossV=critic_loss_old,
                          KL=kl,
                          Entropy=entropy,
                          DeltaLossV=(critic_loss.item() - critic_loss_old),
                          DeltaLossPi=(actor_loss.item() - actor_loss_old))

    def train(self):
        start_time = time.time()
        obs, episode_ret, episode_len = self.env.reset(), 0, 0

        for epoch in range(self.epochs):
            for t in range(self.local_steps_per_epoch):
                act, v, logprob = self.ac.step(
                    torch.as_tensor(obs, dtype=torch.float32))
                # print(f"act: {act}")
                # print(f"v: {v}")
                # print(f"logprob: {logprob}")

                obs_next, reward, done, _ = self.env.step(act)
                episode_ret += reward
                episode_len += 1

                self.buffer.store(obs, act, reward, v, logprob)
                self.logger.store(VVals=v)

                obs = obs_next

                # episode end/timeout logic
                timeout = (episode_len == self.max_episode_len)
                terminal = (done or timeout)
                epoch_ended = (t == self.local_steps_per_epoch - 1)

                if terminal or epoch_ended:
                    if epoch_ended and not terminal:
                        # print(f"Warning: trajectory cut off by epoch at {episode_len} steps")
                        pass
                    if timeout or epoch_ended:
                        _, v, _ = self.ac.step(
                            torch.as_tensor(obs, dtype=torch.float32))
                    else:
                        v = 0
                    self.buffer.finish_path(v)
                    if terminal:
                        self.logger.store(EpRet=episode_ret, EpLen=episode_len)
                    obs, episode_ret, episode_len = self.env.reset(), 0, 0

            if (epoch % self.save_freq == 0) or (epoch == self.epochs - 1):
                self.logger.save_state({"env": self.env}, None)

            self.update()

            # Log info about epoch
            self.logger.log_tabular('Epoch', epoch)
            self.logger.log_tabular('EpRet', with_min_and_max=True)
            self.logger.log_tabular('EpLen', average_only=True)
            self.logger.log_tabular('VVals', with_min_and_max=True)
            self.logger.log_tabular('TotalEnvInteracts',
                                    (epoch + 1) * self.steps_per_epoch)
            self.logger.log_tabular('LossPi', average_only=True)
            self.logger.log_tabular('LossV', average_only=True)
            self.logger.log_tabular('DeltaLossPi', average_only=True)
            self.logger.log_tabular('DeltaLossV', average_only=True)
            self.logger.log_tabular('Entropy', average_only=True)
            self.logger.log_tabular('KL', average_only=True)
            self.logger.log_tabular('Time', time.time() - start_time)
            self.logger.dump_tabular()
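
The ``Buffer`` used here is not shown; ``finish_path(v)`` bootstraps a cut-off trajectory with a value estimate. A sketch of what such a method typically computes with GAE-Lambda, using the standard discounted-cumsum trick (an assumption about the buffer's internals, not its actual code):

import numpy as np
import scipy.signal


def discount_cumsum(x, discount):
    # Returns [x0 + d*x1 + d^2*x2 + ..., x1 + d*x2 + ..., ...].
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]


def sketch_finish_path(rews, vals, last_val, gamma=0.99, lam=0.97):
    # Append the bootstrap value, compute TD residuals, then GAE advantages
    # and discounted returns for the finished trajectory slice.
    rews = np.append(np.asarray(rews, dtype=np.float32), last_val)
    vals = np.append(np.asarray(vals, dtype=np.float32), last_val)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]
    adv = discount_cumsum(deltas, gamma * lam)
    ret = discount_cumsum(rews, gamma)[:-1]
    return adv, ret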
コード例 #9
0
def sqn(env_fn,
        env_init,
        ego_agent,
        opp_agent,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-2,
        alpha=0.2,
        batch_size=100,
        start_steps=10000,
        update_after=4000,
        update_every=1,
        num_test_episodes=10,
        max_ep_len=4000,
        logger_kwargs=dict(),
        save_freq=1,
        lr_period=0.7):
    """
    Soft Q-Network, based on SAC and clipped Double Q-learning


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            ``act``, ``q1``, and ``q2`` should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

            Calling ``pi`` should return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                           | actions in ``a``. Importantly: gradients
                                           | should be able to flow back into ``a``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to 
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, alpha,
                      **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    if isinstance(env.action_space, Box):
        a_dim = act_dim
    elif isinstance(env.action_space, Discrete):
        a_dim = 1

    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=a_dim,
                                 size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t q1: %d, \t q2: %d\n' % var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            v1 = ac.q1.values(o2)
            v2 = ac.q2.values(o2)
            a2, logp_a2 = ac.pi(v1 + v2, action_mask=ego_agent.aval_paths)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            #Unsqueeze adds another dim, necessary to be column vectors
            backup = r.unsqueeze(1) + gamma * (1 - d).unsqueeze(1) * (
                q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy())

        return loss_q, q_info

    q_optimizer = Adam(q_params, lr=lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, lr_iter):
        # Update learning rate with cosine schedule
        lr = np.clip(
            0.005 * np.cos(np.pi * lr_iter / (total_steps * lr_period)) +
            0.00501, 1e-5, 1e-2)  # clip to [1e-5, 1e-2]; np.clip takes (min, max)
        q_optimizer.param_groups[0]['lr'] = lr
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would create new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, action_mask, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32), action_mask,
                      deterministic)

    def test_agent():
        for j in range(num_test_episodes):
            d, ep_ret, ep_len = False, 0, 0
            init_positions = np.random.randint(0, 2)  # 0 or 1; random_integers is deprecated
            o = test_env.reset({
                'x': env_init['initial_x'][init_positions],
                'y': env_init['initial_y'],
                'theta': env_init['initial_theta']
            })
            #Convert o to RL obs
            RLobs = ego_agent.process_obs(o)

            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                a = get_action(RLobs,
                               action_mask=ego_agent.aval_paths,
                               deterministic=True)

                #RL action to drive control actions
                ego_speed, ego_steer, a = ego_agent.plan(o, a)
                #Opponent decision
                opp_speed, opp_steer = opp_agent.plan(o)

                action = {
                    'ego_idx': 0,
                    'speed': [ego_speed, opp_speed],
                    'steer': [ego_steer, opp_steer]
                }

                o, r, d, _ = test_env.step(action)
                #Convert o to RL obs
                RLobs = ego_agent.process_obs(o)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    init_positions = np.random.randint(0, 2)
    o, ep_ret, ep_len = env.reset({
        'x': env_init['initial_x'][init_positions],
        'y': env_init['initial_y'],
        'theta': env_init['initial_theta']
    }), 0, 0
    #Convert o to RL obs
    RLobs = ego_agent.process_obs(o)

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(RLobs,
                           action_mask=ego_agent.aval_paths,
                           deterministic=False)
        else:
            try:
                a = random.choice(tuple(ego_agent.aval_paths))
            except IndexError:  # happens when there are no paths available
                a = 15

        #RL action to drive control actions
        ego_speed, ego_steer, a = ego_agent.plan(o, a)
        #Opponent decision
        opp_speed, opp_steer = opp_agent.plan(o)
        action = {
            'ego_idx': 0,
            'speed': [ego_speed, opp_speed],
            'steer': [ego_steer, opp_steer]
        }

        # Step the env
        o2, r, d, _ = env.step(action)
        ep_ret += r
        ep_len += 1

        #Convert o2 to RLobs2
        RLobs2 = ego_agent.process_obs(o2)

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        # replay_buffer.store(o, a, r, o2, d)
        replay_buffer.store(RLobs, a, r, RLobs2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        RLobs = RLobs2
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            init_positions = np.random.randint(0, 2)
            o, ep_ret, ep_len = env.reset({
                'x': env_init['initial_x'][init_positions],
                'y': env_init['initial_y'],
                'theta': env_init['initial_theta']
            }), 0, 0
            #Convert o to RL obs
            RLobs = ego_agent.process_obs(o)

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                #Cosine learning rate schedule
                if t < total_steps * (1 - lr_period):
                    lr_iter = 0
                else:
                    lr_iter = t - total_steps * (1 - lr_period)

                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch, lr_iter=lr_iter)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                if epoch == epochs:
                    logger.save_state({'env': env}, None)
                else:
                    #SpinningUp saving style
                    logger.save_state({'env': env}, epoch)
                    #Standard pytorch way of saving
                    fpath = logger_kwargs['output_dir'] + '/state_dict/'
                    os.makedirs(fpath, exist_ok=True)
                    torch.save(ac.state_dict(), fpath + 'model%d.pt' % epoch)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
Code example #10
def dqn(env,
        actor_critic=MLPCritic,
        replay_size=500,
        seed=0,
        steps_per_epoch=3000,
        epochs=5,
        gamma=0.99,
        lr=0.00025,
        batch_size=32,
        start_steps=100,
        update_after=50,
        update_every=5,
        epsilon_start=1.0,
        epsilon_end=0.1,
        epsilon_decay_steps=1e6,
        target_update_every=1000,
        num_test_episodes=10,
        max_ep_len=200,
        record_video=False,
        record_video_every=100,
        save_freq=50,
        wandb_model_name=None,
        wandb_restore_run_path=None):
    """
    Args:
        env : An environment that satisfies the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act``
            method and a ``q`` module.
            The ``act`` method module should accept batches of
            observations as inputs, and ``q`` should accept a batch
            of observations and a batch of actions as inputs. When called,
            ``act`` and ``q`` should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``q``        (batch,)          | Tensor containing the current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        Note: the total number of training steps / frames is
            ``steps_per_epoch * epochs`` (this should be greater than update_after!)

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        lr (float): Learning rate.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates.

        epsilon_start (float): Chance to sample a random action when taking an action.
            Epsilon is decayed over time and this is the start value

        epsilon_end (float): The final minimum value of epsilon after decaying is done.

        epsilon_decay_steps (int): Number of steps over which to linearly decrement from
            epsilon_start to epsilon_end.

        target_update_every (int): Number of steps between updating target network
            parameters, i.e. resetting Q_hat to Q.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout. (Imposed by
           the environment.)

        record_video (bool): Record a video

        record_video_every (int): Record a video every N episodes

        save_freq (int): How often (in terms of gap between epochs) to save
            the current model (value function).

        wandb_model_name (str): (optional) if not None, use a pretrained serialized torch
            model stored in wandb

        wandb_restore_run_path (str): (optional) if wandb_model_name is specified, then
            this should specify path e.g. '$USER_NAME/$PROJECT_NAME/$RUN_ID'
    """
    logger_out_dir = wandb.run.dir
    logger = EpochLogger(exp_name='dqn', output_dir=logger_out_dir)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.n  # assumes Discrete space

    if wandb_model_name is not None:
        # note, we can't use load_state_dict.  The entire model
        # was serialized by EpochLogger's save_state rather
        # than just its weights
        assert wandb_restore_run_path is not None
        ac = torch.load(wandb.restore(wandb_model_name,
                                      run_path=wandb_restore_run_path).name,
                        map_location=device)
    else:
        # Create critic module and network
        ac = actor_critic(env.observation_space, env.action_space)

    # Set target Q-network parameters theta_tar = theta
    target_q_network = deepcopy(ac.q)

    if torch.cuda.device_count() > 1 and wandb_model_name is None:
        # hack: last post in this thread https://discuss.pytorch.org/t/bug-in-dataparallel-only-works-if-the-dataset-device-is-cuda-0/28634/24
        # advises skipping DataParallel on a pretrained model
        ac.q = nn.DataParallel(ac.q)
        target_q_network = nn.DataParallel(target_q_network)

    ac.to(device)
    target_q_network.to(device)

    # Freeze target network w.r.t. optimizers
    for p in target_q_network.parameters():
        p.requires_grad = False

    # function to compute Q-loss
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        # Pick out q-values associated with / indexed by the action that was taken
        # for that observation: https://pytorch.org/docs/stable/torch.html#torch.gather.
        # Note index must be of type LongTensor.
        q = torch.gather(ac.q(o), dim=1, index=a.view(-1, 1).long())
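        # e.g. for a batch of size B, q has shape (B, 1) with q[i, 0] == ac.q(o)[i, a[i]].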

        # Bellman backup for Q function
        with torch.no_grad():
            # Targets come from frozen target Q-network
            q_target = torch.max(target_q_network(o2), dim=1).values
            backup = r + (1 - d) * gamma * q_target

        # Huber (smooth L1) loss against the Bellman backup (MSE alternative below)
        # loss_q = ((q - backup)**2).mean()
        loss_q = F.smooth_l1_loss(q[:, 0], backup)

        # Useful info for logging
        loss_info = dict(QVals=q.detach().cpu().numpy())

        return loss_q, loss_info

    # Set up optimizer for Q-function
    q_optimizer = torch.optim.Adam(ac.q.parameters(), lr=lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    # function to update parameters in Q
    def update(data):
        q_optimizer.zero_grad()
        loss, loss_info = compute_loss_q(data)
        loss.backward()
        q_optimizer.step()

        logger.store(LossQ=loss.item(), **loss_info)

    def get_action(o, epsilon):
        # greedy epsilon strategy
        if np.random.sample() < epsilon:
            a = env.action_space.sample()
        else:
            a = ac.act(torch.as_tensor(o, dtype=torch.float32, device=device))
        return a

    # main loop: collect experience in env

    # Initialize experience replay buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    epsilon_decrement = (epsilon_start - epsilon_end) / epsilon_decay_steps
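    # e.g. with epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=1e6,
    # epsilon shrinks by 9e-7 per step and reaches its floor after ~1e6 decay steps.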
    epsilon = epsilon_start
    o, ep_ret, ep_len = env.reset(), 0, 0

    for t in range(total_steps):

        if t > start_steps and epsilon > epsilon_end:
            # linearly reduce epsilon
            epsilon -= epsilon_decrement

        if t > start_steps:
            # epsilon greedy
            a = get_action(o, epsilon)
        else:
            # randomly sample for better exploration before start_steps
            a = env.action_space.sample()

        # Step the env
        o2, r, d, info = env.step(a)
        # TODO: clip rewards b/w -1 and 1
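        # (one common choice, as in the original DQN paper: r = float(np.clip(r, -1, 1)))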
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store transition to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Update the most recent observation.
        o = o2

        # End of episode handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)

            # End of multi-life game handling
            lives = info.get('ale.lives')
            if lives is not None and lives == 0:
                # Assumes env has been wrapped by Monitor.
                logger.store(RawRet=env.get_episode_rewards()[-1])
                logger.store(RawLen=env.get_episode_lengths()[-1])

            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t > update_after and t % update_every == 0:
            minibatch = replay_buffer.sample_batch(batch_size)
            update(data=minibatch)

        # Refresh target Q network
        if t % target_update_every == 0:
            target_q_network.load_state_dict(ac.q.state_dict())
            for p in target_q_network.parameters():
                p.requires_grad = False

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0 and (t + 1) > start_steps and (
                t + 1) > update_after:
            epoch = (t + 1) // steps_per_epoch

            print(f"epsilon: {epsilon}")

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state(
                    {'env': env},
                    None)  # note, this includes full model pickle

                # Save the model parameters to wandb every save_freq epoch
                # instead of waiting till the end
                state = {
                    'epoch': epoch,
                    'ac_state_dict': ac.state_dict(),
                    'ac_q_state_dict':
                    ac.q.state_dict(),  # not sure this is necessary
                    'q_optimizer': q_optimizer.state_dict(),
                    'q_loss': logger.epoch_dict['LossQ'][-1],
                }
                # hack for wandb: should output the model in the wandb.run.dir to avoid
                # problems syncing the model in the cloud with wandb's files
                state_fname = os.path.join(wandb.run.dir, "state_dict.pt")
                torch.save(state, state_fname)
                wandb.save(state_fname)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Time', time.time() - start_time)
            logger.log_tabular('Epsilon', epsilon)
            if hasattr(env, 'episode_id'):
                logger.log_tabular('EpisodeId', env.episode_id)

            # If a quantity has not been calculated/stored yet, do not log it.  This can
            # happen, e.g. if NN update length or episode length exceeds num steps in epoch.
            to_log = [{
                'key': 'QVals',
                'with_min_and_max': True
            }, {
                'key': 'LossQ',
                'average_only': True
            }, {
                'key': 'EpRet',
                'with_min_and_max': True
            }, {
                'key': 'EpLen',
                'average_only': True
            }, {
                'key': 'RawRet',
                'with_min_and_max': True
            }, {
                'key': 'RawLen',
                'average_only': True
            }]

            for log_tabular_kwargs in to_log:
                key = log_tabular_kwargs['key']
                if key in logger.epoch_dict and len(
                        logger.epoch_dict[key]) > 0:
                    logger.log_tabular(**log_tabular_kwargs)

            wandb.log(logger.log_current_row, step=epoch)
            logger.dump_tabular()

    env.close()
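Every off-policy example in this listing (the SAC-style racing agent and the DQN, EGL and DDPG examples) calls into a ReplayBuffer class that is defined elsewhere in each project. As a reference, here is a minimal sketch of the interface those calls assume: store(o, a, r, o2, d) plus sample_batch(batch_size) returning a dict of torch tensors keyed 'obs', 'obs2', 'act', 'rew' and 'done'. The implementation below is illustrative only (some of the projects also pass a device argument); it is not code taken from any of the repositories above.

import numpy as np
import torch


class ReplayBuffer:
    """Minimal fixed-size FIFO experience replay buffer for off-policy agents."""

    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros((size, *np.atleast_1d(obs_dim)), dtype=np.float32)
        self.obs2_buf = np.zeros((size, *np.atleast_1d(obs_dim)), dtype=np.float32)
        self.act_buf = np.zeros((size, *np.atleast_1d(act_dim)), dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # Ring buffer: overwrite the oldest transition once full.
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        # Uniform sampling over the transitions stored so far.
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs],
                     rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        return {k: torch.as_tensor(v, dtype=torch.float32) for k, v in batch.items()}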
Code example #11
def my_vpg(env_fn, seed=0, steps_per_epoch=4000, epochs=50, max_ep_len=1000,
        hidden_sizes=[32], lr=1e-2,
        logger_kwargs=dict(), save_freq=10):
    """
    My VPG implementation

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

    """

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    print("env.observation_space", env.observation_space)
    print("env.observation_space.shape", env.observation_space.shape)
    print("env.action_space", env.action_space)

    # Prepare for interaction with environment
    start_time = time.time()

    # Instantiate policy
    if isinstance(env.action_space, gym.spaces.Box):
        policy = GaussianPolicy(env.action_space, env.observation_space, hidden_sizes)
    elif isinstance(env.action_space, gym.spaces.Discrete):
        policy = CategoricalPolicy(env.action_space, env.observation_space, hidden_sizes)
    policy_optimizer = torch.optim.Adam(policy.actor_net.parameters(), lr=lr)

    # value_net = mlp(sizes = [obs_dim] + hidden_sizes + [1])
    # value_optimizer = torch.optim.Adam(value_net.parameters(), lr=lr)
    # print("value_net")
    # print(value_net)
    # def get_value(o):
    #     return value_net(torch.as_tensor(o, dtype=torch.float32))

    # Set up model saving
    logger.setup_pytorch_saver(policy)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        o, ep_ret, ep_len = env.reset(), 0, 0

        buffer = Buffer()
        for t in range(steps_per_epoch):
            a = policy.act(torch.as_tensor(o, dtype=torch.float32).unsqueeze(0))
            a = a.numpy()[0] # Remove batch dimension

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            buffer.append(o, a, r, next_o)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t==steps_per_epoch-1

            if terminal or epoch_ended:
                if epoch_ended and not(terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                buffer.fill_episode_returns(ep_ret)
                o, ep_ret, ep_len = env.reset(), 0, 0
        
        # Update
        o, a, r, next_o, R = buffer.get()

        # baseline = get_value(o)
        # R = r + get_value(next_o)
        # advantage = R - baseline

        # # Value function update
        # value_optimizer.zero_grad()
        # criterion = torch.nn.MSELoss()
        # value_loss = criterion(R, baseline)
        # value_loss.backward()
        # value_optimizer.step()

        # Policy function update
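        # REINFORCE estimator: the gradient of -(log pi(a|s) * R).mean() is
        # -E[grad log pi(a|s) * R], here using the full episode return R for
        # every step and no baseline.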
        policy_optimizer.zero_grad()
        logp_a = policy.get_logp(o, a)
        policy_loss = -(logp_a * R).mean()
        policy_loss.backward()
        policy_optimizer.step()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
        logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs-1):
            logger.save_state({'env': env}, None)
Code example #12
File: dqn.py Project: LiuShuai26/spinningup
def dqn(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99,
        polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000,
        update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000,
        logger_kwargs=dict(), save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.n

    print(obs_dim, act_dim)

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t q1: %d, \t q2: %d\n' % var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q1 = ac.q1(o)
        q2 = ac.q2(o)

        q1_a = q1.gather(1, a.long()).squeeze(-1)
        q2_a = q2.gather(1, a.long()).squeeze(-1)

        # Bellman backup for Q functions
        with torch.no_grad():

            # Target Q-values
            q1_targ = torch.max(ac_targ.q1(o2), dim=1)[0]
            q2_targ = torch.max(ac_targ.q2(o2), dim=1)[0]
            q_targ = torch.min(q1_targ, q2_targ)
            backup = r + gamma * (1 - d) * q_targ
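            # i.e. y = r + gamma * (1 - d) * min_i max_a' Q_i_targ(s', a'),
            # a clipped double-Q target for discrete actions.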

        # MSE loss against Bellman backup
        loss_q1 = ((q1_a - backup) ** 2).mean()
        loss_q2 = ((q2_a - backup) ** 2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().numpy(),
                      Q2Vals=q2.detach().numpy())

        return loss_q, q_info

    # Set up optimizers for policy and q-function
    q_optimizer = Adam(q_params, lr=lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32),
                      deterministic)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # act epsilon-greedily (5% random actions) with the learned policy.
        if t > start_steps and np.random.random() > 0.05:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
Code example #13
File: dqn.py Project: dennischenfeng/spinningup
def dqn(env_fn, actor_critic=ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100,
        replay_size=int(1e5), batch_size=100, gamma=0.99, q_lr=1e-4, start_steps=10000,
        update_after=1000, update_targ_every=50, num_test_episodes=10,
        max_ep_len=1000, epsilon=0.01, epsilon_decay=0.99995, logger_kwargs=dict(), writer_kwargs=dict(), save_freq=1):
    """
    DQN (Deep Q-Networks). Reproduces the original paper from Mnih et al.
    """
    # Instantiate env
    env = env_fn()
    test_env = env_fn()
    # TODO: might have to assert discrete, or otherwise take only first index of shape or so
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Set up actor (pi) & critic (Q), and data buffer
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    q_targ = copy.deepcopy(ac.q)
    for p in q_targ.parameters():
        p.requires_grad = False
    q_optimizer = torch.optim.Adam(ac.q.parameters(), lr=q_lr)

    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Set RNG seeds
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    env.action_space.seed(seed)

    # Set up logging
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    logger.setup_pytorch_saver(ac)
    writer = SummaryWriter(**writer_kwargs)

    start_time = time.time()
    total_steps = epochs * steps_per_epoch
    o = env.reset()
    op = preprocess_obs(o)  # "op" = "observation_preprocessed"
    ep_return = 0  # episode return, counter
    ep_length = 0  # episode length, counter
    for step in range(total_steps):
        # Take an env step, then store data in replay buffer
        if step > start_steps:
            ac.pi.epsilon = max(epsilon, epsilon_decay**step)
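            # e.g. with epsilon=0.01 and epsilon_decay=0.99995, the decayed term
            # falls below the 0.01 floor after ln(0.01)/ln(0.99995) ~= 92,000 steps.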
            a = ac.act(torch.as_tensor(op, dtype=torch.float32))
        else:
            a = env.action_space.sample()
        o2, r, d, _ = env.step(a)
        o2p = preprocess_obs(o2)
        replay_buffer.store(op, a, r, o2p, d)

        # TODO: does DQN paper say to do 1 GD update with mean of minibatch, or many 1-data-point updates?
        # Sample a random batch from replay buffer and perform one GD step
        q_optimizer.zero_grad()
        batch_data = replay_buffer.sample_batch(batch_size)
        loss_q = compute_loss_q(batch_data, ac, q_targ, gamma)
        loss_q.backward()
        q_optimizer.step()

        # Update target network every so often
        if (step % update_targ_every == 0) and (step >= update_after):
            q_targ = copy.deepcopy(ac.q)
            for p in q_targ.parameters():
                p.requires_grad = False

        # Keep track of episode return and length (for logging purposes)
        ep_return += r
        ep_length += 1

        # If episode done, reset env
        if d:
            o = env.reset()
            op = preprocess_obs(o)
            logger.store(EpRet=ep_return, EpLen=ep_length)
            ep_return = 0
            ep_length = 0
        else:
            op = o2p

        # TODO: confirm: no need for test set if test agent & env are same as training agent & env (e.g. would need
        #  test set if algo added noise to training but not test
        # If epoch end, then do a test to see average return thus far
        if step % steps_per_epoch == steps_per_epoch - 1:
            for ep_i in range(num_test_episodes):
                # turn off epsilon exploration:
                old_epsilon = ac.pi.epsilon
                ac.pi.epsilon = 0

                test_ep_return, test_ep_length = run_test_episode(test_env, ac)
                logger.store(TestEpRet=test_ep_return, TestEpLen=test_ep_length)

                # turn it back on
                ac.pi.epsilon = old_epsilon

        # If epoch end, save models and show logged data
        if step % steps_per_epoch == steps_per_epoch - 1:
            epoch_i = int(step // steps_per_epoch)

            writer.add_scalar("EpRet_mean", logger.get_stats("EpRet")[0], epoch_i)  # first item in `get_stats` is mean
            writer.add_scalar("EpRet_std", logger.get_stats("EpRet")[1], epoch_i)  # 2nd item in `get_stats` is std
            writer.add_scalar("TestEpRet_mean", logger.get_stats("TestEpRet")[0], epoch_i)
            writer.add_scalar("TestEpRet_std", logger.get_stats("TestEpRet")[1], epoch_i)
            writer.add_scalar("epsilon", ac.pi.epsilon, epoch_i)

            logger.save_state({'env': env}, None)  # saves both ac and env
            logger.log_tabular("Epoch", epoch_i)
            logger.log_tabular("EpRet", with_min_and_max=True)
            logger.log_tabular("EpLen", average_only=True)
            logger.log_tabular("TestEpRet", with_min_and_max=True)
            logger.log_tabular("TestEpLen", average_only=True)
            logger.log_tabular("TimeFromStart", time.time() - start_time)
            logger.dump_tabular()

    # Save model at end
    logger.save_state({'env': env}, None)
    writer.close()
Code example #14
def cegl(env_fn, ac_kwargs=dict(), seed=0,
         steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99,
         polyak=0.995, lr=1e-3, alpha=0.2, batch_size=256, start_steps=10000,
         update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000,
         logger_kwargs=dict(), save_freq=1, eps=0.4, n_explore=32, update_factor=1, device='cuda',
         architecture='mlp', sample='on_policy', method='egl'):
    if architecture == 'mlp':
        actor_critic = core.MLPActorCritic
    elif architecture == 'spline':
        actor_critic = core.SplineActorCritic
    else:
        raise NotImplementedError

    device = torch.device(device)
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, device=device)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2, ac.geps])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t geps: %d\n' % var_counts)

    n_samples = 100
    cmin = 0.25
    cmax = 1.75
    greed = 0.01
    rand = 0.01

    def max_reroute(o):

        b, _ = o.shape
        o = repeat_and_reshape(o, n_samples)
        with torch.no_grad():
            ai, _ = ac.pi(o)

            q1 = ac.q1(o, ai)
            q2 = ac.q2(o, ai)
            qi = torch.min(q1, q2).unsqueeze(-1)

        qi = qi.view(n_samples, b, 1)
        ai = ai.view(n_samples, b, act_dim)
        rank = torch.argsort(torch.argsort(qi, dim=0, descending=True), dim=0, descending=False)
        w = cmin * torch.ones_like(ai)
        m = int((1 - cmin) * n_samples / (cmax - cmin))

        w += (cmax - cmin) * (rank < m).float()
        w += ((1 - cmin) * n_samples - m * (cmax - cmin)) * (rank == m).float()
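        # By construction the unnormalized weights over the n_samples candidates sum
        # to n_samples: every candidate starts at cmin, the top-m are bumped to cmax,
        # and the rank-m candidate absorbs the remainder, so the normalization below
        # yields a categorical distribution biased toward high-Q candidates.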

        w -= greed
        w += greed * n_samples * (rank == 0).float()

        w = w * (1 - rand) + rand

        w = w / w.sum(dim=0, keepdim=True)

        prob = torch.distributions.Categorical(probs=w.permute(1, 2, 0))

        a = torch.gather(ai.permute(1, 2, 0), 2, prob.sample().unsqueeze(2)).squeeze(2)

        return a, (ai, w.mean(-1))

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup) ** 2).mean()
        loss_q2 = ((q2 - backup) ** 2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                      Q2Vals=q2.detach().cpu().numpy())

        return loss_q, q_info

    # Set up function for computing EGL mean-gradient-losses
    def compute_loss_g(data):

        o = data['obs']

        # Bellman backup for Q functions
        with torch.no_grad():
            a1, _ = ac.pi(o)

            std = ac.pi.distribution.scale
            std = torch.clamp_max(std, max=eps)

            a2 = explore(a1, n_explore, std)

            a2 = a2.reshape(n_explore * len(o), act_dim)
            o_expand = repeat_and_reshape(o, n_explore)

            # q_dither = ac.q1(o_expand, a2)
            # q_anchor = ac.q1(o, a1)

            q1_dither = ac.q1(o_expand, a2)
            q2_dither = ac.q2(o_expand, a2)

            q_dither = torch.min(q1_dither, q2_dither)

            q1_anchor = ac.q1(o, a1)
            q2_anchor = ac.q2(o, a1)

            q_anchor = torch.min(q1_anchor, q2_anchor)

            q_anchor = repeat_and_reshape(q_anchor, n_explore).squeeze(-1)

        geps = ac.geps.forward_tag(o, a1)
        geps = repeat_and_reshape(geps, n_explore)
        a1 = repeat_and_reshape(a1, n_explore)

        geps = (geps * (a2 - a1)).sum(-1)
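        # First-order (Taylor) target: geps(s, a1) . (a2 - a1) is trained to match the
        # observed difference Q(s, a2) - Q(s, a1), so geps learns a smoothed estimate
        # of the gradient of Q with respect to the action (the EGL mean-gradient).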
        # Huber (smooth L1) loss between the directional estimate and the observed Q differences
        loss_g = F.smooth_l1_loss(geps, (q_dither - q_anchor), reduction='mean') * n_explore / eps / act_dim

        # delta = torch.norm(a2 - a1, dim=-1)
        # n = (a2 - a1) / delta.unsqueeze(1)
        # target = torch.clamp((q_dither - q_anchor) / delta, min=-100, max=100)
        # geps = (geps * n).sum(-1)
        # # mse loss against Bellman backup
        # loss_g = F.smooth_l1_loss(geps, target, reduction='mean')

        # Useful info for logging
        g_info = dict(GVals=geps.flatten().detach().cpu().numpy(), GEps=std.flatten().detach().cpu().numpy())

        # return loss_g, g_info, {'delta': delta, 'n': n, 'a1': a1, 'a2': a2, 'target': target,
        #                         'geps': geps, 'q_dither': q_dither, 'q_anchor': q_anchor}

        return loss_g, g_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)

        geps_pi = ac.geps.forward_tag(o, pi, no_grad=True)

        if method == 'egl':
            # Entropy-regularized policy loss
            loss_pi = (alpha * logp_pi - (geps_pi * pi).sum(-1)).mean()
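            # EGL surrogate: the learned mean-gradient geps stands in for dQ/da,
            # so the actor is pushed along the estimated Q-gradient instead of
            # backpropagating through the Q critics ('sac' below is the standard
            # entropy-regularized policy loss).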

        elif method == 'sac':
            q1_pi = ac.q1(o, pi)
            q2_pi = ac.q2(o, pi)
            q_pi = torch.min(q1_pi, q2_pi)

            # Entropy-regularized policy loss
            loss_pi = (alpha * logp_pi - q_pi).mean()

        else:
            raise NotImplementedError

        beta = autograd.Variable(pi.detach().clone(), requires_grad=True)
        q1_pi = ac.q1(o, beta)
        q2_pi = ac.q2(o, beta)
        qa = torch.min(q1_pi, q2_pi).unsqueeze(-1)

        grad_q = autograd.grad(outputs=qa, inputs=beta, grad_outputs=torch.cuda.FloatTensor(qa.size()).fill_(1.),
                               create_graph=False, retain_graph=False, only_inputs=True)[0]

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().cpu().numpy(),
                       GradGAmp=torch.norm(geps_pi, dim=-1).detach().cpu().numpy(),
                       GradQAmp=torch.norm(grad_q, dim=-1).detach().cpu().numpy(),
                       GradDelta=torch.norm(geps_pi - grad_q, dim=-1).detach().cpu().numpy(),
                       GradSim=F.cosine_similarity(geps_pi, grad_q, dim=-1).detach().cpu().numpy(),
                       ActionsNorm=torch.norm(pi, dim=-1).detach().cpu().numpy(),
                       ActionsAbs=torch.abs(pi).flatten().detach().cpu().numpy(), )

        return loss_pi, pi_info

    if architecture == 'mlp':
        # Set up optimizers for policy and q-function
        pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
        q_optimizer = Adam(q_params, lr=lr)
        g_optimizer = Adam(ac.geps.parameters(), lr=lr)
    elif architecture == 'spline':
        # Set up optimizers for policy and q-function
        pi_optimizer = SparseDenseAdamOptimizer(ac.pi, dense_args={'lr': lr}, sparse_args={'lr': 10 * lr})
        q_optimizer = SparseDenseAdamOptimizer([ac.q1, ac.q2], dense_args={'lr': lr}, sparse_args={'lr': 10 * lr})
        g_optimizer = SparseDenseAdamOptimizer(ac.geps, dense_args={'lr': lr}, sparse_args={'lr': 10 * lr})
    else:
        raise NotImplementedError

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):

        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()

        # if any([torch.isnan(p.grad).any() for p in ac.parameters() if p.grad is not None]):
        #     print('nan')

        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Next run one gradient descent step for the mean-gradient
        g_optimizer.zero_grad()
        # loss_g, g_info, g_dict = compute_loss_g(data)
        loss_g, g_info = compute_loss_g(data)
        loss_g.backward()

        # if any([torch.isnan(p.grad).any() for p in ac.parameters() if p.grad is not None]):
        #     # print('nan')
        #     print(len(g_dict))

        g_optimizer.step()

        # Record things
        logger.store(LossG=loss_g.item(), **g_info)

        # Freeze the mean-gradient network (geps) so you don't waste computational
        # effort computing gradients for it during the policy learning step.
        for p in ac.geps.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()

        # if any([torch.isnan(p.grad).any() for p in ac.parameters() if p.grad is not None]):
        #     print('nan')

        pi_optimizer.step()

        # Unfreeze the mean-gradient network (geps) so you can optimize it at the next step.
        for p in ac.geps.parameters():
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action_on_policy(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device), deterministic)

    def get_action_rbi(o, deterministic=False):
        o = torch.as_tensor(o, dtype=torch.float32, device=device)
        if deterministic:
            a = ac.act(o, deterministic)
        else:
            o = o.unsqueeze(0)
            a, _ = max_reroute(o)
            a = a.flatten().cpu().numpy()
        return a

    if sample == 'on_policy':
        get_action = get_action_on_policy
    elif sample == 'rbi':
        get_action = get_action_rbi
    else:
        raise NotImplementedError

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time 
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in tqdm(range(total_steps)):

        # eps = math.exp(math.log(1.) + (math.log(0.03) - math.log(1.)) * math.sin(2 * math.pi * t / 200e3))
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards, 
        # use the learned policy. 
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every * update_factor):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)

            logger.log_tabular('GVals', with_min_and_max=True)
            logger.log_tabular('LossG', with_min_and_max=True)
            logger.log_tabular('GradGAmp', with_min_and_max=True)
            logger.log_tabular('GradQAmp', with_min_and_max=True)
            logger.log_tabular('GradDelta', with_min_and_max=True)
            logger.log_tabular('GradSim', with_min_and_max=True)
            logger.log_tabular('GEps', with_min_and_max=True)
            logger.log_tabular('ActionsNorm', with_min_and_max=True)
            logger.log_tabular('ActionsAbs', with_min_and_max=True)

            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
Code example #15
File: ddpg.py Project: LinghengMeng/lstm_td3
def ddpg(env_name,
         partially_observable=False,
         pomdp_type='remove_velocity',
         flicker_prob=0.2,
         random_noise_sigma=0.1,
         random_sensor_missing_prob=0.1,
         actor_critic=core.MLPActorCritic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=100,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         pi_lr=1e-3,
         q_lr=1e-3,
         batch_size=100,
         start_steps=10000,
         update_after=1000,
         update_every=50,
         act_noise=0.1,
         num_test_episodes=10,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1):
    """
    Deep Deterministic Policy Gradient (DDPG)


    Args:
        env_name : Name of the gym environment to instantiate. The environment
            must satisfy the OpenAI Gym API.

        partially_observable (bool): Whether to wrap the environment with
            POMDPWrapper to make it partially observable (see pomdp_type and
            the noise-related parameters below).

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, and a ``q`` module. The ``act`` method and
            ``pi`` module should accept batches of observations as inputs,
            and ``q`` should accept a batch of observations and a batch of 
            actions as inputs. When called, these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q``        (batch,)          | Tensor containing the current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    # Wrapper environment if using POMDP
    if partially_observable:
        env = POMDPWrapper(env_name, pomdp_type, flicker_prob,
                           random_noise_sigma, random_sensor_missing_prob)
        test_env = POMDPWrapper(env_name, pomdp_type, flicker_prob,
                                random_noise_sigma, random_sensor_missing_prob)
    else:
        env, test_env = gym.make(env_name), gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    ac.to(device)
    ac_targ.to(device)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q])
    logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' % var_counts)

    # Set up function for computing DDPG Q-loss
    def compute_loss_q(data, batch_hist, t):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        # batch_hist['pred_q_hist']
        # batch_hist['targ_q_hist']
        # batch_hist['targ_next_q_hist']
        # batch_hist['sampled_time_hist']

        q = ac.q(o, a)

        # Bellman backup for Q function
        with torch.no_grad():
            q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2))
            # if t < 50000:
            # Average over historically predicted q-values
            window_size = 10
            mean_targ_next_q_hist = []
            tuned_indicator = np.zeros(q_pi_targ.shape)
            batch_change_rate = []
            for i in range(len(batch_hist['targ_next_q_hist'])):
                tmp_batch_hist = np.asarray(batch_hist['targ_next_q_hist'][i])
                tmp_batch_hist = np.append(
                    tmp_batch_hist, q_pi_targ[i].item())  # add new prediction
                change_rate = tmp_batch_hist[1:] - tmp_batch_hist[:-1]

                if len(tmp_batch_hist) == 1:
                    batch_change_rate.append(None)
                else:
                    batch_change_rate.append(change_rate[-1])

            batch_change_rate = np.asarray(batch_change_rate).astype(float)
            not_nan_idxs = np.argwhere(~np.isnan(batch_change_rate))
            sorted_not_nan_idxs = np.argsort(
                batch_change_rate[not_nan_idxs.flatten()])
            threshold_percentile = 75  # 25, 50, 75
            if len(sorted_not_nan_idxs) != 0:
                threshold = np.percentile(
                    batch_change_rate[not_nan_idxs[sorted_not_nan_idxs]],
                    threshold_percentile)
                if threshold < 0:
                    threshold = 0
            else:
                threshold = 1
            # threshold = 1  # thresold=1 works for HalfCheetahBulletEnv-v0

            # New threshold
            for i in range(len(batch_hist['targ_next_q_hist'])):
                tmp_batch_hist = np.asarray(batch_hist['targ_next_q_hist'][i])
                tmp_batch_hist = np.append(
                    tmp_batch_hist, q_pi_targ[i].item())  # add new prediction
                change_rate = tmp_batch_hist[1:] - tmp_batch_hist[:-1]

                if len(tmp_batch_hist) == 1:
                    avg_window = tmp_batch_hist[-1]
                else:
                    if change_rate[-1] > threshold:
                        avg_window = tmp_batch_hist[-2] + threshold
                        # avg_window = tmp_batch_hist[-2]
                        tuned_indicator[i] = 1
                    else:
                        avg_window = tmp_batch_hist[-1]
                mean_targ_next_q_hist.append(avg_window)
            avg_q_pi_targ = torch.as_tensor(mean_targ_next_q_hist,
                                            dtype=torch.float32).to(device)

            # Bellman backup built from the clamped target-Q estimates
            backup = r + gamma * (1 - d) * avg_q_pi_targ
            # backup = r + gamma * (1 - d) * q_pi_targ  # unclamped variant

        # MSE loss against Bellman backup
        loss_q = ((q - backup)**2).mean()

        # Useful info for logging
        loss_info = dict(QVals=q.cpu().detach().numpy(),
                         TunedNum=tuned_indicator.sum(),
                         THLD=threshold)

        return loss_q, loss_info, q, backup, avg_q_pi_targ, tuned_indicator  # Crucial: log the shaped q_pi_targ values back to the history

    # Set up function for computing DDPG pi loss
    def compute_loss_pi(data):
        o = data['obs']
        q_pi = ac.q(o, ac.pi(o))
        return -q_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(ac.q.parameters(), lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, batch_hist, t):
        # First run one gradient descent step for Q.
        q_optimizer.zero_grad()
        loss_q, loss_info, q, backup, q_pi_targ, tuned_indicator = compute_loss_q(
            data, batch_hist, t)
        loss_q.backward()
        q_optimizer.step()

        # Freeze Q-network so you don't waste computational effort
        # computing gradients for it during the policy learning step.
        for p in ac.q.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-network so you can optimize it at the next DDPG step.
        for p in ac.q.parameters():
            p.requires_grad = True

        # Record things
        logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info)

        # Finally, update target networks by polyak averaging. (Common choice: 0.995)
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use the in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)
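                # Net effect: theta_targ <- polyak * theta_targ + (1 - polyak) * theta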

        return (q.cpu().detach().numpy(), backup.cpu().detach().numpy(),
                q_pi_targ.cpu().detach().numpy(), tuned_indicator)

    def get_action(o, noise_scale):
        a = ac.act(torch.as_tensor(o, dtype=torch.float32).to(device))
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy (with some noise, via act_noise).
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for _ in range(update_every):
                sample_type = 'pseudo_random'  # options: 'pseudo_random' or 'genuine_random'
                batch, batch_hist, batch_idxs = replay_buffer.sample_batch(
                    batch_size, device=device, sample_type=sample_type)
                q, backup, q_pi_targ, tuned_indicator = update(
                    data=batch, batch_hist=batch_hist, t=t)
                replay_buffer.add_sample_hist(batch_idxs, q, backup, q_pi_targ,
                                              tuned_indicator, t)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # # Save model
            # fpath = osp.join(logger.output_dir, 'pyt_save')
            # os.makedirs(fpath, exist_ok=True)
            # context_fname = 'checkpoint-context-' + (
            #     'Step-%d' % t if t is not None else '') + '.pt'
            # context_fname = osp.join(fpath, context_fname)
            # if (epoch % save_freq == 0) or (epoch == epochs):
            #     logger.save_state({'env': env}, None)
            #     torch.save({'replay_buffer': replay_buffer}, context_fname)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('TunedNum', with_min_and_max=True)
            logger.log_tabular('THLD', with_min_and_max=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
Code example #16
def randwalk(env_fn,
             seed=0,
             steps_per_epoch=4000,
             epochs=50,
             max_ep_len=1000,
             logger_kwargs=dict(),
             save_freq=10):
    """
    Random Walk

    (This is simply a uniform random walk!)

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

    """

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    policy = Policy(env.action_space)

    # Set up model saving
    logger.setup_pytorch_saver(policy)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(steps_per_epoch):
            # Pick a random action within the action space
            a = policy.act(o)

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)
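
A minimal usage sketch for the randwalk baseline above, assuming gym is installed and the surrounding module's Policy and EpochLogger are importable; the environment name and output directory are illustrative placeholders, not part of the original example.

if __name__ == '__main__':
    import gym

    # Hypothetical invocation of the random-walk baseline defined above.
    randwalk(lambda: gym.make('Pendulum-v0'),
             epochs=5,
             steps_per_epoch=1000,
             logger_kwargs=dict(output_dir='/tmp/randwalk_demo'))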
Code example #17
File: mbfq.py  Project: eladsar/spinningup
def mbfq(env_fn,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=2000,
         epochs=100,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         lr=1e-3,
         alpha=0.2,
         batch_size=128,
         start_steps=10000,
         update_after=1000,
         update_every=50,
         num_test_episodes=10,
         max_ep_len=1000,
         max_ep_len_ppo=50,
         logger_kwargs=dict(),
         save_freq=1,
         update_factor=1,
         device='cuda',
         lam=0.97,
         steps_per_ppo_update=1000,
         n_ppo_updates=1,
         train_pi_iters=80,
         target_kl=0.01,
         clip_ratio=0.2):

    device = torch.device(device)

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    state_dim = {376: 144, 111: 64, 17: 12, 11: 8}[obs_dim[0]]
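    # Latent state size keyed by observation size; 376 / 111 / 17 / 11 presumably
    # correspond to the standard MuJoCo Humanoid / Ant / HalfCheetah (or Walker2d) /
    # Hopper observation dimensions.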

    # Create actor-critic module and target networks
    ac = core.MLPActorCritic(env.observation_space, env.action_space,
                             **ac_kwargs).to(device)

    model = core.FlowWorldModel(obs_dim[0], state_dim,
                                act_dim + int(act_dim % 2)).to(device)
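    # The action input is padded to an even size (a zero column is appended in
    # compute_loss_model below), presumably so the flow layers can split features
    # evenly in half.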

    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size,
                                 device=device)
    ppo_buffer = PPOBuffer(obs_dim,
                           act_dim,
                           steps_per_ppo_update,
                           gamma=gamma,
                           lam=lam,
                           device=device)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, model])
    logger.log('\nNumber of parameters: \t pi: %d, \t q: %d, \t model: %d\n' %
               var_counts)

    # Set up function for computing the world-model loss
    def compute_loss_model(data):

        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        if act_dim % 2:
            a = torch.cat([a, torch.zeros(len(a), 1, device=a.device)], dim=1)

        loss, info, _ = model(o, a, r, o2, d)

        return loss, info

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                      Q2Vals=q2.detach().cpu().numpy())

        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().cpu().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
    q_optimizer = Adam(q_params, lr=lr)
    model_optimizer = SparseDenseAdamOptimizer(model,
                                               dense_args={'lr': lr},
                                               sparse_args={'lr': 10 * lr})

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):

        loss_model, model_info = compute_loss_model(data)
        model_optimizer.zero_grad()
        loss_model.backward()
        core.clip_grad_norm(model.parameters(), 1000)
        model_optimizer.step()

        # Record things
        logger.store(LossModel=loss_model.item(), **model_info)

        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize them at the next update step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use the in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):

        # s = model.get_state(torch.as_tensor(o, dtype=torch.float32, device=device).unsqueeze(0), batch_size=batch_size).squeeze(0)
        # return ac.act(o, deterministic)
        return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device),
                      deterministic)

    # Set up function for computing PPO policy loss
    def ppo_compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[
            'logp']

        # Policy loss
        ac.pi(obs)
        logp = ac.pi.log_prob(act, desquash=True)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, cf=clipfrac)

        return loss_pi, pi_info

    def ppo_step(o):

        with torch.no_grad():
            o = torch.as_tensor(o, dtype=torch.float32, device=device)
            a, log_pi = ac_targ.pi(o)
            q1_pi = ac.q1(o, a)
            q2_pi = ac.q2(o, a)
            q_pi = torch.min(q1_pi, q2_pi)
            v = (alpha * log_pi - q_pi).squeeze(0).cpu().numpy()

        return a.squeeze(0).cpu().numpy(), v, log_pi.squeeze(0).cpu().numpy()

    def virtual_ppo():

        venv = VirtualEnv(replay_buffer, model)
        ac_targ.pi.load_state_dict(ac.pi.state_dict())

        # Main loop: collect experience in env and update/log each epoch

        for epoch in range(n_ppo_updates):

            o, ep_ret, ep_len = venv.reset(), 0, 0

            for t in tqdm(range(steps_per_ppo_update)):

                a, v, log_pi = ppo_step(o)

                next_o, r, d, _ = venv.step(a)
                ep_ret += r
                ep_len += 1

                # save and log
                ppo_buffer.store(o, a, r, v, log_pi)
                logger.store(VVals=v)

                # Update obs (critical!)
                o = next_o

                timeout = ep_len == max_ep_len_ppo
                terminal = d or timeout
                epoch_ended = t == steps_per_ppo_update - 1

                if terminal or epoch_ended:
                    if epoch_ended and not terminal:
                        print(
                            'Warning: trajectory cut off by epoch at %d steps.'
                            % ep_len,
                            flush=True)
                    # if trajectory didn't reach terminal state, bootstrap value target
                    if timeout or epoch_ended:
                        _, v, _ = ppo_step(o)
                    else:
                        v = 0
                    ppo_buffer.finish_path(v)
                    if terminal:
                        # only save EpRet / EpLen if trajectory finished
                        logger.store(VirtualEpRet=ep_ret, VirtualEpLen=ep_len)
                    o, ep_ret, ep_len = env.reset(), 0, 0

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Perform PPO update!
            data = ppo_buffer.get()

            pi_l_old, pi_info_old = ppo_compute_loss_pi(data)
            pi_l_old = pi_l_old.item()

            # Train policy with multiple steps of gradient descent
            for i in range(train_pi_iters):

                loss_pi, pi_info = ppo_compute_loss_pi(data)
                # kl = mpi_avg(pi_info['kl'])
                kl = pi_info['kl']
                if kl > 1.5 * target_kl:
                    logger.log(
                        'Early stopping at step %d due to reaching max kl.' %
                        i)
                    break

                pi_optimizer.zero_grad()
                loss_pi.backward()
                # mpi_avg_grads(ac.pi)  # average grads across MPI processes
                pi_optimizer.step()

            logger.store(StopIter=i)

            # Log changes from update
            kl, cf = pi_info['kl'], pi_info['cf']
            logger.store(LossPi=pi_l_old,
                         KL=kl,
                         ClipFrac=cf,
                         DeltaLossPi=(loss_pi.item() - pi_l_old))

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in tqdm(range(total_steps)):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every * update_factor):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            virtual_ppo()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('VirtualEpRet', with_min_and_max=True)
            logger.log_tabular('VirtualEpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossModel', average_only=True)
            logger.log_tabular('reg', average_only=True)
            logger.log_tabular('rec', average_only=True)
            logger.log_tabular('loss_d', average_only=True)
            logger.log_tabular('loss_r', average_only=True)
            logger.log_tabular('kl', average_only=True)
            logger.log_tabular('prior_logprob', average_only=True)
            logger.log_tabular('log_det', average_only=True)
            logger.log_tabular('conditional_log_det', average_only=True)
            logger.log_tabular('conditional_logprob', average_only=True)
            logger.log_tabular('KL', average_only=True)
            logger.log_tabular('ClipFrac', average_only=True)
            logger.log_tabular('StopIter', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
Code example #18
File: ppo.py  Project: deepdrive/spinningup
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000,
        target_kl=0.01, logger_kwargs=dict(), save_freq=10, resume=None,
        reinitialize_optimizer_on_resume=True, render=False, notes='',
        env_config=None, boost_explore=0, partial_net_load=False,
        num_inputs_to_add=0, episode_cull_ratio=0, try_rollouts=0,
        steps_per_try_rollout=0, take_worst_rollout=False, shift_advs_pct=0,
        **kwargs):
    """
    Proximal Policy Optimization (by clipping),

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================


        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`.

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        resume (str): Path to directory with simple_save model info
            you wish to resume from

        reinitialize_optimizer_on_resume (bool): Whether to reinitialize
            training state in the optimizers, i.e. the per-weight learning
            rates maintained by Adam.

        render (bool): Whether to render the env during training. Useful for
            checking that resuming training preserves visual performance.

        notes (str): Experimental notes on what this run is testing.

        env_config (dict): Environment configuration passed through to the
            environment.

        boost_explore (float): Amount by which to increase the std of actions
            in order to reinvigorate exploration.

        partial_net_load (bool): Whether to partially load the network when
            resuming. https://pytorch.org/tutorials/beginner/saving_loading_models.html#id4

        num_inputs_to_add (int): Number of new inputs to add, if resuming and
            partially loading a new network.

        episode_cull_ratio (float): Ratio of bad episodes to cull from each
            epoch.

        try_rollouts (int): Number of times to sample actions.

        steps_per_try_rollout (int): Number of steps per attempted rollout.

        take_worst_rollout (bool): Use the worst rollout in training.

        shift_advs_pct (float): An action must be better than this percentage
            of actions to be considered advantageous.
    """
    config = deepcopy(locals())

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Random seed
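    # (offset by proc_id so each MPI process gets a distinct seed)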
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    import_custom_envs()

    # Instantiate environment
    env = env_fn()
    if hasattr(env.unwrapped, 'configure_env'):
        env.unwrapped.configure_env(env_config)
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    num_agents = getattr(env, 'num_agents', 1)

    if hasattr(env.unwrapped, 'logger'):
        print('Logger set by environment')
        logger_kwargs['logger'] = env.unwrapped.logger

    logger = EpochLogger(**logger_kwargs)
    logger.add_key_stat('won')
    logger.add_key_stat('trip_pct')
    logger.add_key_stat('HorizonReturn')
    logger.save_config(config)

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space,
                      num_inputs_to_add=num_inputs_to_add, **ac_kwargs)

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Resume
    if resume is not None:
        ac, pi_optimizer, vf_optimizer = get_model_to_resume(
            resume, ac, pi_lr, vf_lr, reinitialize_optimizer_on_resume,
            actor_critic, partial_net_load, num_inputs_to_add)
        if num_inputs_to_add:
            add_inputs(ac, ac_kwargs, num_inputs_to_add)

    if boost_explore:
        boost_exploration(ac, boost_explore)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = ppo_buffer_factory(obs_dim, act_dim, local_steps_per_epoch, gamma,
                             lam, num_agents, shift_advs_pct,
                             cull_ratio=episode_cull_ratio)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1-clip_ratio, 1+clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()
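        # Clipped surrogate objective (PPO-Clip):
        #   L = E[ min(ratio * adv, clip(ratio, 1 - eps, 1 + eps) * adv) ]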

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1+clip_ratio) | ratio.lt(1-clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Sync params across processes
    sync_params(ac)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.'%i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)    # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)    # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    o, r, d = reset(env)

    effective_horizon = round(1 / (1 - gamma))
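    # e.g. gamma = 0.99 gives an effective horizon of about 1 / (1 - 0.99) = 100 steps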
    effective_horizon_rewards = []
    for _ in range(num_agents):
        effective_horizon_rewards.append(deque(maxlen=effective_horizon))

    if hasattr(env, 'agent_index'):
        agent_index = env.agent_index
        agent = env.agents[agent_index]
        is_multi_agent = True
    else:
        agent_index = 0
        agent = None
        is_multi_agent = False

    def get_action_fn(_obz):
        return ac.step(torch.as_tensor(_obz, dtype=torch.float32))

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        epoch_episode = 0
        info = {}
        epoch_ended = False
        step_num = 0
        ep_len = 0
        ep_ret = 0
        while not epoch_ended:
            if try_rollouts != 0:
                # a, v, logp, next_o, r, d, info
                # a, v, logp, obs, r, done, info
                rollout = do_rollouts(
                    get_action_fn, env, o, steps_per_try_rollout, try_rollouts,
                    take_worst_rollout)
            else:
                a, v, logp = get_action_fn(o)
                # NOTE: For multi-agent, steps current agent,
                # but returns values for next agent (from its previous action)!
                # TODO: Just return multiple agents observations
                next_o, r, d, info = env.step(a)

            if render:
                env.render()

            curr_reward = env.curr_reward if is_multi_agent else r

            # save and log
            buf.store(o, a, curr_reward, v, logp, agent_index)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            if 'stats' in info and info['stats']:  # TODO: Optimize this
                logger.store(**info['stats'])

            if is_multi_agent:
                agent_index = env.agent_index
                agent = env.agents[agent_index]

                # TODO: Store vector of these for each agent when changing step API
                ep_len = agent.episode_steps
                ep_ret = agent.episode_reward
            else:
                ep_len += 1
                ep_ret += r

            calc_effective_horizon_reward(
                agent_index, effective_horizon_rewards, logger, r)

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = buf.epoch_ended(step_num)
            if terminal or epoch_ended:
                if epoch_ended and not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(agent_index, v)
                if terminal:
                    buf.record_episode(ep_len=ep_len, ep_ret=ep_ret, step_num=step_num)
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    if 'stats' in info and info['stats'] and info['stats']['done_only']:
                        logger.store(**info['stats']['done_only'])
                o, r, d = reset(env)
                if not is_multi_agent:
                    ep_len = 0
                    ep_ret = 0
            step_num += 1

        buf.prepare_for_update()

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('DateTime', get_date_str())
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.log_tabular('HorizonReturn', with_min_and_max=True)
        if getattr(env.unwrapped, 'is_deepdrive', False):
            logger.log_tabular('trip_pct', with_min_and_max=True)
            logger.log_tabular('collided')
            logger.log_tabular('harmful_gs')
            logger.log_tabular('timeup')
            logger.log_tabular('exited_lane')
            logger.log_tabular('circles')
            logger.log_tabular('skipped')
            logger.log_tabular('backwards')
            logger.log_tabular('won')

        if 'stats' in info and info['stats']:
            for stat, value in info['stats'].items():
                logger.log_tabular(stat, with_min_and_max=True)

        if logger.best_category or (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state(dict(env=env), pytorch_save=dict(
                ac=ac.state_dict(),
                pi_optimizer=pi_optimizer.state_dict(),
                vf_optimizer=vf_optimizer.state_dict(),
                epoch=epoch,
                stats=logger.epoch_dict,
            ), itr=None, best_category=logger.best_category)

        logger.dump_tabular()
Code example #19
File: sac.py  Project: eladsar/spinningup
def sac(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=256,
        start_steps=10000,
        update_after=1000,
        update_every=50,
        num_test_episodes=10,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1,
        device='cuda',
        override=True):
    """
    Soft Actor-Critic (SAC)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act``
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of
            observations as inputs, and ``q1`` and ``q2`` should accept a batch
            of observations and a batch of actions as inputs. When called,
            ``act``, ``q1``, and ``q2`` should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

            Calling ``pi`` should return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                           | actions in ``a``. Importantly: gradients
                                           | should be able to flow back into ``a``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long
            you wait between updates, the ratio of env steps to gradient steps
            is locked to 1.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    device = torch.device(device)
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space,
                      **ac_kwargs).to(device)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size,
                                 device=device)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
               var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)
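            # Soft (entropy-regularized) Bellman target: subtracting alpha * logp_a2
            # adds the current policy's entropy bonus to the target value.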

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                      Q2Vals=q2.detach().cpu().numpy())

        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()
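        # Minimizing alpha * logp_pi - q_pi is equivalent to maximizing the
        # entropy-regularized objective E[ Q(s, a) + alpha * H(pi(.|s)) ].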

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().cpu().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
    q_optimizer = Adam(q_params, lr=lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize them at the next update step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use the in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device),
                      deterministic)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in tqdm(range(total_steps)):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                # logger.save_state({'env': env, 'rb': replay_buffer.get_state()}, None)
                logger.save_state({'env': env}, None if override else epoch)
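                # (itr=None presumably overwrites a single checkpoint; passing epoch keeps one per epoch)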

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
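
A minimal invocation sketch for this sac variant, assuming gym is installed; the environment name, device choice, and logger output settings are illustrative placeholders, not prescribed by the example.

if __name__ == '__main__':
    import gym

    # Hypothetical invocation; adjust the environment and device as needed.
    sac(lambda: gym.make('HalfCheetah-v2'),
        epochs=10,
        device='cuda' if torch.cuda.is_available() else 'cpu',
        logger_kwargs=dict(output_dir='/tmp/sac_demo', exp_name='sac_demo'))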
Code example #20
File: memb.py  Project: RamiSketcher/pddm
def memb(env_fn,
         actor_critic=core.MLPActorCritic,
         ac_kwargs=dict(),
         model=core.MLPModel,
         seed=0,
         steps_per_epoch=1000,
         epochs=200,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         model_lr=3e-4,
         value_lr=1e-3,
         pi_lr=3e-4,
         alpha=0.4,
         batch_size=100,
         start_steps=1000,
         max_ep_len=1000,
         save_freq=1,
         train_model_epoch=5,
         test_freq=5,
         save_epoch=100,
         exp_name='',
         env_name='',
         logger_kwargs=dict()):

    ## Added by Rami >> ##
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    ## << Added by Rami ##

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape  #@!
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    # [pi, q1, q2, v or v')] = MLPActorCritic(obs_space, act_space)
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    # We need a separate target network because it is optimized differently
    # [_, _, _, v_targ] = MLPActorCritic(obs2_space, act_space)
    ac_targ = deepcopy(ac)
    # Create model module
    # [transition, reward] = MLPModel(obs_space, act_space)
    md = model(env.observation_space, env.action_space)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for all Value-networks (save this for convenience)
    val_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters(),
                                 ac.v.parameters())

    # List of parameters for all Model-networks (save this for convenience)
    md_params = itertools.chain(md.transition.parameters(),
                                md.reward.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    ## Added by Rami >> ##
    # Count variables
    # var_counts = tuple(core.count_vars(scope) for scope in ['main/dm', 'main/rm', 'main/pi', 'main/v', 'main/q1', 'main/q2', 'main'])
    var_counts = tuple(
        core.count_vars(module) for module in
        [md.transition, md.reward, ac.pi, ac.q1, ac.q2, ac.v, md, ac])
    # print('\nNumber of parameters: \t dm: %d, \t rm: %d, \t pi: %d, \t v: %d, \t q1: %d, \t q2: %d, \t total: %d\n'%var_counts)
    logger.log(
        '\nNumber of parameters: \t dm: %d, \t rm: %d, \t pi: %d, \t q1: %d, \t q2: %d, \t v: %d, \t total: %d+%d\n'
        % var_counts)

    ## << Added by Rami ##

    # TD3 style Q function updates #

    ## Optimized costs/losses ##

    # o, a, r, o2, d = data['obs'],
    #                  data['act'],
    #                  data['rew'],
    #                  data['obs2'],
    #                  data['done']

    # Set up function for computing Rew/Dyn model-losses
    ### Model/Reward losses (supervised learning):
    #   loss = 0.5 * (actual - prediction)^2
    #       Jp(omega) = 0.5 * Expt_D[(f(s,a) - s')^2]  --> eq#4.a
    #       Jr(phi)   =       Expt_D[(r(s,a) - r)^2]   --> eq#4.b
    #   Optimize: min_{omega,phi}{ Jp(omega), Jr(phi) }
    def compute_loss_model(data):  # Rami (Done)

        o, a, r, o2 = data['obs'], data['act'], data['rew'], data['obs2']

        transition = md.transition(o, a)
        r_rm = md.reward(o, a)

        transition_backup = o2
        r_backup = r

        loss_transition = ((transition_backup - transition)**2).mean()
        loss_r = ((r_backup - r_rm)**2).mean()
        loss_model = loss_transition + loss_r

        # Useful info for logging
        model_info = dict(Dyn=transition.detach().numpy(),
                          Rew=r_rm.detach().numpy())

        return loss_model, model_info

    # Set up function for computing pi loss
    ### Policy loss ###
    #   State value-function of s_t:
    #       V(s_t) = Expt_pi[Q(s_t,a_t) - log pi(a_t|s_t)]  --> eq#3.b
    #   Policy learning's soft Bellman eq. (reparameterization):
    #       V(s) = Expt_pi[ Expt_rm[r_hat(s,pi)]
    #                       - alpha*log pi(a|s)
    #                       + gamma*Expt_f[V'(f(s,pi))] ]  --> eq#8
    #   Optimize: max_pi{ Expt_{s~D}[V(s)] }
    def compute_loss_pi(data):  # Rami (Done)

        o, d = data['obs'], data['done']  # 'done' is needed for the bootstrapped V' term below

        pi, logp_pi = ac.pi(o)

        transition_pi = md.transition(o, pi)
        r_rm_pi = md.reward(o, pi)
        v_prime = ac.v(transition_pi)

        # Entropy-regularized policy loss
        loss_pi = -(r_rm_pi - alpha * logp_pi + gamma *
                    (1 - d) * v_prime).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().numpy())

        return loss_pi, pi_info

    # Set up function for computing Q,V value-losses
    ### Value functions losses ###
    #   Optz--> min_phi,psi{ Jq(phi),Jv(psi) }
    def compute_loss_val(data):  # Rami (Done)

        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        # pi, logp_pi = ac.pi(o)

        # Optimized functions
        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)
        v = ac.v(o)

        # q1_pi = ac.q1(o,pi)
        # q2_pi = ac.q2(o,pi)
        # min_q_pi = torch.min(q1_pi, q2_pi)

        # Bellman backup for Value functions
        with torch.no_grad():
            # Target value function
            pi, logp_pi = ac.pi(o)
            v_targ = ac_targ.v(o2)

            q1_pi = ac.q1(o, pi)
            q2_pi = ac.q2(o, pi)
            min_q_pi = torch.min(q1_pi, q2_pi)
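            # Q backup bootstraps from the target V at the next state;
            # V backup is the clipped double-Q value minus the entropy term alpha * logp_pi.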

            q_backup = r + gamma * (1 - d) * v_targ  # By Rami
            v_backup = min_q_pi - alpha * logp_pi  # By Rami

        # MSE loss against Bellman backup
        loss_q1 = ((q_backup - q1)**2).mean()
        loss_q2 = ((q_backup - q2)**2).mean()
        loss_v = ((v_backup - v)**2).mean()
        loss_val = loss_q1 + loss_q2 + loss_v

        # Useful info for logging
        val_info = dict(Q1Vals=q1.detach().numpy(),
                        Q2Vals=q2.detach().numpy(),
                        V_Vals=v.detach().numpy())

        return loss_val, val_info

    # Set up optimizers for model, policy and value-functions
    model_optimizer = Adam(md_params, lr=model_lr)  # Rami
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    val_optimizer = Adam(val_params, lr=value_lr)  # Rami

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def updateAC(data):  # Rami (Done)
        # print("AC updating..")
        # First run one gradient descent step for Q1, Q2, and V
        val_optimizer.zero_grad()
        loss_val, val_info = compute_loss_val(data)
        loss_val.backward()  # Descent
        val_optimizer.step()

        # Record things
        logger.store(LossVal=loss_val.item(), **val_info)

        # Freeze Value-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in val_params:
            p.requires_grad = False
        # Freeze Model-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in md_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()  # Descent on -V (i.e., ascent on the policy objective)
        pi_optimizer.step()

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Unfreeze Value-networks so you can optimize them at the next update step.
        for p in val_params:
            p.requires_grad = True
        # Unfreeze Model-networks so you can optimize them at the next model update step.
        for p in md_params:
            p.requires_grad = True

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

        # print("..AC updated")

    def updateModel(data):  # Rami (Done)
        # print("Model updating..")
        # Run one gradient descent step for model
        model_optimizer.zero_grad()
        loss_model, model_info = compute_loss_model(data)
        loss_model.backward()  # Descent
        model_optimizer.step()

        # Record things
        logger.store(LossModel=loss_model.item(), **model_info)
        # logger.store(LossRew=loss_model.item(), **model_info)
        # print("..Model updated")

    def get_action(o, deterministic=False):  # Rami (Done)
        return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic)

    def test_agent(epoch, n=1):  # (Done)
        total_reward = 0
        for j in range(n):  # run n test episodes
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            total_reward += ep_ret
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)  ## By Rami

        # print('The '+str(epoch)+' epoch is finished!')
        # print('The test reward is '+str(total_reward/n))
        return total_reward / n

    start_time = time.time()  ## Rami
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs
    reward_recorder = []

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        The algorithm would take total_steps totally in the training
        """

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()  # Random for 1k (epoch 1)

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d  # Don't treat hitting max_ep_len as a true terminal

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy-to-overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling: if (env is done) or (max_ep_len reached)
        if d or (ep_len == max_ep_len):

            ## Added by Rami >> ##
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            ## << Added by Rami ##

            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Learning/Training
        #   After train_model_epoch epochs, train pi, Q, and V (5 steps per env step);
        #   train the dyn/rew models from the start:
        if t // steps_per_epoch > train_model_epoch:  # if epoch > 5
            # Train 5 steps of Q, V, and pi,
            # then train 1 step of model.
            for j in range(5):
                batch = replay_buffer.sample_batch(batch_size)
                updateAC(data=batch)
            updateModel(data=batch)  # Rami
        else:
            # pretrain the model
            batch = replay_buffer.sample_batch(batch_size)
            updateModel(data=batch)  # Rami

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            ## Added by Rami >> ##
            # Save model after each epoch:
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)
            ## << Added by Rami ##

            # Test the agent every test_freq epochs once past the model warm-up phase:
            if epoch > train_model_epoch and epoch % test_freq == 0:
                # if epoch > train_model_epoch and epoch % 1 == 0:
                # test the agent when we reach the test_freq:
                reward_test = test_agent(epoch)
                # save the experiment result:
                # reward_recorder.append(reward_test)
                # reward_nparray = np.asarray(reward_recorder)
                # np.save(str(exp_name)+'_'+str(env_name)+'_'+str(save_freq)+'.npy',reward_nparray)

                ## Added by Rami >> ##
                logger.log_tabular('Epoch', epoch)

                logger.log_tabular('EpRet', with_min_and_max=True)
                logger.log_tabular('EpLen', average_only=True)

                logger.log_tabular('TestEpRet',
                                   with_min_and_max=True)  # if n=1 no variance
                logger.log_tabular('TestEpLen', average_only=True)

                logger.log_tabular('TotalEnvInteracts', t)

                logger.log_tabular('LogPi', with_min_and_max=True)
                logger.log_tabular('LossPi', average_only=True)

                logger.log_tabular('Q1Vals', with_min_and_max=True)
                logger.log_tabular('Q2Vals', with_min_and_max=True)
                # logger.log_tabular('LossQ1', average_only=True)
                # logger.log_tabular('LossQ2', average_only=True)
                logger.log_tabular('V_Vals', with_min_and_max=True)
                # logger.log_tabular('LossV', average_only=True)
                logger.log_tabular('LossVal', average_only=True)

                ## Added by Rami >> ##
                # logger.log_tabular('Dyn', with_min_and_max=True)
                # logger.log_tabular('Rew', with_min_and_max=True)
                logger.log_tabular('LossModel', average_only=True)
                ## << Added by Rami ##

                logger.log_tabular('Time', time.time() - start_time)
                logger.dump_tabular()
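
For reference, the supervised dynamics/reward losses used above (eq#4.a, eq#4.b) are plain MSE regression. Below is a minimal, self-contained sketch of that idea; DynModel, RewModel, and model_loss are illustrative names only and not part of the RamiSketcher/pddm code.

import torch
import torch.nn as nn

class DynModel(nn.Module):
    """Predicts the next observation from (obs, act)."""
    def __init__(self, obs_dim, act_dim, hidden=128):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(obs_dim + act_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, obs_dim))

    def forward(self, o, a):
        return self.net(torch.cat([o, a], dim=-1))

class RewModel(nn.Module):
    """Predicts the scalar reward from (obs, act)."""
    def __init__(self, obs_dim, act_dim, hidden=128):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(obs_dim + act_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, 1))

    def forward(self, o, a):
        return self.net(torch.cat([o, a], dim=-1)).squeeze(-1)

def model_loss(dyn, rew, batch):
    """MSE losses for the dynamics and reward models (cf. compute_loss_model above)."""
    o, a, r, o2 = batch['obs'], batch['act'], batch['rew'], batch['obs2']
    loss_dyn = ((dyn(o, a) - o2) ** 2).mean()
    loss_rew = ((rew(o, a) - r) ** 2).mean()
    return loss_dyn + loss_rew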
Code example #21
def my_td3(env_fn,
           seed=0,
           steps_per_epoch=4000,
           epochs=100,
           max_ep_len=1000,
           hidden_sizes=[256, 256],
           logger_kwargs=dict(),
           save_freq=1,
           batch_size=100,
           start_steps=10000,
           update_after=1000,
           update_every=50,
           num_test_episodes=10,
           gamma=0.99,
           polyak=0.995,
           act_noise=0.1,
           pi_lr=1e-3,
           q_lr=1e-3,
           buffer_size=int(1e6),
           target_noise=0.2,
           noise_clip=0.5,
           policy_delay=2):
    """
    My TD3 implementation
    """

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    test_env = env_fn()
    obs_dim = env.observation_space.shape[0]
    print("env.observation_space", env.observation_space)
    print("env.observation_space.shape", env.observation_space.shape)
    print("env.action_space", env.action_space)
    # Bail out on discrete action spaces before touching .shape/.low/.high,
    # which gym.spaces.Discrete does not provide.
    if isinstance(env.action_space, gym.spaces.Discrete):
        print("Discrete action space not supported for my_td3!")
        return
    act_dim = env.action_space.shape[0]
    action_min = env.action_space.low[0]
    action_max = env.action_space.high[0]

    # Set up experience buffer
    buf = ReplayBuffer(obs_dim, act_dim, buffer_size)

    # Instantiate models
    assert action_max == abs(action_min)
    policy = DeterministicPolicyNet(obs_dim, act_dim, hidden_sizes, action_max)
    policy_target = copy.deepcopy(policy)
    policy_optimizer = torch.optim.Adam(policy.mu_net.parameters(), lr=pi_lr)

    # Two Q-functions for Double Q Learning
    q_function_1 = QNet(obs_dim, act_dim, hidden_sizes)
    q_function_target_1 = copy.deepcopy(q_function_1)
    q_optimizer_1 = torch.optim.Adam(q_function_1.q_net.parameters(), lr=q_lr)
    q_function_2 = QNet(obs_dim, act_dim, hidden_sizes)
    q_function_target_2 = copy.deepcopy(q_function_2)
    q_optimizer_2 = torch.optim.Adam(q_function_2.q_net.parameters(), lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(policy)
    # TODO: Save value network as well

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p_targ in policy_target.parameters():
        p_targ.requires_grad = False
    for q_targ in q_function_target_1.parameters():
        q_targ.requires_grad = False
    for q_targ in q_function_target_2.parameters():
        q_targ.requires_grad = False

    # Prepare for interaction with environment
    num_steps = epochs * steps_per_epoch
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for step in range(
            num_steps
    ):  # TODO: Change to for loop over range(epochs) and range(steps_per_epoch)

        with torch.no_grad():
            if step < start_steps:
                # Until start_steps have elapsed, randomly sample actions
                # from a uniform distribution for better exploration. Afterwards,
                # use the learned policy (with some noise, via act_noise).
                a = env.action_space.sample()
            else:
                assert o.shape == (obs_dim, )
                a = policy(torch.tensor(o, dtype=torch.float32).unsqueeze(0))
                assert a.shape == (1, act_dim)
                a = a[0]  # Remove batch dimension
                a = torch.clamp(a + act_noise * torch.randn(act_dim),
                                action_min,
                                action_max)  # Add exploration noise
                a = a.numpy()  # Convert to numpy

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            d = False if ep_len == max_ep_len else d

            buf.store(o, a, r, next_o, d)

            # Update obs (critical!)
            o = next_o

            # Trajectory finished
            if d or (ep_len == max_ep_len):
                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        if step >= update_after and step % update_every == 0:
            for j in range(update_every):

                def update():
                    o, a, r, next_o, d = buf.sample_batch(batch_size)

                    # Compute targets
                    with torch.no_grad():
                        next_a_targ = policy_target(next_o)
                        # TD3 modification 1: Target policy smoothing
                        eps = torch.clamp(
                            torch.randn_like(next_a_targ) * target_noise,
                            -noise_clip, noise_clip)
                        next_a_targ = torch.clamp(next_a_targ + eps,
                                                  action_min, action_max)

                        # Clipped Double Q-Learning
                        next_q_targ_1 = q_function_target_1(
                            next_o, next_a_targ)
                        next_q_targ_2 = q_function_target_2(
                            next_o, next_a_targ)
                        next_q_targ = torch.min(next_q_targ_1, next_q_targ_2)
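                        # Both Q-functions regress toward the same clipped double-Q target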
                        q_targ_1 = r + gamma * (1 - d) * next_q_targ
                        q_targ_2 = r + gamma * (1 - d) * next_q_targ

                    # Update Q functions
                    q_optimizer_1.zero_grad()
                    q_loss_1 = ((q_function_1(o, a) - q_targ_1)**2).mean()
                    q_loss_1.backward()
                    q_optimizer_1.step()

                    q_optimizer_2.zero_grad()
                    q_loss_2 = ((q_function_2(o, a) - q_targ_2)**2).mean()
                    q_loss_2.backward()
                    q_optimizer_2.step()

                    # Delayed policy updates
                    if j % policy_delay == 0:

                        # Freeze Q-network so you don't waste computational effort
                        # computing gradients for it during the policy learning step.
                        for p in q_function_1.parameters():
                            p.requires_grad = False
                        for p in q_function_2.parameters():
                            p.requires_grad = False

                        # Policy function update
                        policy_optimizer.zero_grad()
                        policy_loss = -(q_function_1(o, policy(o))).mean()
                        policy_loss.backward()
                        policy_optimizer.step()

                        # Unfreeze Q-network so you can optimize it at next DDPG step.
                        for p in q_function_1.parameters():
                            p.requires_grad = True
                        for p in q_function_2.parameters():
                            p.requires_grad = True

                        # Update target networks with polyak
                        with torch.no_grad():
                            for p, p_targ in zip(policy.parameters(),
                                                 policy_target.parameters()):
                                p_targ.data.mul_(polyak)
                                p_targ.data.add_((1 - polyak) * p.data)
                            for q, q_targ in zip(
                                    q_function_1.parameters(),
                                    q_function_target_1.parameters()):
                                q_targ.data.mul_(polyak)
                                q_targ.data.add_((1 - polyak) * q.data)
                            for q, q_targ in zip(
                                    q_function_2.parameters(),
                                    q_function_target_2.parameters()):
                                q_targ.data.mul_(polyak)
                                q_targ.data.add_((1 - polyak) * q.data)

                update()

        if (step + 1) % steps_per_epoch == 0:
            epoch = (step + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            def test_agent():
                with torch.no_grad():
                    for j in range(num_test_episodes):
                        o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
                        while not (d or (ep_len == max_ep_len)):
                            # Take deterministic actions at test time
                            a = policy(
                                torch.tensor(o,
                                             dtype=torch.float32).unsqueeze(0))
                            assert a.shape == (1, act_dim)
                            a = a[0]  # Remove batch dimension
                            a = a.numpy()  # Convert to numpy
                            o, r, d, _ = test_env.step(a)
                            ep_ret += r
                            ep_len += 1
                        logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', step)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
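
The two TD3-specific ingredients in the update above, target policy smoothing and clipped double-Q targets, can be read in isolation. A hedged sketch, assuming the same calling conventions as above (policy: obs -> action, Q: (obs, act) -> value); td3_targets is an illustrative helper, not part of the snippet's codebase.

import torch

def td3_targets(r, d, next_o, policy_target, q_targ_1, q_targ_2,
                gamma=0.99, target_noise=0.2, noise_clip=0.5,
                action_min=-1.0, action_max=1.0):
    """Compute the shared TD3 Bellman target for both Q-functions."""
    with torch.no_grad():
        next_a = policy_target(next_o)
        # Target policy smoothing: add clipped Gaussian noise to the target action
        eps = torch.clamp(torch.randn_like(next_a) * target_noise,
                          -noise_clip, noise_clip)
        next_a = torch.clamp(next_a + eps, action_min, action_max)
        # Clipped double-Q learning: take the pessimistic estimate
        next_q = torch.min(q_targ_1(next_o, next_a), q_targ_2(next_o, next_a))
        return r + gamma * (1 - d) * next_q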
Code example #22
def ppo(env,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=2048,
        epochs=250,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=100,
        train_v_iters=70,
        lam=0.95,
        max_ep_len=512,
        target_kl=0.005,
        logger_kwargs=dict(),
        save_freq=5):
    """
    Proximal Policy Optimization (by clipping), 

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================


        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env(
        "PandaPegIn",
        has_offscreen_renderer=True,
        # has_renderer=True,
        use_camera_obs=False,
        control_freq=100,
    )

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Load a pretrained model
    # fname = "data/ppo_peg_in_add_delta_pos_plus_plus/ppo_peg_in_add_delta_pos_plus_plus_s0/pyt_save/model24.pt"
    # pre_model = torch.load(fname)
    # ac.pi = pre_model.pi
    # ac.v =pre_model.v

    # Use TensorboardX
    writer = logger.create_writer()
    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[
            'logp']

        # Policy loss
        pi, logp = ac.pi(
            obs, act
        )  # Only the pi network changes; data['obs'], data['act'], data['adv'], data['logp'] stay fixed within an update
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info  # Minimize loss_pi; pi_info holds the KL divergence, entropy, and clip fraction (all for this update)

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()  # data is refreshed once per update

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):  # Minimize the loss as much as possible while the KL divergence stays within bounds
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()
            # print(i,':',loss_v)
            # print('='*20)

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Move to the initial position
    pre_action = [0, 0, 0]
    for i in range(4):
        o, _, _, _ = env.step(pre_action)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        print("epoch:", epoch)
        for t in range(local_steps_per_epoch):
            # if( t == steps_per_epoch/2 ):
            # print("Half finished!")
            # Use the policy and value networks to compute the action, the value estimate, and the log-probability of that action
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r  # return of the current episode
            ep_len += 1  # length of the current episode

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:  # episode done, episode timed out, or epoch ended
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)  # Compute GAE advantages and rewards-to-go

                # print("steps:",t)
                # print("done",d)

                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, epoch)

        # Perform PPO update!
        update()

        # Write data to TensorboardX
        stats_to_write = logger.get_stats('EpRet')
        writer.add_scalar('AverageEpRet',
                          stats_to_write[0],
                          global_step=(epoch + 1) * 2048)

        # Log info about the epoch (one epoch's worth of data)
        logger.log_tabular('Epoch', epoch)  # epoch index
        logger.log_tabular('EpRet', with_min_and_max=True)  # max/min/mean episode return
        logger.log_tabular('EpLen', average_only=True)  # mean episode length
        logger.log_tabular('VVals', with_min_and_max=True)  # max/min/mean value estimates
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)  # total env steps so far
        logger.log_tabular('LossPi', average_only=True)  # policy loss at the start of the update
        logger.log_tabular('LossV', average_only=True)  # value loss at the start of the update
        logger.log_tabular('DeltaLossPi', average_only=True)  # policy loss after update minus before
        logger.log_tabular('DeltaLossV', average_only=True)  # value loss after update minus before
        logger.log_tabular('Entropy', average_only=True)  # policy entropy
        logger.log_tabular('KL', average_only=True)  # approximate KL divergence
        logger.log_tabular('ClipFrac', average_only=True)  # fraction of clipped ratios
        logger.log_tabular('StopIter', average_only=True)  # number of policy gradient iterations taken
        logger.log_tabular('Time', time.time() - start_time)  # elapsed time
        logger.dump_tabular()


# if __name__ == '__main__':
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--env', type=str, default='HalfCheetah-v2')
#     parser.add_argument('--hid', type=int, default=64)
#     parser.add_argument('--l', type=int, default=2)
#     parser.add_argument('--gamma', type=float, default=0.99)
#     parser.add_argument('--seed', '-s', type=int, default=0)
#     parser.add_argument('--cpu', type=int, default=1)
#     parser.add_argument('--steps', type=int, default=4000)
#     parser.add_argument('--epochs', type=int, default=50)
#     parser.add_argument('--exp_name', type=str, default='ppo')
#     args = parser.parse_args()

#     mpi_fork(args.cpu)  # run parallel code with mpi

#     from spinup.utils.run_utils import setup_logger_kwargs
#     logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)

#     ppo(lambda : gym.make(args.env), actor_critic=core.MLPActorCritic,
#         ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), gamma=args.gamma,
#         seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs,
#         logger_kwargs=logger_kwargs)
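
The heart of the PPO update in this example is the clipped surrogate objective in compute_loss_pi. A minimal sketch of just that objective, assuming logp, logp_old, and adv are precomputed 1-D tensors (ppo_clip_loss is an illustrative name, not part of this project):

import torch

def ppo_clip_loss(logp, logp_old, adv, clip_ratio=0.2):
    """Negative clipped surrogate objective (to be minimized), plus diagnostics."""
    ratio = torch.exp(logp - logp_old)
    clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
    loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()
    approx_kl = (logp_old - logp).mean().item()  # early-stopping signal
    clipfrac = ((ratio > 1 + clip_ratio) | (ratio < 1 - clip_ratio)).float().mean().item()
    return loss_pi, dict(kl=approx_kl, cf=clipfrac)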
Code example #23
File: eglu.py  Project: eladsar/spinningup
def eglu(env_fn,
         actor_critic=core.MLPActorCritic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=100,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         lr=1e-3,
         alpha=0.2,
         batch_size=256,
         start_steps=10000,
         update_after=1000,
         update_every=50,
         num_test_episodes=10,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1,
         eps=0.2,
         n_explore=32,
         device='cuda'):

    device = torch.device(device)
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space,
                      **ac_kwargs).to(device)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size,
                                 device=device)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
               var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                      Q2Vals=q2.detach().cpu().numpy())

        return loss_q, q_info

    # Set up function for computing EGL mean-gradient-losses
    def compute_loss_g(data):

        o, a1, r, o_tag, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        a2 = ball_explore(a1, n_explore, eps)

        a2 = a2.view(n_explore * len(r), act_dim)
        o_expand = repeat_and_reshape(o, n_explore)

        # Bellman backup for Q functions
        with torch.no_grad():

            q1 = ac.q1(o_expand, a2)
            q2 = ac.q2(o_expand, a2)
            q_dither = torch.min(q1, q2)

            # Target actions come from *current* policy
            a_tag, logp_a_tag = ac.pi(o_tag)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o_tag, a_tag)
            q2_pi_targ = ac_targ.q2(o_tag, a_tag)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            q_anchor = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a_tag)

            q_anchor = repeat_and_reshape(q_anchor, n_explore).squeeze(-1)

        # Differentiate the clipped double-Q value w.r.t. the action a1 to obtain the
        # action-gradient that is regressed against the Q finite-differences below.
        a1_in = autograd.Variable(a1.data, requires_grad=True)
        q1 = ac.q1(o, a1_in)
        q2 = ac.q2(o, a1_in)
        qa = torch.min(q1, q2).unsqueeze(-1)
        geps = autograd.grad(outputs=qa,
                             inputs=a1_in,
                             grad_outputs=torch.cuda.FloatTensor(
                                 qa.size()).fill_(1.),
                             create_graph=False,
                             retain_graph=True,
                             only_inputs=True)[0]

        geps = repeat_and_reshape(geps, n_explore)
        a1 = repeat_and_reshape(a1, n_explore)

        geps = (geps * (a2 - a1)).sum(-1)
        # l1 loss against Bellman backup

        loss_g = F.smooth_l1_loss(geps, q_dither - q_anchor)

        # Useful info for logging
        g_info = dict(GVals=geps.flatten().detach().cpu().numpy())

        return loss_g, g_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().cpu().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
    q_optimizer = Adam(q_params, lr=lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):

        # Take a single gradient step on the combined Q-loss and mean-gradient loss
        q_optimizer.zero_grad()

        # Compute the EGL mean-gradient loss
        loss_g, g_info = compute_loss_g(data)
        # Record things
        logger.store(LossG=loss_g.item(), **g_info)

        q_optimizer.zero_grad()

        loss_q, q_info = compute_loss_q(data)
        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        loss_q = loss_q + loss_g

        loss_q.backward()
        q_optimizer.step()

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in ac.geps.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize it at next DDPG step.
        for p in ac.geps.parameters():
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device),
                      deterministic)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in tqdm(range(total_steps)):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
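
Every off-policy example on this page finishes its update with the same polyak (soft) target-network update. As a standalone reference, here is a minimal sketch that works on any pair of identically structured torch modules (soft_update is an illustrative helper name, not part of any of these projects):

import torch

def soft_update(net, net_targ, polyak=0.995):
    """In-place polyak averaging: targ <- polyak * targ + (1 - polyak) * net."""
    with torch.no_grad():
        for p, p_targ in zip(net.parameters(), net_targ.parameters()):
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)

It would be called as, e.g., soft_update(ac, ac_targ, polyak) after each gradient step.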
Code example #24
def dqn(env_fn,  q_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=100, replay_size=int(1e4), gamma=0.99,
        polyak=0.995, q_lr=1e-3, batch_size=128, start_steps=0,
        update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10,
        max_ep_len=1000, logger_kwargs=dict(), save_freq=1, tf_logger='logs/dqn/'):

    tf_logger = tf_logger[:-1] + datetime.now().strftime('%Y%m%d%H%M%S') + '/'
    # tt()
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    # set up tensorboard parameters:
    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    # only applicable to discrete action spaces
    act_dim = env.action_space.n
    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # q_func_hidden_size = (256, 128, 64)
    q_func = core.MLPQFunction(obs_dim, act_dim, **q_kwargs).to(device)
    with SummaryWriter(log_dir=tf_logger, comment="DQN_graph") as w:
        dummy_input =torch.rand((1, obs_dim),dtype=torch.float32).to(device)
        w.add_graph(q_func, dummy_input)
    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(m) for m in [q_func])
    q_target = deepcopy(q_func)
    q_target.eval()
    logger.log('\nNumber of parameters:  q: %d\n' % var_counts)
    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in q_target.parameters():
        p.requires_grad = False
    # #test q_func
    # qv = q_func(torch.randn([128, 4]))
    # action = torch.randint(high=1, low=0, size = [128, 2])
    # Set up function for computing double Q-loss
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']
        r = Variable(r)
        o2 = Variable(o2)
        a = Variable(a)
        o = Variable(o)
        d = Variable(d)
        q = q_func(o).gather(1, a.unsqueeze(1))
        # Bellman backup for Q function
        with torch.no_grad():
            q_targ = q_target(o2).detach().max(1)[0]
            backup = r + gamma * (1 - d) * q_targ
        # MSE loss against Bellman backup
        loss_q = ((q - backup) ** 2).mean()

        # Useful info for logging
        loss_info = dict(QVals=q.cpu().detach().numpy())
        return loss_q, loss_info
    # TODO: change learning rate
    q_optimizer = Adam(q_func.parameters())
    # q_optimizer = RMSprop(q_func.parameters(), lr=0.00025, alpha=0.95, eps=0.01)
    # Set up model saving
    logger.setup_pytorch_saver(q_func)

    def update(data):
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
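        # Clamp each gradient element to [-1, 1] to stabilize the Q-update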
        for param in q_func.parameters():
            param.grad.data.clamp_(-1, 1)

        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **loss_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(q_func.parameters(), q_target.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)
        return loss_q.item()
    # TODO: the initial value for exploration?
    # exploration = core.LinearScheduler(steps_per_epoch * epochs, init_value=0.9, final_value=0.05)
    exploration = core.ExpScheduler(init_value=0.9, final_value=0.05, decay=200)
    def get_action(obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        writer.add_scalar(tag="epsilon", scalar_value=eps_threshold, global_step=t)
        if sample > eps_threshold:
            with torch.no_grad():
                # t.max(1) will return largest column value of each row.
                # second column on max result is index of where max element was
                # found, so we pick action with the larger expected reward.
                # return policy_net(state).max(1)[1].view(1, 1)
                obs = torch.from_numpy(obs).unsqueeze(0).type(torch.float32)
                # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
                action = q_func(Variable(obs)).data.max(1)[1].item()
                return action
        else:
            return env.action_space.sample()

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                obs = torch.from_numpy(o).unsqueeze(0).type(torch.float32)
                action = q_func(Variable(obs)).data.max(1)[1].cpu().numpy().squeeze()
                o, r, d, _ = test_env.step(action)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    writer = SummaryWriter(logdir=tf_logger)
    # Main loop: collect experience in env and update/log each epoch

    epoch_counter = 0
    for t in range(total_steps):
        if t > start_steps:
            a = get_action(o, (t + 1) // steps_per_epoch)
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1
        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d
        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)
        o = o2

        if d or ep_len == max_ep_len:
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            epoch_counter += 1
            writer.add_scalar('train_reward', ep_ret, global_step=epoch_counter)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        loss_q = 0
        if t >= update_after and t % update_every == 0:
            for _ in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                loss_q += update(data=batch)

            loss_q /= update_every
            # core.tensorboard_logger(logdir=tf_logger, scalar=loss_q, step=t, tag='q_loss')
            writer.add_scalar('q_loss', loss_q, global_step=t)
        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch
            # core.tensorboard_logger(logdir=tf_logger, scalar=ep_ret, step=epoch, tag='train_reward')
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)
            test_agent()
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
    writer.close()
    return
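
The DQN example above depends on core.ExpScheduler, whose implementation is not shown here. A plausible sketch of an exponentially decaying epsilon schedule with epsilon-greedy selection follows; the class name, signature, and defaults are assumptions, not the project's actual API.

import math
import random

class ExpScheduler:
    """Epsilon decays exponentially from init_value toward final_value."""
    def __init__(self, init_value=0.9, final_value=0.05, decay=200):
        self.init_value, self.final_value, self.decay = init_value, final_value, decay

    def value(self, t):
        return self.final_value + (self.init_value - self.final_value) * math.exp(-t / self.decay)

def epsilon_greedy(q_values, action_space, eps):
    """Pick the argmax-Q action with probability 1 - eps, otherwise a random action."""
    if random.random() > eps:
        return int(q_values.argmax())
    return action_space.sample()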
Code example #25
File: ddpg.py  Project: shariqahn/UROPFall2020
class SingleTaskDDPG(Approach):
    def __init__(self,
                 action_space,
                 observation_space,
                 rng,
                 eps=0.9,
                 discount_factor=0.99,
                 alpha=1e-3):
        self.rng = rng
        logger_kwargs = setup_logger_kwargs('SingleTaskDDPG', self.rng)
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        self.actor_critic = MLPActorCritic
        # ac_kwargs=dict()  # TODO: decide whether to expose actor-critic kwargs
        # seed=0
        self.replay_size = int(1e6)
        self.polyak = 0.995
        self.gamma = discount_factor
        self.pi_lr = alpha
        self.q_lr = alpha
        self.batch_size = 100
        self.start_steps = 10000
        self.update_after = 1000
        self.update_every = 50
        self.act_noise = 0.1

        self.step_count = 0
        self.action_space = action_space
        self.observation_space = observation_space
        # self.observation_space = spaces.Box(-np.inf, np.inf, shape=(17,), dtype=np.float32) #fix

        # torch.manual_seed(seed)
        # np.random.seed(seed)

        # self.obs_dim = self.observation_space.shape
        self.act_dim = self.action_space.shape[0]
        # act_dim = self.action_space.n

        # Action limit for clamping: critically, assumes all dimensions share the same bound!
        self.act_limit = self.action_space.high[0]

        self.net = False

    def init_net(self, state):
        self.obs_dim = state.shape
        # Create actor-critic module and target networks
        self.ac = self.actor_critic(self.obs_dim[0],
                                    self.action_space)  #took out ac_kwargs
        self.ac_targ = deepcopy(self.ac)

        # Freeze target networks with respect to optimizers (only update via polyak averaging)
        for p in self.ac_targ.parameters():
            p.requires_grad = False

        # Experience buffer
        self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim,
                                          act_dim=self.act_dim,
                                          size=self.replay_size)

        # Set up optimizers for policy and q-function
        self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.pi_lr)
        self.q_optimizer = Adam(self.ac.q.parameters(), lr=self.q_lr)
        self.logger.setup_pytorch_saver(self.ac)

        self.net = True

    def observe(self, state, action, next_state, reward, done):
        state = self.process_state(state)
        next_state = self.process_state(next_state)

        self.replay_buffer.store(state, action, reward, next_state, done)
        if self.step_count >= self.update_after and self.step_count % self.update_every == 0:
            for _ in range(self.update_every):
                batch = self.replay_buffer.sample_batch(self.batch_size)
                self.update(data=batch)

    # Set up function for computing DDPG Q-loss
    def compute_loss_q(self, data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q = self.ac.q(o, a)

        # Bellman backup for Q function
        with torch.no_grad():
            q_pi_targ = self.ac_targ.q(o2, self.ac_targ.pi(o2))
            backup = r + self.gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q = ((q - backup)**2).mean()

        # Useful info for logging
        loss_info = dict(QVals=q.detach().numpy())

        return loss_q, loss_info

    # Set up function for computing DDPG pi loss
    def compute_loss_pi(self, data):
        o = data['obs']
        q_pi = self.ac.q(o, self.ac.pi(o))
        return -q_pi.mean()

    def update(self, data):
        # First run one gradient descent step for Q.
        self.q_optimizer.zero_grad()
        loss_q, loss_info = self.compute_loss_q(data)
        loss_q.backward()
        self.q_optimizer.step()

        # Freeze Q-network so you don't waste computational effort
        # computing gradients for it during the policy learning step.
        for p in self.ac.q.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        self.pi_optimizer.zero_grad()
        loss_pi = self.compute_loss_pi(data)
        loss_pi.backward()
        self.pi_optimizer.step()

        # Unfreeze Q-network so you can optimize it at next DDPG step.
        for p in self.ac.q.parameters():
            p.requires_grad = True

        self.logger.store(LossQ=loss_q.item(),
                          LossPi=loss_pi.item(),
                          **loss_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(self.ac.parameters(),
                                 self.ac_targ.parameters()):
                # NB: We use in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(self.polyak)
                p_targ.data.add_((1 - self.polyak) * p.data)

    def get_action(self, state, exploit=False):
        processed_state = self.process_state(state)
        if not self.net:
            self.init_net(processed_state)

        # state is actually observation
        self.step_count += 1
        if self.step_count <= self.start_steps:
            return self.action_space.sample()

        a = self.ac.act(torch.as_tensor(processed_state, dtype=torch.float32))
        if not exploit:
            a += self.act_noise * np.random.randn(self.act_dim)
        return np.clip(a, -self.act_limit, self.act_limit)

    def reset(self, reward_function):
        self.reward_function = reward_function
        self.net = False
        # self.step_count = 0

    def process_state(self, state):
        return state

    def log(self, returns, task):
        self.logger.store(EpRet=sum(returns), EpLen=len(returns))
        self.logger.save_state({'env': task}, None)
        self.logger.log_tabular('EpRet', with_min_and_max=True)
        self.logger.log_tabular('EpLen', average_only=True)
        self.logger.log_tabular('TotalEnvInteracts', self.step_count)
        self.logger.log_tabular('QVals', with_min_and_max=True)
        self.logger.log_tabular('LossPi', average_only=True)
        self.logger.log_tabular('LossQ', average_only=True)
        self.logger.dump_tabular()

    def load(self, file, task):
        # model = torch.load(file)
        # s = ()
        # for param_tensor in model.state_dict():
        #     s+=(param_tensor, "\t", model.state_dict()[param_tensor].size())
        # return s
        # model = self.actor_critic(17, self.action_space)
        # model.load_state_dict(torch.load(file))
        self.ac = torch.load(file)
        self.ac.eval()

        self.net = True
        state = task.reset(self.rng)
        self.reward_function = task.reward_function
        images = []

        for _ in range(100):
            action = self.get_action(state, True)
            state, reward, done, _ = task.step(action)
            im = task.render(mode='rgb_array')
            images.append(im)

            if done:
                break
        imageio.mimsave('figures/DDPG/oracle.mp4', images)
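A minimal usage sketch for `SingleTaskDDPG`, inferred from the `load` method above. The `task` object is assumed to expose the same interface used there (`reset(rng)`, `step(action)`, `reward_function`, plus Gym-style `action_space` / `observation_space`); `make_task` and the episode budget are hypothetical.

# Sketch only: driving SingleTaskDDPG with a Gym-style task object.
rng = np.random.RandomState(0)
task = make_task()                                # hypothetical task factory
agent = SingleTaskDDPG(task.action_space, task.observation_space, rng)
agent.reset(task.reward_function)                 # bind the reward function; the net is built lazily

for episode in range(100):
    state = task.reset(rng)
    returns, done = [], False
    while not done:
        action = agent.get_action(state)          # random actions until start_steps is reached
        next_state, reward, done, _ = task.step(action)
        agent.observe(state, action, next_state, reward, done)  # stores transition, periodically updates
        returns.append(reward)
        state = next_state
    agent.log(returns, task)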
Code example #26
File: ppo.py  Project: hammer-wang/oml-ppo
def ppo(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        beta=0.01,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=3e-4,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.95,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10,
        use_rnn=False,
        reward_factor=1,
        spectrum_repr=False):
    """
    Proximal Policy Optimization (by clipping), 
    with early stopping based on approximate KL
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.
        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:
            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================
            The ``act`` method behaves the same as ``step`` but only returns ``a``.
            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:
            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================
            The ``v`` module's forward call should accept a batch of observations
            and return:
            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================
        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to PPO.
        seed (int): Seed for random number generators.
        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.
        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.
        gamma (float): Discount factor. (Always between 0 and 1.)
        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 
        pi_lr (float): Learning rate for policy optimizer.
        vf_lr (float): Learning rate for value function optimizer.
        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)
        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.
        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)
        max_ep_len (int): Maximum length of trajectory / episode / rollout.
        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)
        logger_kwargs (dict): Keyword args for EpochLogger.
        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    if rank == 0:
        print(ac)

    # update env config
    # env.scalar_thick = ac_kwargs['scalar_thick']
    env.update_with_ac(**ac_kwargs)

    # For Tuple spaces
    obs_dim = ac.obs_dim

    if isinstance(env.action_space, spaces.Tuple):
        act_dim = core.tuple_space_dim(env.action_space, action=True)
    else:
        act_dim = env.action_space.shape

    # Create actor-critic module

    # print(ac)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim,
                    act_dim,
                    local_steps_per_epoch,
                    gamma,
                    lam,
                    cell_size=ac_kwargs['cell_size'])

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):

        obs, act, adv, logp_old, hid = data['obs'], data['act'], data[
            'adv'], data['logp'], data['hid']

        # for i in range(len(obs)-1):
        #     if torch.eq(obs[i], torch.zeros(12)).sum()==12 and torch.eq(obs[i+1], torch.zeros(12)).sum()==12:
        #         print(obs[i], obs[i+1], act[i], act[i+1])

        # Policy loss
        pis = []
        logp = 0

        if len(ac.pi) > 1:  # tuple actions
            for i, actor_i in enumerate(ac.pi):
                pi, logp_i = actor_i(obs, act[:, i][:, None])
                logp += logp_i
                pis.append(pi)
        else:
            pi, logp_i = ac.pi[0](obs, act)
            logp += logp_i
            pis.append(pi)

        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        # sample estimation policy KL
        approx_kl = (logp_old - logp).mean().item()
        ent = sum([pi.entropy().mean().item() for pi in pis])
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return 0.5 * ((ac.v(obs) - ret)**2).mean()

    def compute_loss_pi_v_rnn(data):

        obs, act, adv, logp_old, ret = data['obs'], data['act'], data[
            'adv'], data['logp'], data['ret']

        hid = torch.zeros(ac_kwargs['cell_size'])
        v = []
        logp = []
        ent = []
        num_traj = 0
        #todo: test
        for i in range(len(obs)):
            v_i, logp_i, hid, ent_i = ac.evaluate(obs[i], act[i], hid)
            if i < len(obs) - 1 and obs[i + 1].sum() == 0:
                num_traj += 1
                # print('Reinitialize #{}'.format(num_traj), flush=True)
                hid = torch.zeros(ac_kwargs['cell_size'])
            v.append(v_i)
            logp.append(logp_i)
            ent.append(ent_i)

        logp = torch.cat(logp)
        v = torch.cat(v)

        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # print(logp_old - logp)

        approx_kl = (logp_old - logp).mean().item()
        ent = torch.stack(ent).mean()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        loss_v = 0.5 * ((v - ret)**2).mean()
        # import pdb; pdb.set_trace()

        loss_pi = loss_pi - beta * ent

        logger.store(RetBuf=ret.clone().detach().numpy())
        # import pdb; pdb.set_trace()

        return loss_pi, pi_info, loss_v

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)
    if use_rnn:
        optimizer = Adam(ac.parameters(), lr=pi_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        # import pdb; pdb.set_trace()

        if not use_rnn:
            pi_l_old, pi_info_old = compute_loss_pi(data)
            v_l_old = compute_loss_v(data).item()

            #  Train policy with multiple steps of gradient descent
            for i in range(train_pi_iters):
                pi_optimizer.zero_grad()
                loss_pi, pi_info = compute_loss_pi(data)
                kl = mpi_avg(pi_info['kl'])
                if kl > 1.5 * target_kl:
                    logger.log(
                        'Early stopping at step %d due to reaching max kl.' %
                        i)
                    break
                loss_pi.backward()
                mpi_avg_grads(ac.pi)  # average grads across MPI processes
                pi_optimizer.step()

            logger.store(StopIter=i)

            # Value function learning
            for i in range(train_v_iters):
                vf_optimizer.zero_grad()
                loss_v = compute_loss_v(data)
                loss_v.backward()
                mpi_avg_grads(ac.v)  # average grads across MPI processes
                vf_optimizer.step()

        else:
            pi_l_old, pi_info_old, v_l_old = compute_loss_pi_v_rnn(data)
            pi_l_old = pi_l_old.item()
            v_l_old = v_l_old.item()

            for i in range(train_pi_iters):
                optimizer.zero_grad()
                loss_pi, pi_info, loss_v = compute_loss_pi_v_rnn(data)
                kl = mpi_avg(pi_info['kl'])
                if kl > 1.5 * target_kl:
                    logger.log(
                        'Early stopping at step %d due to reaching max kl.' %
                        i)
                    break
                loss = loss_pi + loss_v
                loss.backward()
                mpi_avg_grads(ac)
                optimizer.step()
            logger.store(StopIter=i)

        # import pdb; pdb.set_trace()
        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    obs, ep_ret, ep_len = env.reset(), 0, 0

    # import pdb; pdb.set_trace()
    # if ac_kwargs['scalar_thick']:
    #     thick= obs[env.num_materials:env.num_materials+env.num_thicknesses].argmax() / env.num_thicknesses
    #     obs = np.concatenate((obs[:env.num_materials+1], np.array([thick])))

    #                 if ac_kwargs['scalar_thick']:
    #             thick= obs[env.num_materials:env.num_materials+env.num_thicknesses].argmax() / env.num_thicknesses
    #             obs = np.concatenate((obs[:env.num_materials+1], np.array([thick])))
    hid = np.zeros(
        ac_kwargs['cell_size']) if ac_kwargs['cell_size'] else np.zeros(1)
    # import pdb; pdb.set_trace()

    design_tracker = DesignTracker(epochs, **logger_kwargs)
    total_env_time = 0
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        epoch_start_time = time.time()
        for t in range(local_steps_per_epoch):

            #TODO: only evaluate
            act, v, logp, hid = ac.step(
                torch.as_tensor(obs, dtype=torch.float32),
                torch.as_tensor(hid, dtype=torch.float32))

            # nv_start = time.time()
            next_obs, r, d, _ = env.step(act)
            # env_end = time.time()
            # env_time = env_end - env_start
            # total_env_time += env_time

            r = r * reward_factor  # scale the rewards, e.g. to roughly match the Atari reward scale
            ep_ret += r
            if not d:
                ep_len += 1

            # save and log
            if use_rnn:
                buf.store(obs, act, r, v, logp, hid)
            else:
                buf.store(obs, act, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            obs = next_obs

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                # print(t)
                # if epoch_ended and not(terminal):
                #     print('Warning: trajectory cut off by epoch at %d steps.'
                #           % ep_len, flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                # if timeout or epoch_ended:
                if not terminal:
                    _, v, _, _ = ac.step(
                        torch.as_tensor(obs, dtype=torch.float32),
                        torch.as_tensor(hid, dtype=torch.float32))
                else:
                    v = 0

                buf.finish_path(v)

                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    if hasattr(env, 'layers') and hasattr(env, 'thicknesses'):
                        design_tracker.store(env.layers, env.thicknesses,
                                             ep_ret, epoch)

                        if rank == 0:
                            print(env.layers, env.thicknesses)

                obs, ep_ret, ep_len = env.reset(), 0, 0
                # reinitialize hidden state
                hid = np.zeros(ac_kwargs['cell_size']) if ac_kwargs['cell_size'] else np.zeros(1)
                if hasattr(env, "layers"):
                    logger.store(Act=act[1])

        # Save model

        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)
            design_tracker.save_state()

        # Perform PPO update!
        update()

        elapsed = time.time() - start_time
        epoch_time = time.time() - epoch_start_time
        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        if hasattr(env, 'layers'):
            logger.log_tabular('Act', with_min_and_max=True)
        logger.log_tabular('RetBuf', with_min_and_max=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', elapsed)
        logger.log_tabular('FPS', int(steps_per_epoch / epoch_time))
        logger.dump_tabular()
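For reference, the policy losses computed by `compute_loss_pi` and `compute_loss_pi_v_rnn` above are the negative of the standard PPO clipped surrogate, with probability ratio r_t(θ) and ε = clip_ratio; the RNN variant additionally subtracts `beta` times the policy entropy, as in the code above:

L^{\mathrm{CLIP}}(\theta) = \mathbb{E}_t\Big[\min\big(r_t(\theta)\,\hat{A}_t,\ \mathrm{clip}(r_t(\theta),\,1-\epsilon,\,1+\epsilon)\,\hat{A}_t\big)\Big],
\qquad
r_t(\theta) = \exp\big(\log \pi_\theta(a_t \mid s_t) - \log \pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)\big)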
Code example #27
def vpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(),  seed=0, 
        steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4,
        vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000,
        logger_kwargs=dict(), save_freq=10):
    """
    Vanilla Policy Gradient 

    (with GAE-Lambda for advantage estimation)

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    # obs_dim = env.observation_space.n
    act_dim = env.action_space.shape

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing VPG policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        loss_pi = -(logp * adv).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        pi_info = dict(kl=approx_kl, ent=ent)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        # Get loss and info values before update
        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with a single step of gradient descent
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        mpi_avg_grads(ac.pi)    # average grads across MPI processes
        pi_optimizer.step()

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            
            bayes_kl_loss = 0.
            if isinstance(ac.v, BayesMLPCritic):
                bayes_kl_loss = ac.v.compute_kl()

            total_loss_v = loss_v + bayes_kl_loss / data['obs'].shape[0]
            total_loss_v.backward()
            
            mpi_avg_grads(ac.v)    # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent = pi_info['kl'], pi_info_old['ent']
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old),
                     BayesKL=bayes_kl_loss)

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    epoch_reward = []
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)
            
            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t==local_steps_per_epoch-1

            if terminal or epoch_ended:
                if epoch_ended and not(terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    epoch_reward.append(ep_ret)  
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0


        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs-1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update()

        if epoch % 10 == 0:
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('DeltaLossPi', average_only=True)
            logger.log_tabular('DeltaLossV', average_only=True)
            logger.log_tabular('Entropy', average_only=True)
            logger.log_tabular('KL', average_only=True)
            logger.log_tabular('BayesKL', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
    
    return epoch_reward
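A minimal invocation sketch for this `vpg` function, assuming a standard Gym environment and the usual Spinning Up keyword names for `ac_kwargs` and `logger_kwargs` (all of which are assumptions here, not part of the original file):

# Sketch only: train VPG on CartPole and keep the per-epoch returns it yields.
import gym

epoch_rewards = vpg(lambda: gym.make('CartPole-v0'),
                    ac_kwargs=dict(hidden_sizes=(64, 64)),            # assumed MLPActorCritic kwarg
                    steps_per_epoch=4000,
                    epochs=50,
                    logger_kwargs=dict(output_dir='data/vpg_cartpole',  # assumed EpochLogger kwargs
                                       exp_name='vpg_cartpole'))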
Code example #28
File: ppo.py  Project: firefly34/implementations
def ppo(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):

    # Special function to avoid certain slowdowns from PyTorch + MPI combination
    setup_pytorch_for_mpi()

    # Setup logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random Seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate Environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor - critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Sync parameters across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(
        core.count_variables(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experiment buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up a function for computing PPO Policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[
            'logp']

        # Policy Loss
        pi, log_p = ac.pi(obs, act)
        ratio = torch.exp(log_p - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful Extra Information
        approx_kl = (logp_old - log_p).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clip_fraction = torch.as_tensor(clipped,
                                        dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clip_fraction)

        return loss_pi, pi_info

    # Setup function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Setup optimizers for policy and value functions
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with the environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs(critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
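The early-stopping rule in `update` relies on a simple sample estimate of the KL divergence between the old and new policies (the `approx_kl` value computed in `compute_loss_pi`, averaged across MPI processes):

\hat{D}_{\mathrm{KL}}\big(\pi_{\theta_{\mathrm{old}}}\,\|\,\pi_\theta\big) \approx \frac{1}{T}\sum_{t=1}^{T}\Big(\log \pi_{\theta_{\mathrm{old}}}(a_t \mid s_t) - \log \pi_\theta(a_t \mid s_t)\Big)

Policy gradient steps stop as soon as this estimate exceeds 1.5 * target_kl.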
Code example #29
def ppo(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization (by clipping), 

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================


        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[
            'logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
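The `buf.finish_path(v)` call used at trajectory boundaries is not shown in this excerpt. Below is a sketch of the GAE-Lambda advantage and reward-to-go computation it conventionally performs in Spinning Up-style buffers, matching the `gamma` and `lam` arguments documented above; the buffer attribute names (`rew_buf`, `val_buf`, `adv_buf`, `ret_buf`, `path_start_idx`, `ptr`) are the usual ones and are assumptions here.

# Sketch only: what finish_path(last_val) typically does in a PPO/VPG buffer.
import numpy as np
import scipy.signal

def discount_cumsum(x, discount):
    # [x0, x1, x2] -> [x0 + d*x1 + d^2*x2, x1 + d*x2, x2]
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

def finish_path(buf, last_val=0):
    path_slice = slice(buf.path_start_idx, buf.ptr)
    rews = np.append(buf.rew_buf[path_slice], last_val)
    vals = np.append(buf.val_buf[path_slice], last_val)

    # GAE-Lambda advantages: discounted sum of one-step TD errors.
    deltas = rews[:-1] + buf.gamma * vals[1:] - vals[:-1]
    buf.adv_buf[path_slice] = discount_cumsum(deltas, buf.gamma * buf.lam)

    # Rewards-to-go, used as targets for the value function.
    buf.ret_buf[path_slice] = discount_cumsum(rews, buf.gamma)[:-1]

    buf.path_start_idx = buf.ptr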
Code example #30
def rpg(env_fn=(lambda: gym.make("CartPole-v1")),
        max_traj_length=500,
        batch_size=100,
        num_epochs=100,
        hidden_sizes=(32, 32),
        activation=nn.Tanh,
        pi_lr=0.0003,
        logger_kwargs=dict(),
        writer_kwargs=dict()):
    """
    Assumes env is CartPole-v1; to clean this up later, make the dimensions general to any env.
    """
    env = env_fn()
    # Assume obs space is a Box
    obs_dim = env.observation_space.shape[0]
    # Assume action space is Discrete (categorical), which is why we evaluate .n (rather than .shape[0])
    act_dim = env.action_space.n
    pi = MLPCategoricalActor(obs_dim, act_dim, hidden_sizes, activation)
    pi_optimizer = Adam(pi.parameters(), lr=pi_lr)

    logger = EpochLogger(**logger_kwargs)
    logger.setup_pytorch_saver(pi)
    writer = SummaryWriter(**writer_kwargs)

    for ep in range(num_epochs):
        print(f"Epoch num: {ep}")

        # batch arrays; contains relevant data for batch of trajectories
        batch_rewards = torch.zeros(batch_size)
        batch_log_prob = torch.zeros(batch_size)
        for i in range(batch_size):
            o = env.reset()
            d = False  # bool for "done"

            # buffers for o, a, r; contains all values for one trajectory
            # assumes cartpole dimensions
            buffer_o = np.zeros((max_traj_length, obs_dim))
            buffer_a = np.zeros(max_traj_length)
            buffer_r = np.zeros(max_traj_length)
            ptr = 0  # pointer to the position in the buffer

            # take data for one entire trajectory
            while not d:
                o = torch.as_tensor(o, dtype=torch.float32)
                a = pi._distribution(
                    o).sample()  # sample Categorical policy to get an action
                o2, r, d, _ = env.step(a.numpy())
                o2 = torch.as_tensor(o2, dtype=torch.float32)

                buffer_o[ptr] = o.numpy()
                buffer_a[ptr] = a
                buffer_r[ptr] = r

                o = o2
                ptr += 1
                if ptr >= max_traj_length:
                    break

            # save traj data into batch arrays
            batch_rewards[i] = buffer_r[:ptr].sum()
            log_probs = pi._log_prob_from_distribution(
                pi._distribution(
                    torch.as_tensor(buffer_o[:ptr], dtype=torch.float32)),
                torch.as_tensor(buffer_a[:ptr], dtype=torch.float32))
            batch_log_prob[i] = log_probs.sum()

        # run one step of gradient descent optimizer
        pi_optimizer.zero_grad()
        loss = -1 * (batch_log_prob * batch_rewards).mean()
        loss.backward()
        pi_optimizer.step()

        # logging
        writer.add_scalar("pi loss", float(loss), ep)
        writer.add_scalar("avg return", float(batch_rewards.mean()), ep)
        if ep % 10 == 0:
            logger.save_state({'env': env}, None)  # also saves pi

    print("Done training the agent.")
    logger.save_state({'env': env}, None)  # also saves pi
    writer.close()
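The loss in this final example is the plain REINFORCE estimator: each trajectory's summed log-probabilities are weighted by its total undiscounted return, so the update approximates

\nabla_\theta J(\theta) \approx \frac{1}{N} \sum_{i=1}^{N} \Big( \sum_{t} \nabla_\theta \log \pi_\theta\big(a_t^{(i)} \mid s_t^{(i)}\big) \Big) R^{(i)}

with N = batch_size trajectories per epoch and no baseline, discounting, or advantage normalisation.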