Code Example #1
    paths = []

    for _ in range(N):
        observations = []
        actions = []
        rewards = []

        observation = env.reset()

        for _ in range(T):
            # policy.get_action() returns a pair of values: the sampled action and a dictionary whose values contain
            # sufficient statistics for the action distribution. The dictionary should at least contain the entries
            # that would be returned by calling policy.dist_info(), which is the non-symbolic analog of
            # policy.dist_info_sym(). Storing these statistics is useful, e.g., when forming importance sampling
            # ratios. In our case it is not needed.
            action, _ = policy.get_action(observation)
            # Recall that the last entry of the tuple stores diagnostic information about the environment. In our
            # case it is not needed.
            next_observation, reward, terminal, _ = env.step(action)
            observations.append(observation)
            actions.append(action)
            rewards.append(reward)
            observation = next_observation
            if terminal:
                # Finish rollout if terminal state reached
                break

        # We need to compute the empirical return for each time step along the
        # trajectory
        path = dict(
            observations=np.array(observations),
            actions=np.array(actions),
            rewards=np.array(rewards),
        )
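
For reference, the dictionary returned as the second element of policy.get_action() can be inspected directly. A minimal sketch, assuming the GaussianMLPPolicy used in the surrounding examples (whose info dict carries the mean and log standard deviation of the action distribution):

# Minimal sketch (assumes `policy` is a GaussianMLPPolicy and `observation` comes from env.reset()).
action, agent_info = policy.get_action(observation)
# For a diagonal Gaussian policy the info dict holds the distribution parameters,
# matching the keys returned by policy.dist_info().
print(agent_info["mean"], agent_info["log_std"])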
Code Example #2
def doit(mode):
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.normalized_env import normalize
    import numpy as np
    import theano
    import theano.tensor as TT
    from lasagne.updates import adam

    # normalize() makes sure that the actions for the environment lie
    # within the range [-1, 1] (only works for environments with continuous actions)
    env = normalize(CartpoleEnv())
    # Initialize a neural network policy with a single hidden layer of 8 hidden units
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,))
    # Initialize a baseline estimator; which one is used depends on `mode`
    if "linbaseline" in mode:
        print('linear baseline')
        baseline = LinearFeatureBaseline(env.spec)
    elif "vanilla" in mode:
        print("zero baseline")
        baseline = ZeroBaseline(env.spec)
    elif mode == "batchavg":
        print('batch average baseline')
        # use a zero baseline but subtract the mean of the discounted returns (see below)
        baseline = ZeroBaseline(env.spec)

    if "_ztrans" in mode:
        print('z transform advantages')
    else:
        print('no z transform')


    # We will collect 50 trajectories per iteration
    N = 50
    # Each trajectory will have at most 50 time steps
    T = 50
    # Number of iterations
    n_itr = 50
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.1

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However,
    # doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data
    # type for the variable. For instance, for an environment with discrete observations, we might want to use integer
    # types if the observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1
    )
    actions_var = env.action_space.new_tensor_variable(
        'actions',
        extra_dims=1
    )
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
    # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
    # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
    # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
    # rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Note that we negate the objective, since most optimizers assume a
    # minimization problem
    surr = - TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Get the list of trainable parameters.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True
    )

    results = []
    for _ in range(n_itr):

        paths = []

        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair of values: the sampled action and a dictionary whose values contain
                # sufficient statistics for the action distribution. The dictionary should at least contain the entries
                # that would be returned by calling policy.dist_info(), which is the non-symbolic analog of
                # policy.dist_info_sym(). Storing these statistics is useful, e.g., when forming importance sampling
                # ratios. In our case it is not needed.
                action, _ = policy.get_action(observation)
                # Recall that the last entry of the tuple stores diagnostic information about the environment. In our
                # case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break

            # We need to compute the empirical return for each time step along the
            # trajectory
            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantage = return_so_far - path_baseline[t]
                advantages.append(advantage)
            # The advantages are stored backwards in time, so we need to reverse them
            advantages = np.array(advantages[::-1])
            # And we need to do the same thing for the list of returns
            returns = np.array(returns[::-1])

            if "_ztrans" in mode:
                advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)


            path["advantages"] = advantages
            path["returns"] = returns

            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])


        if mode == 'batchavg':
            # in this case `advantages` up to here are just our good old returns, without baseline or z transformation.
            # now we subtract their mean across all episodes.
            advantages = advantages - np.mean(advantages)


        f_train(observations, actions, advantages)
        avgr = np.mean([sum(p["rewards"]) for p in paths])
        print('Average Return:', avgr)
        results.append(avgr)
    return results
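
A hypothetical driver for the function above (not part of the original snippet). The mode string combines a baseline choice ("linbaseline", "vanilla", or "batchavg") with an optional "_ztrans" suffix, exactly as parsed at the top of doit():

if __name__ == "__main__":
    # Hypothetical comparison run: linear baseline with z-transformed advantages vs. no baseline.
    results_lin = doit("linbaseline_ztrans")
    results_vanilla = doit("vanilla")
    print("final average return (linear baseline + z-transform):", results_lin[-1])
    print("final average return (zero baseline):", results_vanilla[-1])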
Code Example #3
# Imports assumed by this fragment (they are not shown in the original snippet):
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.misc import logger
import numpy as np
import theano
import theano.tensor as TT
from lasagne.updates import adam


def run_task(*_):
    # normalize() makes sure that the actions for the environment lie
    # within the range [-1, 1] (only works for environments with continuous actions)
    env = normalize(
        GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))
    # Initialize a neural network policy with two hidden layers of 64 units each
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 64))
    # Initialize a linear baseline estimator using default hand-crafted features
    baseline = LinearFeatureBaseline(env.spec)

    # We will collect 3 trajectories per iteration
    N = 3
    # Each trajectory will have at most 400 time steps
    T = 400
    # Number of iterations
    n_itr = 1000
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.001

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However,
    # doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data
    # type for the variable. For instance, for an environment with discrete observations, we might want to use integer
    # types if the observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1)
    actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
    # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
    # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
    # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
    # rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Note that we negate the objective, since most optimizers assume a
    # minimization problem
    surr = -TT.mean(
        dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Get the list of trainable parameters.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True)

    for epoch in range(n_itr):
        logger.push_prefix('epoch #%d | ' % epoch)
        logger.log("Training started")
        paths = []

        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair of values: the sampled action and a dictionary whose values contain
                # sufficient statistics for the action distribution. The dictionary should at least contain the entries
                # that would be returned by calling policy.dist_info(), which is the non-symbolic analog of
                # policy.dist_info_sym(). Storing these statistics is useful, e.g., when forming importance sampling
                # ratios. In our case it is not needed.
                action, _ = policy.get_action(observation)
                # Recall that the last entry of the tuple stores diagnostic information about the environment. In our
                # case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break

            # We need to compute the empirical return for each time step along the
            # trajectory
            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantage = return_so_far - path_baseline[t]
                advantages.append(advantage)
            # The advantages are stored backwards in time, so we need to reverse them
            advantages = np.array(advantages[::-1])
            # And we need to do the same thing for the list of returns
            returns = np.array(returns[::-1])

            advantages = (advantages -
                          np.mean(advantages)) / (np.std(advantages) + 1e-8)

            path["advantages"] = advantages
            path["returns"] = returns

            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        f_train(observations, actions, advantages)
        returns_to_check = [sum(p["rewards"]) for p in paths]
        print('Average Return:', np.mean(returns_to_check))

        ############################################################################
        logger.log("Training finished")
        logger.save_itr_params(epoch, params)
        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()

        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('Steps', epoch * N * T)
        logger.record_tabular('AverageReturn', np.mean(returns_to_check))
        logger.record_tabular('StdReturn', np.std(returns_to_check))
        logger.record_tabular('MaxReturn', np.max(returns_to_check))
        logger.record_tabular('MinReturn', np.min(returns_to_check))
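
The fragment above defines run_task() but does not show how it is launched. One typical way, assuming the standard rllab launcher (the exp_prefix value is hypothetical):

# Assumed launcher code: run_experiment_lite serializes run_task and executes it as an rllab experiment.
from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    n_parallel=1,
    snapshot_mode="last",  # keep only the parameters saved in the last epoch
    seed=1,
    exp_prefix="lunarlander_vpg",  # hypothetical experiment name
)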
Code Example #4
# Imports assumed by this fragment (they are not shown in the original snippet):
import math
import time
import numpy as np
import numpy.random as npr
import theano
import theano.tensor as TT
import lasagne
import lasagne.nonlinearities as NL
from rllab.envs.env_spec import EnvSpec
from rllab.spaces.box import Box
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy


class Bw_Trans_Model:
    def __init__(self, inputSize, outputSize, env, v, learning_rate, batchsize,
                 which_agent, x_index, y_index, num_fc_layers, depth_fc_layers,
                 print_minimal):

        #init vars
        #self.sess = sess
        self.batchsize = batchsize
        self.which_agent = which_agent
        self.x_index = x_index
        self.y_index = y_index
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.print_minimal = print_minimal

        LOW = -1000000
        HIGH = 1000000
        self.act_dim = env.spec.action_space.flat_dim
        self.obs_dim = env.spec.observation_space.flat_dim
        obs_to_act_spec = env.spec
        obsact_to_obs_spec = EnvSpec(observation_space=Box(
            LOW, HIGH, shape=(self.obs_dim + self.act_dim, )),
                                     action_space=Box(LOW,
                                                      HIGH,
                                                      shape=(self.obs_dim, )))

        #TODO: think about whether to learn the std of the backwards policy or not.
        self.bw_act_pol = GaussianMLPPolicy(
            env_spec=obs_to_act_spec,
            hidden_sizes=(64, 64),
            learn_std=v['bw_variance_learn'],
        )

        self.bw_obs_pol = GaussianMLPPolicy(
            env_spec=obsact_to_obs_spec,
            hidden_sizes=(v['bw_model_hidden_size'],
                          v['bw_model_hidden_size']),
            learn_std=v['bw_variance_learn'],
            hidden_nonlinearity=NL.rectify,
        )

        self.obs_in = TT.matrix('obs_in')
        self.obsact_in = TT.matrix('obsact_in')
        self.act_out = TT.matrix('act_out')
        self.diff_out = TT.matrix('diff_out')

        bw_learning_rate = v['bw_learning_rate']
        self.bw_act_dist = self.bw_act_pol.dist_info_sym(self.obs_in)
        self.bw_obs_dist = self.bw_obs_pol.dist_info_sym(self.obsact_in)
        self.bw_act_loss = -TT.sum(
            self.bw_act_pol.distribution.log_likelihood_sym(
                self.act_out, self.bw_act_dist))
        bw_obs_loss = -TT.sum(
            self.bw_obs_pol.distribution.log_likelihood_sym(
                self.diff_out, self.bw_obs_dist))

        bw_act_params = self.bw_act_pol.get_params_internal()
        bw_obs_params = self.bw_obs_pol.get_params_internal()
        #bw_params = bw_act_params + bw_obs_params
        bw_s_to_a_update = lasagne.updates.adam(self.bw_act_loss,
                                                bw_act_params,
                                                learning_rate=bw_learning_rate)
        bw_sa_to_s_update = lasagne.updates.adam(
            bw_obs_loss, bw_obs_params, learning_rate=bw_learning_rate)

        self.bw_act_train = theano.function([self.obs_in, self.act_out],
                                            self.bw_act_loss,
                                            updates=bw_s_to_a_update,
                                            allow_input_downcast=True)
        self.bw_obs_train = theano.function([self.obsact_in, self.diff_out],
                                            bw_obs_loss,
                                            updates=bw_sa_to_s_update,
                                            allow_input_downcast=True)

    def train(self, dataX, dataZ, dataX_new, dataZ_new, nEpoch, save_dir,
              fraction_use_new):

        #init vars
        start = time.time()
        training_loss_list = []
        nData_old = dataX.shape[0]
        num_new_pts = dataX_new.shape[0]

        #how much of new data to use per batch
        if (num_new_pts < (self.batchsize * fraction_use_new)):
            batchsize_new_pts = num_new_pts  #use all of the new ones
        else:
            batchsize_new_pts = int(self.batchsize * fraction_use_new)

        #how much of old data to use per batch
        batchsize_old_pts = int(self.batchsize - batchsize_new_pts)

        #training loop
        for i in range(nEpoch):

            #reset to 0
            avg_loss = 0
            num_batches = 0

            if (batchsize_old_pts > 0):
                #mixing old data into the batch is not implemented in this fragment
                print("nothing is going on")

            #train completely from new set
            else:
                for batch in range(
                        int(math.floor(num_new_pts / batchsize_new_pts))):

                    #walk through the shuffled new data
                    dataX_batch = dataX_new[batch *
                                            batchsize_new_pts:(batch + 1) *
                                            batchsize_new_pts, :]
                    dataZ_batch = dataZ_new[batch *
                                            batchsize_new_pts:(batch + 1) *
                                            batchsize_new_pts, :]

                    #dataX_batch holds concatenated (observation, action) rows; split them apart
                    data_x = dataX_batch[:, 0:self.obs_dim]
                    data_y = dataX_batch[:, self.obs_dim:]

                    loss = self.bw_act_train(data_x, data_y)
                    bw_obs_losses = self.bw_obs_train(dataX_batch, dataZ_batch)

                    training_loss_list.append(loss)
                    avg_loss += bw_obs_losses  #[0]
                    num_batches += 1

                #shuffle new dataset after an epoch (if training only on it)
                p = npr.permutation(dataX_new.shape[0])
                dataX_new = dataX_new[p]
                dataZ_new = dataZ_new[p]

            #save losses after an epoch
            np.save(save_dir + '/training_losses.npy', training_loss_list)
            if (not (self.print_minimal)):
                if ((i % 10) == 0):
                    print("\n=== Epoch {} ===".format(i))
                    print("loss: ", avg_loss / num_batches)

        if (not (self.print_minimal)):
            print("Training set size: ", (nData_old + dataX_new.shape[0]))
            print("Training duration: {:0.2f} s".format(time.time() - start))

        #done
        return (avg_loss / num_batches)  #, old_loss, new_loss

    #multistep prediction using the learned dynamics model at each step
    def do_forward_sim(self, forwardsim_x_true, num_step, many_in_parallel,
                       env_inp, which_agent, mean_x, mean_y, mean_z, std_x,
                       std_y, std_z):

        #init vars
        state_list = []
        action_list = []
        if (many_in_parallel):
            #parallel multistep prediction is not implemented in this fragment
            print("Future work..")
        else:
            curr_state = np.copy(
                forwardsim_x_true)  #curr state is of dim NN input
            for i in range(num_step):
                curr_state_preprocessed = curr_state - mean_x
                curr_state_preprocessed = np.nan_to_num(
                    curr_state_preprocessed / std_x)
                action = self.bw_act_pol.get_action(curr_state_preprocessed)[0]
                action_ = action * std_y + mean_y
                state_difference = self.bw_obs_pol.get_action(
                    np.concatenate((curr_state_preprocessed, action)))[0]
                state_differences = (state_difference * std_z) + mean_z
                next_state = curr_state + state_differences
                #copy the state info
                curr_state = np.copy(next_state)
                state_list.append(np.copy(curr_state))
                action_list.append(np.copy(action_))

        return state_list, action_list
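
A rough construction sketch for this class. Only env and the keys of the v dictionary that __init__ actually reads are taken from the fragment above; the environment choice and every numeric value are illustrative:

# Hypothetical setup; CartpoleEnv and normalize are imported as in Code Example #2.
env = normalize(CartpoleEnv())
v = dict(bw_variance_learn=True, bw_model_hidden_size=64, bw_learning_rate=1e-3)
obs_dim = env.spec.observation_space.flat_dim
act_dim = env.spec.action_space.flat_dim

bw_model = Bw_Trans_Model(
    inputSize=obs_dim + act_dim, outputSize=obs_dim, env=env, v=v,
    learning_rate=1e-3, batchsize=512, which_agent=0, x_index=0, y_index=1,
    num_fc_layers=2, depth_fc_layers=64, print_minimal=False)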