Code Example #1
def doit(mode):
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.normalized_env import normalize
    import numpy as np
    import theano
    import theano.tensor as TT
    from lasagne.updates import adam

    # normalize() makes sure that the actions for the environment lie
    # within the range [-1, 1] (only works for environments with continuous actions)
    env = normalize(CartpoleEnv())
    # Initialize a neural network policy with a single hidden layer of 8 hidden units
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,))
    # Initialize a linear baseline estimator using default hand-crafted features
    if "linbaseline" in mode:
        print('linear baseline')
        baseline = LinearFeatureBaseline(env.spec)
    elif "vanilla" in mode:
        print("zero baseline")
        baseline = ZeroBaseline(env.spec)
    elif mode == "batchavg":
        print('batch average baseline')
        # use a zero baseline but subtract the mean of the discounted returns (see below)
        baseline = ZeroBaseline(env.spec)

    if "_ztrans" in mode:
        print('z transform advantages')
    else:
        print('no z transform')


    # We will collect 50 trajectories per iteration
    N = 50
    # Each trajectory will have at most 50 time steps
    T = 50
    # Number of iterations
    n_itr = 50
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.1

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However,
    # doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data
    # type for the variable. For instance, for an environment with discrete observations, we might want to use integer
    # types if the observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1
    )
    actions_var = env.action_space.new_tensor_variable(
        'actions',
        extra_dims=1
    )
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
    # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
    # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
    # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
    # rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Note that we negate the objective, since most optimizers assume a
    # minimization problem
    surr = - TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Get the list of trainable parameters.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True
    )

    results = []
    for _ in range(n_itr):

        paths = []

        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair of values. The second one is a dictionary whose values contain
                # sufficient statistics for the action distribution. It should at least contain entries that would be
                # returned by calling policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym().
                # Storing these statistics is useful, e.g., when forming importance sampling ratios. In our case it is
                # not needed.
                action, _ = policy.get_action(observation)
                # The last entry of the tuple returned by env.step() stores diagnostic information about the
                # environment. In our case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break

            # We need to compute the empirical return for each time step along the
            # trajectory
            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantage = return_so_far - path_baseline[t]
                advantages.append(advantage)
            # The advantages are stored backwards in time, so we need to reverse them
            advantages = np.array(advantages[::-1])
            # And we need to do the same thing for the list of returns
            returns = np.array(returns[::-1])

            if "_ztrans" in mode:
                advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)


            path["advantages"] = advantages
            path["returns"] = returns

            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])


        if mode == 'batchavg':
            # In this case the `advantages` computed so far are just the discounted returns, with no baseline
            # subtraction or z-transformation; now we subtract their mean across all collected episodes.
            advantages = advantages - np.mean(advantages)


        f_train(observations, actions, advantages)
        avgr = np.mean([sum(p["rewards"]) for p in paths])
        print('Average Return:', avgr)
        results.append(avgr)
    return results
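A minimal way to invoke doit (a usage sketch added here, not part of the original file; the mode strings mirror the branches checked inside the function):

results_vanilla = doit("vanilla")               # zero baseline
results_linbase = doit("linbaseline")           # linear feature baseline
results_linbase_z = doit("linbaseline_ztrans")  # linear baseline + z-transformed advantages
results_batchavg = doit("batchavg")             # subtract the batch-average return instead
print("final average return (linear baseline):", results_linbase[-1])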
Code Example #2
def train(variant):
    set_global_seeds(variant['seed'])

    if variant['mode'] == 'local':
        import colored_traceback.always
    '''
    Set-up folder and files
    '''
    snapshot_dir = logger.get_snapshot_dir()
    working_dir = config.PROJECT_PATH
    param_path = os.path.join(working_dir, 'params/params.json')
    # copyfile(param_path, os.path.join(snapshot_dir,'params.json'))

    try:
        '''
        Save parameters
        '''
        if 'params' in variant:
            logger.log('Load params from variant.')
            params = variant['params']
        else:
            logger.log('Load params from file.')
            with open(param_path, 'r') as f:
                params = json.load(f)

        # Save to snapshot dir
        new_param_path = os.path.join(snapshot_dir, 'params.json')
        with open(new_param_path, 'w') as f:
            json.dump(params,
                      f,
                      sort_keys=True,
                      indent=4,
                      separators=(',', ': '))

        # TODO: can use variant to modify here.
        dynamics_opt_params = params['dynamics_opt_params']
        dynamics_opt_params['stop_critereon'] = stop_critereon(
            threshold=dynamics_opt_params['stop_critereon']['threshold'],
            offset=dynamics_opt_params['stop_critereon']['offset'])
        dynamics_opt_params = Dynamics_opt_params(**dynamics_opt_params)

        policy_opt_params = params['policy_opt_params']
        policy_opt_params['stop_critereon'] = stop_critereon(
            threshold=policy_opt_params['stop_critereon']['threshold'],
            offset=policy_opt_params['stop_critereon']['offset'],
            percent_models_threshold=policy_opt_params['stop_critereon']
            ['percent_models_threshold'])
        policy_opt_params = Policy_opt_params(**policy_opt_params)

        rollout_params = params['rollout_params']
        rollout_params['monitorpath'] = os.path.join(snapshot_dir, 'videos')
        rollout_params = Rollout_params(**rollout_params)

        assert params['rollout_params']['max_timestep'] == \
               params['policy_opt_params']['oracle_maxtimestep'] == \
               params['policy_opt_params']['T']
        '''
        Policy model
        '''
        def build_policy_from_rllab(scope_name='training_policy'):
            '''
            Return both rllab policy and policy model function.
            '''
            sess = tf.get_default_session()

            ### Initialize training_policy to copy from policy
            from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
            output_nonlinearity = eval(params['policy']['output_nonlinearity'])

            training_policy = GaussianMLPPolicy(
                name=scope_name,
                env_spec=env.spec,
                hidden_sizes=params['policy']['hidden_layers'],
                init_std=policy_opt_params.trpo['init_std'],
                output_nonlinearity=output_nonlinearity)
            training_policy_vars = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES, scope='training_policy')
            sess.run([tf.variables_initializer(training_policy_vars)])

            ### Compute policy model function using the same weights.
            training_layers = training_policy._mean_network.layers

            def policy_model(x, stochastic=0.0, collect_summary=False):
                assert (training_layers[0].shape[1] == x.shape[1])
                h = x
                for i, layer in enumerate(training_layers[1:]):
                    w = layer.W
                    b = layer.b
                    pre_h = tf.matmul(h, w) + b
                    h = layer.nonlinearity(pre_h, name='policy_out')
                    if collect_summary:
                        with tf.name_scope(scope_name + '/observation'):
                            variable_summaries(x)
                        with tf.name_scope(scope_name + '/layer%d' % i):
                            with tf.name_scope('weights'):
                                variable_summaries(w)
                            with tf.name_scope('biases'):
                                variable_summaries(b)
                            with tf.name_scope('Wx_plus_b'):
                                tf.summary.histogram('pre_activations', pre_h)
                            tf.summary.histogram('activations', h)
                std = training_policy._l_std_param.param
                h += stochastic * tf.random_normal(
                    shape=(tf.shape(x)[0], n_actions)) * tf.exp(std)
                return h

            return training_policy, policy_model

        '''
        Dynamics model
        '''

        def get_value(key, dict):
            return key in dict and dict[key]

        def prepare_input(xgu, xgu_norm, scope_name, variable_name,
                          collect_summary, prediction_type):
            name_scope = '%s/%s' % (scope_name, variable_name)
            assert n_states > 1 and n_actions > 1 \
                   and xgu.shape[1] == n_states + n_actions + n_goals
            xu = tf.concat([xgu[:, :n_states], xgu[:, n_states + n_goals:]],
                           axis=1)
            xu_norm = tf.concat(
                [xgu_norm[:, :n_states], xgu_norm[:, n_states + n_goals:]],
                axis=1)
            # Collect data summaries
            if collect_summary:
                with tf.name_scope(name_scope + '/inputs'):
                    with tf.name_scope('states'):
                        data_summaries(xgu[:, :n_states])
                    with tf.name_scope('goals'):
                        data_summaries(xgu[:, n_states:n_states + n_goals])
                    with tf.name_scope('actions'):
                        data_summaries(xgu[:, n_states + n_goals:])
            # Ignore xy in the current state.
            if get_value('ignore_xy_input', params['dynamics_model']):
                n_inputs = n_states + n_actions - 2
                nn_input = xu_norm[:, 2:]
            elif get_value('ignore_x_input', params['dynamics_model']):
                n_inputs = n_states + n_actions - 1
                nn_input = xu_norm[:, 1:]
            else:
                n_inputs = n_states + n_actions
                nn_input = xu_norm
            hidden_layers = list(params['dynamics_model']['hidden_layers'])
            nonlinearity = [
                eval(_x) for _x in params['dynamics_model']['nonlinearity']
            ]
            assert (len(nonlinearity) == len(hidden_layers))
            # Verify if the input type is valid.
            if prediction_type == 'state_change' or \
                            prediction_type == 'state_change_goal':
                n_outputs = n_states
            else:
                assert prediction_type == 'second_derivative' or \
                       prediction_type == 'second_derivative_goal'
                n_outputs = int(n_states / 2)
            nonlinearity.append(tf.identity)
            hidden_layers.append(n_outputs)
            return xu, nn_input, n_inputs, n_outputs, \
                   nonlinearity, hidden_layers

        def build_ff_neural_net(nn_input,
                                n_inputs,
                                hidden_layers,
                                nonlinearity,
                                scope_name,
                                variable_name,
                                collect_summary,
                                logit_weights=None,
                                initializer=layers.xavier_initializer(),
                                dropout=False):
            assert len(hidden_layers) == len(nonlinearity)
            name_scope = '%s/%s' % (scope_name, variable_name)
            h = nn_input
            n_hiddens = n_inputs
            n_hiddens_next = hidden_layers[0]
            for i in range(len(hidden_layers)):
                w = get_scope_variable(scope_name,
                                       "%s/layer%d/weights" %
                                       (variable_name, i),
                                       shape=(n_hiddens, n_hiddens_next),
                                       initializer=initializer)
                b = get_scope_variable(scope_name,
                                       "%s/layer%d/biases" %
                                       (variable_name, i),
                                       shape=(n_hiddens_next),
                                       initializer=initializer)
                if collect_summary:
                    with tf.name_scope(name_scope + '/layer%d' % i):
                        with tf.name_scope('weights'):
                            variable_summaries(w)
                        with tf.name_scope('biases'):
                            variable_summaries(b)
                        with tf.name_scope('Wx_plus_b'):
                            pre_h = tf.matmul(h, w) + b
                            # Yunfei: dropout option is useless now
                            if dropout:
                                # if i == 0:
                                #     pre_h = tf.nn.dropout(tf.matmul(h,w), keep_prob=0.8) + b
                                # else:
                                pre_h = tf.nn.dropout(tf.matmul(h, w),
                                                      keep_prob=dropout) + b
                            tf.summary.histogram('pre_activations', pre_h)
                        h = nonlinearity[i](pre_h, name='activation')
                        tf.summary.histogram('activations', h)
                else:
                    pre_h = tf.matmul(h, w) + b
                    h = nonlinearity[i](pre_h, name='activation')
                n_hiddens = hidden_layers[i]
                if i + 1 < len(hidden_layers):
                    n_hiddens_next = hidden_layers[i + 1]
                if logit_weights is not None and i == len(hidden_layers) - 2:
                    h *= logit_weights
            return h

        def build_dynamics_model(n_states,
                                 n_actions,
                                 n_goals,
                                 dt=None,
                                 input_rms=None,
                                 diff_rms=None):
            prediction_type = params['dynamics_model']['prediction_type']

            def dynamics_model(xgu,
                               scope_name,
                               variable_name,
                               collect_summary=False):
                '''
                :param xu: contains states, goals, actions
                :param scope_name:
                :param variable_name:
                :param dt:
                :return:
                '''
                xu, nn_input, n_inputs, n_outputs, nonlinearity, hidden_layers = \
                    prepare_input(xgu,
                                  (xgu - input_rms.mean)/input_rms.std,
                                  scope_name,
                                  variable_name,
                                  collect_summary,
                                  prediction_type)

                if "use_logit_weights" in params["dynamics_model"] and params[
                        "dynamics_model"]["use_logit_weights"]:
                    logit_weights = build_ff_neural_net(
                        nn_input, n_inputs, hidden_layers[:-1],
                        nonlinearity[:-2] + [tf.nn.sigmoid], scope_name,
                        variable_name + '_sig', collect_summary)
                else:
                    logit_weights = None
                if "dropout" in params["dynamics_model"]:
                    dropout_keep_prob = params["dynamics_model"]["dropout"]
                else:
                    dropout_keep_prob = False
                nn_output = build_ff_neural_net(nn_input,
                                                n_inputs,
                                                hidden_layers,
                                                nonlinearity,
                                                scope_name,
                                                variable_name,
                                                collect_summary,
                                                logit_weights=logit_weights,
                                                dropout=dropout_keep_prob)

                # predict the delta instead (x_next-x_current)
                if 'state_change' in prediction_type:
                    next_state = tf.add(
                        diff_rms.mean[:n_states] +
                        diff_rms.std[:n_outputs] * nn_output, xu[:, :n_states])
                else:
                    assert 'second_derivative' in prediction_type
                    # We train 'out' to match state_dot_dot
                    # Currently only works for swimmer.
                    qpos = xu[:, :n_outputs] + dt * xu[:, n_outputs:n_states]
                    qvel = xu[:, n_outputs:n_states] + dt * nn_output
                    next_state = tf.concat([qpos, qvel], axis=1)
                if '_goal' in prediction_type:
                    assert n_goals > 1
                    g = xgu[:, n_states:n_states + n_goals]
                    next_state = tf.concat([next_state, g], axis=1)
                return tf.identity(next_state,
                                   name='%s/%s/dynamics_out' %
                                   (scope_name, variable_name))

            return dynamics_model

        def get_regularizer_loss(scope_name, variable_name):
            if params['dynamics_model']['regularization']['method'] in [
                    None, ''
            ]:
                return tf.constant(0.0, dtype=tf.float32)
            constant = params['dynamics_model']['regularization']['constant']
            regularizer = eval(
                params['dynamics_model']['regularization']['method'])
            hidden_layers = params['dynamics_model']['hidden_layers']
            reg_loss = 0.0
            for i in range(len(hidden_layers) + 1):
                w = get_scope_variable(
                    scope_name, "%s/layer%d/weights" % (variable_name, i))
                b = get_scope_variable(
                    scope_name, "%s/layer%d/biases" % (variable_name, i))
                reg_loss += regularizer(w) + regularizer(b)
            return constant * reg_loss

        '''
        Main
        '''
        # with get_session() as sess:
        if variant['mode'] == 'local':
            sess = get_session(interactive=True, mem_frac=0.1)
        else:
            sess = get_session(interactive=True,
                               mem_frac=1.0,
                               use_gpu=variant['use_gpu'])

        # data = joblib.load(os.path.join(working_dir, params['trpo_path']))
        env = get_env(variant['params']['env'])

        # policy = data['policy']
        training_policy, policy_model = build_policy_from_rllab()
        if hasattr(env._wrapped_env, '_wrapped_env'):
            inner_env = env._wrapped_env._wrapped_env
        else:
            inner_env = env._wrapped_env.env.unwrapped
        n_obs = inner_env.observation_space.shape[0]
        n_actions = inner_env.action_space.shape[0]
        cost_np = inner_env.cost_np
        cost_tf = inner_env.cost_tf
        cost_np_vec = inner_env.cost_np_vec
        if hasattr(inner_env, 'n_goals'):
            n_goals = inner_env.n_goals
            n_states = inner_env.n_states
            assert n_goals + n_states == n_obs
        else:
            n_goals = 0
            n_states = n_obs
        dt = None
        # Only necessary for second_derivative
        if hasattr(inner_env, 'model') and hasattr(inner_env, 'frame_skip'):
            dt = inner_env.model.opt.timestep * inner_env.frame_skip
        from running_mean_std import RunningMeanStd
        with tf.variable_scope('input_rms'):
            input_rms = RunningMeanStd(epsilon=0.0,
                                       shape=(n_states + n_goals + n_actions))
        with tf.variable_scope('diff_rms'):
            diff_rms = RunningMeanStd(epsilon=0.0, shape=(n_states + n_goals))
        dynamics_model = build_dynamics_model(n_states=n_states,
                                              n_actions=n_actions,
                                              n_goals=n_goals,
                                              dt=dt,
                                              input_rms=input_rms,
                                              diff_rms=diff_rms)

        kwargs = {}
        kwargs['input_rms'] = input_rms
        kwargs['diff_rms'] = diff_rms
        kwargs['mode'] = variant['mode']

        if params['algo'] == 'vpg':
            from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
            from algos.vpg import VPG
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = VPG(
                env=env,
                policy=training_policy,
                baseline=baseline,
                batch_size=policy_opt_params.vpg['batch_size'],
                max_path_length=policy_opt_params.T,
                discount=policy_opt_params.vpg['discount'],
            )
            kwargs['rllab_algo'] = algo
            if params["policy_opt_params"]["vpg"]["reset"]:
                kwargs['reset_opt'] = tf.assign(
                    training_policy._l_std_param.param,
                    np.log(params["policy_opt_params"]["vpg"]["init_std"]) *
                    np.ones(n_actions))
        elif params['algo'] == 'trpo':
            ### Write down baseline and algo
            from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
            from algos.trpo import TRPO
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = TRPO(
                env=env,
                policy=training_policy,
                baseline=baseline,
                batch_size=policy_opt_params.trpo['batch_size'],
                max_path_length=policy_opt_params.T,
                discount=policy_opt_params.trpo['discount'],
                step_size=policy_opt_params.trpo['step_size'],
            )
            kwargs['rllab_algo'] = algo
            if params["policy_opt_params"]["trpo"]["reset"]:
                kwargs['reset_opt'] = tf.assign(
                    training_policy._l_std_param.param,
                    np.log(params["policy_opt_params"]["trpo"]["init_std"]) *
                    np.ones(n_actions))
            # if "decay_rate" in params["policy_opt_params"]["trpo"]:
            #     kwargs['trpo_std_decay'] = tf.assign_sub(training_policy._l_std_param.param,
            #     np.log(params["policy_opt_params"]["trpo"]["decay_rate"])*np.ones(n_actions))
        kwargs['inner_env'] = inner_env
        kwargs['algo_name'] = params['algo']
        kwargs['logstd'] = training_policy._l_std_param.param
        # Save initial policy
        joblib.dump(training_policy,
                    os.path.join(snapshot_dir, 'params-initial.pkl'))

        train_models(env=env,
                     dynamics_model=dynamics_model,
                     dynamics_opt_params=dynamics_opt_params,
                     get_regularizer_loss=get_regularizer_loss,
                     policy_model=policy_model,
                     policy_opt_params=policy_opt_params,
                     rollout_params=rollout_params,
                     cost_np=cost_np,
                     cost_np_vec=cost_np_vec,
                     cost_tf=cost_tf,
                     snapshot_dir=snapshot_dir,
                     working_dir=working_dir,
                     n_models=params['n_models'],
                     sweep_iters=params['sweep_iters'],
                     sample_size=params['sample_size'],
                     verbose=False,
                     variant=variant,
                     saved_policy=training_policy,
                     **kwargs)  # Make sure not to reinitialize TRPO policy.

        # Save the final policy
        joblib.dump(training_policy, os.path.join(snapshot_dir, 'params.pkl'))

    except Exception as e:
        rmtree(snapshot_dir)
        import sys, traceback
        # traceback.print_exception(*sys.exc_info())
        from IPython.core.ultratb import ColorTB
        c = ColorTB()
        exc = sys.exc_info()
        print(''.join(c.structured_traceback(*exc)))
        print('Removed the experiment folder %s.' % snapshot_dir)
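A hypothetical call site for train (the dictionary keys follow the variant lookups inside the function; the concrete values are illustrative assumptions, not taken from the project):

import json

variant = dict(
    seed=0,
    mode='local',    # 'local' enables colored tracebacks and a small GPU memory fraction
    use_gpu=False,   # only read when mode != 'local'
    params=json.load(open('params/params.json')),  # must contain the 'env' key read via variant['params']['env']
)
train(variant)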
Code Example #3
# policies
if load_policy is None:
    policy_list = [BMAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=fast_lr,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100,100),
        particle_idx=n) 
        for n in range(num_particles)]
else:  # will be loaded
    policy_list = [0]*num_particles

# baseline
baseline_list = [LinearFeatureBaseline(env_spec=env.spec) for n in range(num_particles)]

# meta learning methods
if meta_method == 'chaser':
    algo = BMAMLCHASER(
        env=env,
        policy_list=policy_list,
        baseline_list=baseline_list,
        batch_size=fast_batch_size, # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        num_leader_grad_updates=num_leader_grad_updates,
        random_seed=random_seed,
        svpg=svpg,
        svpg_alpha=svpg_alpha,
Code Example #4
File: vpg_2.py  Project: hl00/maml_rl
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.envs.normalized_env import normalize
import numpy as np
import theano
import theano.tensor as TT
from lasagne.updates import adam

# normalize() makes sure that the actions for the environment lie
# within the range [-1, 1] (only works for environments with continuous actions)
env = normalize(CartpoleEnv())
# Initialize a neural network policy with a single hidden layer of 8 hidden units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8, ))
# Initialize a linear baseline estimator using default hand-crafted features
baseline = LinearFeatureBaseline(env.spec)

# We will collect 100 trajectories per iteration
N = 100
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 100
# Set the discount factor for the problem
discount = 0.99
# Learning rate for the gradient update
learning_rate = 0.1

# Construct the computation graph

# Create a Theano variable for storing the observations
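The snippet is cut off at this point; Code Example #1 above walks through the same script, so the graph construction would presumably continue along these lines (a sketch reproduced from that example, not necessarily this file's verbatim continuation):

observations_var = env.observation_space.new_tensor_variable('observations', extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
advantages_var = TT.vector('advantages')

# Surrogate objective: negated, since the optimizer minimizes
dist_info_vars = policy.dist_info_sym(observations_var)
dist = policy.distribution
surr = -TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

params = policy.get_params(trainable=True)
grads = theano.grad(surr, params)
f_train = theano.function(
    inputs=[observations_var, actions_var, advantages_var],
    outputs=None,
    updates=adam(grads, params, learning_rate=learning_rate),
    allow_input_downcast=True
)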
Code Example #5
    def __init__(
            self,
            optimizer=None,
            optimizer_args=None,
            step_size=0.003,
            num_latents=6,
            latents=None,  # some sort of iterable of the actual latent vectors
            period=10,  # how often I choose a latent
            truncate_local_is_ratio=None,
            epsilon=0.1,
            train_pi_iters=10,
            use_skill_dependent_baseline=False,
            mlp_skill_dependent_baseline=False,
            freeze_manager=False,
            freeze_skills=False,
            **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                # optimizer_args = dict()
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(learning_rate=step_size,
                                                 max_epochs=train_pi_iters,
                                                 **optimizer_args)
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(ConcurrentContinuousPPO,
              self).__init__(**kwargs)  # not sure if this line is correct
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.period = period
        self.freeze_manager = freeze_manager
        self.freeze_skills = freeze_skills
        assert (not freeze_manager) or (not freeze_skills)

        # todo: fix this sampler stuff
        # import pdb; pdb.set_trace()
        self.sampler = HierBatchSampler(self, self.period)
        # self.sampler = BatchSampler(self)
        # i hope this is right
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicy)
        self.period = self.policy.period
        assert self.policy.period == self.period
        self.continuous_latent = self.policy.continuous_latent
        assert self.continuous_latent
        # self.old_policy = copy.deepcopy(self.policy)

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            new_obs_space_no_bi = curr_env.observation_space.shape[
                0] + 1  # 1 for the t_remaining
            skill_dependent_obs_space_dim = (new_obs_space_no_bi *
                                             (self.num_latents + 1) +
                                             self.num_latents, )
            skill_dependent_obs_space = Box(
                -1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space,
                                               skill_dependent_action_space)
            if self.mlp_skill_dependent_baseline:
                self.skill_dependent_baseline = GaussianMLPBaseline(
                    env_spec=skill_dependent_env_spec)
            else:
                self.skill_dependent_baseline = LinearFeatureBaseline(
                    env_spec=skill_dependent_env_spec)
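From the super() call, the surrounding class is ConcurrentContinuousPPO; a hedged sketch of instantiating it based on the keyword arguments the constructor reads (the env, policy, and baseline objects below are placeholders and must satisfy the asserts above, e.g. the policy has to be a HierarchicalPolicy exposing latent_dim, period, low_policy, and continuous_latent):

algo = ConcurrentContinuousPPO(
    env=env,                      # read via kwargs['env'] when use_skill_dependent_baseline is True
    policy=hierarchical_policy,   # placeholder HierarchicalPolicy instance
    baseline=baseline,
    step_size=0.003,
    train_pi_iters=10,
    epsilon=0.1,
    period=10,
    use_skill_dependent_baseline=True,
)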
Code Example #6
def run_task(*_):
    env = normalize(GymEnv(args.env))
    # env.wrapped_env.env.env.env.reward_flag = 'absolute'
    env.wrapped_env.env.env.reward_flag = args.reward

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    learn_std = True
    init_std = 2

    # hidden_sizes=(8,)
    hidden_sizes = (32, 32)
    # hidden_sizes=(100, 50, 25)

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=hidden_sizes,
                               learn_std=learn_std,
                               init_std=init_std)

    # =======================
    # Defining the algorithm
    # =======================
    batch_size = 5000
    n_itr = args.n_itr
    gamma = .9
    step_size = 0.01

    if args.algorithm == 0:
        algo = VPG(env=env,
                   policy=policy,
                   baseline=baseline,
                   batch_size=batch_size,
                   n_itr=n_itr,
                   discount=gamma,
                   step_size=step_size)
    if args.algorithm == 1:
        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    n_itr=n_itr,
                    discount=gamma,
                    step_size=step_size)
    if args.algorithm == 2:
        algo = TNPG(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    n_itr=n_itr,
                    discount=gamma,
                    step_size=step_size)
    # if args.algorithm == 4:
    # algo = DDPG(
    # env=env,
    # policy=policy,
    # baseline=baseline,
    # batch_size=batch_size,
    # n_itr=n_itr,
    # discount=gamma,
    # step_size=step_size
    # )
    algo.train()

    return algo
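run_task here is presumably handed to rllab's run_experiment_lite launcher, much as Code Example #11 does; a minimal sketch (worker count and seed are illustrative):

run_experiment_lite(
    run_task,
    n_parallel=4,
    snapshot_mode="last",
    seed=1,
)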
Code Example #7
def run_task(*_):
    v_enter = 30
    inner_length = 800
    long_length = 100
    short_length = 800
    n = 1
    m = 5
    num_cars_left = 3
    num_cars_right = 3
    num_cars_top = 15
    num_cars_bot = 15
    tot_cars = (num_cars_left + num_cars_right) * m \
        + (num_cars_bot + num_cars_top) * n

    grid_array = {
        "short_length": short_length,
        "inner_length": inner_length,
        "long_length": long_length,
        "row_num": n,
        "col_num": m,
        "cars_left": num_cars_left,
        "cars_right": num_cars_right,
        "cars_top": num_cars_top,
        "cars_bot": num_cars_bot
    }

    sumo_params = SumoParams(sim_step=1, sumo_binary="sumo-gui")

    vehicles = Vehicles()
    vehicles.add(veh_id="idm",
                 acceleration_controller=(SumoCarFollowingController, {}),
                 sumo_car_following_params=SumoCarFollowingParams(
                     minGap=2.5,
                     max_speed=v_enter,
                 ),
                 routing_controller=(GridRouter, {}),
                 num_vehicles=tot_cars,
                 speed_mode="all_checks")

    additional_env_params = {
        "target_velocity": 50,
        "num_steps": 500,
        "control-length": 150,
        "switch_time": 3.0
    }
    env_params = EnvParams(additional_params=additional_env_params)

    additional_net_params = {
        "speed_limit": 35,
        "grid_array": grid_array,
        "horizontal_lanes": 1,
        "vertical_lanes": 1,
        "traffic_lights": True
    }

    initial_config, net_params = get_non_flow_params(10, additional_net_params)

    scenario = SimpleGridScenario(name="grid-intersection",
                                  generator_class=SimpleGridGenerator,
                                  vehicles=vehicles,
                                  net_params=net_params,
                                  initial_config=initial_config)

    env_name = "GreenWaveEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=40000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=800,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
Code Example #8
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']
    samples_per_cell = 10  # for the oracle rejection sampling

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'],
                                                    bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    plot_policy_means(policy,
                      env,
                      sampling_res=sampling_res,
                      report=report,
                      limit=v['goal_range'],
                      center=v['goal_center'])
    test_and_plot_policy(policy,
                         env,
                         as_goals=False,
                         max_reward=v['max_reward'],
                         sampling_res=sampling_res,
                         n_traj=v['n_traj'],
                         itr=outer_iter,
                         report=report,
                         center=v['goal_center'],
                         limit=v['goal_range'])
    report.new_row()

    all_starts = StateCollection(distance_threshold=v['coll_eps'])

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    # Use a double horizon because the horizon is shared between Alice and Bob.
    env_alice = AliceEnv(env_alice=env,
                         env_bob=env,
                         policy_bob=policy,
                         max_path_length=v['alice_horizon'],
                         alice_factor=v['alice_factor'],
                         alice_bonus=v['alice_bonus'],
                         gamma=1,
                         stop_threshold=v['stop_threshold'])

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )
    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['alice_horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        starts, t_alices = generate_starts_alice(
            env_alice=env_alice,
            algo_alice=algo_alice,
            start_states=[v['start_goal']],
            num_new_starts=v['num_new_starts'],
            log_dir=log_dir)

        labels = label_states(starts,
                              env,
                              policy,
                              v['horizon'],
                              as_goals=False,
                              n_traj=v['n_traj'],
                              key='goal_reached')
        plot_labeled_states(starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'],
                            summary_string_base='initial starts labels:\n')
        report.save()

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=v['step_size'],
                discount=v['discount'],
                plot=False,
            )

            # We don't use these labels anyway, so we might as well take them from training.
            #trpo_paths = algo.train()
            algo.train()

        # logger.log("labeling starts with trpo rollouts")
        # [starts, labels] = label_states_from_paths(trpo_paths, n_traj=2, key='goal_reached',  # using the min n_traj
        #                                            as_goal=False, env=env)
        # paths = [path for paths in trpo_paths for path in paths]

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.log('Generating the Heatmap...')
        plot_policy_means(policy,
                          env,
                          sampling_res=sampling_res,
                          report=report,
                          limit=v['goal_range'],
                          center=v['goal_center'])
        test_and_plot_policy(policy,
                             env,
                             as_goals=False,
                             max_reward=v['max_reward'],
                             sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             center=v['goal_center'],
                             limit=v['goal_range'])

        logger.log("Labeling the starts")
        labels = label_states(starts,
                              env,
                              policy,
                              v['horizon'],
                              as_goals=False,
                              n_traj=v['n_traj'],
                              key='goal_reached')

        plot_labeled_states(starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]

        if len(filtered_raw_starts) == 0:
            # add a ton of noise if all the states we had ended up being high reward!
            logger.log("Bad Alice!  All goals are high reward!")

        #     seed_starts = filtered_raw_starts
        # else:
        #     seed_starts = generate_starts(env, starts=starts, horizon=v['horizon'] * 2, subsample=v['num_new_starts'],
        #                                   variance=v['brownian_variance'] * 10)
        all_starts.append(filtered_raw_starts)
Code Example #9
def run_task(*_):
    """Implement the run_task method needed to run experiments with rllab."""
    sim_params = SumoParams(sim_step=0.1, render=True)

    vehicles = VehicleParams()
    vehicles.add(
        veh_id="rl",
        acceleration_controller=(RLController, {}),
        routing_controller=(ContinuousRouter, {}),
        car_following_params=SumoCarFollowingParams(
            speed_mode="obey_safe_speed",
        ),
        num_vehicles=1)
    vehicles.add(
        veh_id="idm",
        acceleration_controller=(IDMController, {
            "noise": 0.2
        }),
        routing_controller=(ContinuousRouter, {}),
        car_following_params=SumoCarFollowingParams(
            speed_mode="obey_safe_speed",
        ),
        num_vehicles=13)

    additional_env_params = {
        "target_velocity": 20,
        "max_accel": 3,
        "max_decel": 3,
        "sort_vehicles": False
    }
    env_params = EnvParams(
        horizon=HORIZON, additional_params=additional_env_params)

    additional_net_params = {
        "radius_ring": 30,
        "lanes": 1,
        "speed_limit": 30,
        "resolution": 40
    }
    net_params = NetParams(
        no_internal_links=False, additional_params=additional_net_params)

    initial_config = InitialConfig(spacing="uniform")

    print("XXX name", exp_tag)
    scenario = Figure8Scenario(
        exp_tag,
        vehicles,
        net_params,
        initial_config=initial_config)

    env_name = "AccelEnv"
    pass_params = (env_name, sim_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(16, 16))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=15000,
        max_path_length=horizon,
        n_itr=500,
        # whole_paths=True,
        discount=0.999,
        # step_size=v["step_size"],
    )
    algo.train()
Code Example #10
def run_task(*_):
    """Implement the run_task method needed to run experiments with rllab."""
    sumo_params = SumoParams(sim_step=0.1, render=False, seed=0)

    vehicles = Vehicles()
    vehicles.add(veh_id="rl",
                 acceleration_controller=(RLController, {}),
                 routing_controller=(ContinuousRouter, {}),
                 num_vehicles=1)
    vehicles.add(veh_id="idm",
                 acceleration_controller=(IDMController, {}),
                 routing_controller=(ContinuousRouter, {}),
                 num_vehicles=21)

    additional_env_params = {
        "target_velocity": 8,
        "ring_length": [220, 270],
        "max_accel": 1,
        "max_decel": 1
    }
    env_params = EnvParams(horizon=HORIZON,
                           additional_params=additional_env_params,
                           warmup_steps=750)

    additional_net_params = {
        "length": 260,
        "lanes": 1,
        "speed_limit": 30,
        "resolution": 40
    }
    net_params = NetParams(additional_params=additional_net_params)

    initial_config = InitialConfig(spacing="uniform", bunching=50)

    print("XXX name", exp_tag)
    scenario = LoopScenario(exp_tag,
                            vehicles,
                            net_params,
                            initial_config=initial_config)

    env_name = "WaveAttenuationPOEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianGRUPolicy(
        env_spec=env.spec,
        hidden_sizes=(5, ),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=3600 * 72 * 2,
        max_path_length=horizon,
        n_itr=5,
        # whole_paths=True,
        discount=0.999,
        # step_size=v["step_size"],
    )
    algo.train()
Code Example #11
def experiment_compare_scratch_100():
    # k = 100

    for seed in range(1, 10):
        env = StandardControllerEnv(k=4,
                                    seed=seed,
                                    noise=0.05,
                                    num_dynamics=4,
                                    num_points=100)
        now = datetime.datetime.now()
        timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(
                32,
                32,
            ),
        )
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1000,
            max_path_length=env.horizon,
            n_itr=100,
            discount=0.995,
            step_size=0.001,
            plot=False,
        )
        run_experiment_lite(
            algo.train(),
            # Number of parallel workers for sampling
            n_parallel=4,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            # script="scripts/run_experiment_lite_rl.py",
            script="scripts/run_experiment_lite.py",
            exp_name=os.path.join("Baseline %d" % seed, timestamp),
            log_dir=os.path.join(
                "Results/Controls/Seed_Baseline/Baseline/%d" % seed, timestamp)
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
            # plot=True,
        )

        env = ControllerEnv(k=4,
                            seed=seed,
                            noise=0.05,
                            num_dynamics=4,
                            num_points=100)
        now = datetime.datetime.now()
        timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')

        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(
                32,
                32,
            ),
        )
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1000,
            max_path_length=env.horizon,
            n_itr=100,
            discount=0.995,
            step_size=0.001,
            plot=False,
        )
        run_experiment_lite(
            algo.train(),
            # Number of parallel workers for sampling
            n_parallel=4,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            # script="scripts/run_experiment_lite_rl.py",
            script="scripts/run_experiment_lite.py",
            exp_name=os.path.join("Meta %d" % seed, timestamp),
            log_dir=os.path.join(
                "Results/Controls/Seed_Baseline/Meta/%d" % seed, timestamp)
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
            # plot=True,
        )
Code Example #12
def run_task(_):
    """Implement the run_task method needed to run experiments with rllab."""
    sumo_params = SumoParams(
        render=True, sim_step=0.2, restart_instance=True)

    # RL vehicles constitute 5% of the total number of vehicles
    vehicles = Vehicles()
    vehicles.add(
        veh_id="human",
        acceleration_controller=(IDMController, {
            "noise": 0.2
        }),
        speed_mode="no_collide",
        num_vehicles=5)
    vehicles.add(
        veh_id="rl",
        acceleration_controller=(RLController, {}),
        speed_mode="no_collide",
        num_vehicles=0)

    # Vehicles are introduced from both sides of merge, with RL vehicles
    # entering from the highway portion as well
    inflow = InFlows()
    inflow.add(
        veh_type="human",
        edge="inflow_highway",
        vehs_per_hour=(1 - RL_PENETRATION) * FLOW_RATE,
        departLane="free",
        departSpeed=10)
    inflow.add(
        veh_type="rl",
        edge="inflow_highway",
        vehs_per_hour=RL_PENETRATION * FLOW_RATE,
        departLane="free",
        departSpeed=10)
    inflow.add(
        veh_type="human",
        edge="inflow_merge",
        vehs_per_hour=100,
        departLane="free",
        departSpeed=7.5)

    additional_env_params = {
        "target_velocity": 25,
        "num_rl": NUM_RL,
        "max_accel": 1.5,
        "max_decel": 1.5
    }
    env_params = EnvParams(
        horizon=HORIZON,
        sims_per_step=5,
        warmup_steps=0,
        additional_params=additional_env_params)

    additional_net_params = ADDITIONAL_NET_PARAMS.copy()
    additional_net_params["merge_lanes"] = 1
    additional_net_params["highway_lanes"] = 1
    additional_net_params["pre_merge_length"] = 500
    net_params = NetParams(
        inflows=inflow,
        no_internal_links=False,
        additional_params=additional_net_params)

    initial_config = InitialConfig(
        spacing="uniform", lanes_distribution=float("inf"))

    scenario = MergeScenario(
        name="merge-rl",
        vehicles=vehicles,
        net_params=net_params,
        initial_config=initial_config)

    env_name = "WaveAttenuationMergePOEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    env = normalize(env)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32, 32),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=HORIZON * N_ROLLOUTS,
        max_path_length=HORIZON,
        n_itr=1000,
        # whole_paths=True,
        discount=0.999,
    )
    algo.train()
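
This snippet only defines run_task; in Flow's rllab examples such a function is normally handed to rllab's run_experiment_lite to launch training. A minimal launcher sketch under that assumption (the experiment tag and seeds are illustrative, not taken from the original file):

from rllab.misc.instrument import run_experiment_lite

# Illustrative experiment tag and seeds; not part of the original snippet.
exp_tag = "merge-rl-example"

for seed in [5, 20, 68]:
    run_experiment_lite(
        run_task,
        # Number of parallel workers for sampling
        n_parallel=1,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="last",
        seed=seed,
        mode="local",
        exp_prefix=exp_tag,
    )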
Code Example #13
File: local_test.py  Project: Neo-X/GMPS
def experiment(variant):


    seed = variant['seed']
    log_dir = variant['log_dir']
    n_parallel = variant['n_parallel']

    setup(seed, n_parallel, log_dir)

    init_file = variant['init_file']
    taskIndex = variant['taskIndex']
    n_itr = variant['n_itr']
    default_step = variant['default_step']
    policyType = variant['policyType']
    envType = variant['envType']

    tasksFile = path_to_multiworld+'/multiworld/envs/goals/' + variant['tasksFile']+'.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))

    max_path_length = variant['max_path_length']
 
    use_images = 'conv' in policyType


    if 'MultiDomain' in envType:
        baseEnv = Sawyer_MultiDomainEnv(tasks = tasks , image = use_images , mpl = max_path_length)

    elif 'Push' in envType:   
        baseEnv = SawyerPushEnv(tasks = tasks , image = use_images , mpl = max_path_length)
       

    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv( tasks = tasks , image = use_images , mpl = max_path_length)
       
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks = tasks , image = use_images , mpl = max_path_length)

    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))

    elif 'Coffee' in envType:
        baseEnv = SawyerCoffeeEnv(mpl = max_path_length)

    else:
        raise AssertionError('Unknown envType: %s' % envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx')))


    baseline = ZeroBaseline(env_spec=env.spec)
    #baseline = LinearFeatureBaseline(env_spec = env.spec)
    batch_size = variant['batch_size']


    if policyType == 'fullAda_Bias':
    
        baseline = LinearFeatureBaseline(env_spec = env.spec)
        algo = vpg_fullADA(
            env=env,
            policy=None,
            load_policy = init_file,
            baseline=baseline,
            batch_size = batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            #noise_opt = True,
            default_step = default_step,
            sampler_cls=VectorizedSampler, # added by RK 6/19
            sampler_args = dict(n_envs=1),
               
            #reset_arg=np.asscalar(taskIndex),
            reset_arg = taskIndex,
            log_dir = log_dir
        )

    elif policyType == 'biasAda_Bias':

        algo = vpg_biasADA(
            env=env,
            policy=None,
            load_policy = init_file, 
            baseline=baseline,
            batch_size= batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            #noise_opt = True,
            default_step = default_step,
            sampler_cls=VectorizedSampler, # added by RK 6/19
            sampler_args = dict(n_envs=1),
            #reset_arg=np.asscalar(taskIndex),
            reset_arg = taskIndex,
            log_dir = log_dir
        )

    elif policyType == 'basic':

        algo = vpg_basic(
                env=env,
                policy=None,
                load_policy=init_file,
                baseline=baseline,
                batch_size=batch_size,
                max_path_length=max_path_length,
                n_itr=n_itr,
                #step_size=10.0,
                sampler_cls=VectorizedSampler, # added by RK 6/19
                sampler_args = dict(n_envs=1),
               
                reset_arg=taskIndex,
                optimizer=None,
                optimizer_args={'init_learning_rate': default_step, 'tf_optimizer_args': {'learning_rate': 0.5*default_step}, 'tf_optimizer_cls': tf.train.GradientDescentOptimizer},
                log_dir = log_dir
                # extra_input="onehot_exploration", # added by RK 6/19
                # extra_input_dim=5, # added by RK 6/19 
            )


    elif 'conv' in policyType:

        algo = vpg_conv(
            env=env,
            policy=None,
            load_policy = init_file, 
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            sampler_cls=VectorizedSampler, # added by RK 6/19
            sampler_args = dict(n_envs=1),
            #noise_opt = True,
            default_step = default_step,
            #reset_arg=np.asscalar(taskIndex),
            reset_arg = taskIndex,
            log_dir = log_dir

        )
          
    else:
        raise AssertionError("policyType must be 'fullAda_Bias', 'biasAda_Bias', 'basic', or a 'conv' variant")

    algo.train()
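
experiment() above reads everything it needs from a single variant dict. A sketch of its expected shape, where only the keys are taken from the variant[...] lookups in the code; the paths and numbers are placeholders, not values from the GMPS repository:

# Hypothetical variant; keys mirror the variant[...] lookups in experiment().
variant = dict(
    seed=1,
    log_dir='/tmp/gmps_local_test/',
    n_parallel=1,
    init_file='/path/to/pretrained_policy.pkl',   # placeholder path
    taskIndex=0,
    n_itr=10,
    default_step=0.5,
    policyType='fullAda_Bias',   # or 'biasAda_Bias', 'basic', a 'conv' variant
    envType='Push',              # or 'MultiDomain', 'PickPlace', 'Door', 'Ant', 'Coffee'
    tasksFile='push_tasks',      # placeholder; resolved under multiworld/envs/goals/
    max_path_length=100,
    batch_size=10000,
)
experiment(variant)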
Code Example #14
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug"
        debug = True
    else:
        debug = False

    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv(maze_id=v['maze_id']))

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'],
                                                   bounds=v['goal_range'],
                                                   center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env,
        goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    outer_iter = 0
    if not debug and not v['fast_mode']:
        logger.log('Generating the Initial Heatmap...')
        test_and_plot_policy(policy,
                             env,
                             max_reward=v['max_reward'],
                             sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             limit=v['goal_range'],
                             center=v['goal_center'])

    report.new_row()

    sagg_riac = SaggRIAC(state_size=v['goal_size'],
                         state_range=v['goal_range'],
                         state_center=v['goal_center'],
                         max_goals=v['max_goals'],
                         max_history=v['max_history'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)

        raw_goals = sagg_riac.sample_states(num_samples=v['num_new_goals'])

        goals = raw_goals

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals,
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            all_paths = algo.train()

        if v['use_competence_ratio']:
            [goals, rewards
             ] = compute_rewards_from_paths(all_paths,
                                            key='competence',
                                            as_goal=True,
                                            env=env,
                                            terminal_eps=v['terminal_eps'])
        else:
            [goals, rewards] = compute_rewards_from_paths(all_paths,
                                                          key='rewards',
                                                          as_goal=True,
                                                          env=env)

        [goals_with_labels,
         labels] = label_states_from_paths(all_paths,
                                           n_traj=v['n_traj'],
                                           key='goal_reached')
        plot_labeled_states(goals_with_labels,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy,
                             env,
                             max_reward=v['max_reward'],
                             sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             limit=v['goal_range'],
                             center=v['goal_center'])

        sagg_riac.plot_regions_interest(maze_id=v['maze_id'], report=report)
        sagg_riac.plot_regions_states(maze_id=v['maze_id'], report=report)

        logger.log("Updating SAGG-RIAC")
        sagg_riac.add_states(goals, rewards)

        # Find final states "accidentally" reached by the agent.
        final_goals = compute_final_states_from_paths(all_paths,
                                                      as_goal=True,
                                                      env=env)
        sagg_riac.add_accidental_states(final_goals, v['extend_dist_rew'])

        logger.dump_tabular(with_prefix=False)
        report.new_row()
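
run_task(v) is written to be launched through rllab's run_experiment_lite, which forwards the variant dict as the function's argument. A minimal launch sketch under that assumption, where vv stands for a dict holding every key run_task reads (maze_id, goal_size, goal_range, goal_center, terminal_eps, outer_iters, inner_iters, pg_batch_size, horizon, discount, seed, and so on):

from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    variant=vv,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    seed=vv['seed'],
    n_parallel=1,
    exp_prefix="sagg-riac-ant-maze",  # illustrative prefix
)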
Code Example #15
def run_task(v):
    env, _ = create_env(v["which_agent"])
    fw_learning_rate = v['fw_learning_rate']  # 0.0005!

    yaml_path = os.path.abspath('yaml_files/' + v['yaml_file'] + '.yaml')
    assert (os.path.exists(yaml_path))
    with open(yaml_path, 'r') as f:
        params = yaml.load(f)
    num_fc_layers = params['dyn_model']['num_fc_layers']
    depth_fc_layers = params['dyn_model']['depth_fc_layers']
    batchsize = params['dyn_model']['batchsize']
    lr = params['dyn_model']['lr']
    print_minimal = v['print_minimal']
    nEpoch = params['dyn_model']['nEpoch']
    save_dir = os.path.join(args.save_dir, v['exp_name'])
    inputSize = env.spec.action_space.flat_dim + env.spec.observation_space.flat_dim
    outputSize = env.spec.observation_space.flat_dim

    #Initialize the forward policy
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    #learn_std=False, #v['learn_std'],
    #adaptive_std=False, #v['adaptive_std'],
    #output_gain=1, #v['output_gain'],
    #init_std=1) #v['polic)
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Update function for the forward policy (imitation learning loss)
    fwd_obs = TT.matrix('fwd_obs')
    fwd_act_out = TT.matrix('act_out')
    policy_dist = policy.dist_info_sym(fwd_obs)
    fw_loss = -TT.sum(
        policy.distribution.log_likelihood_sym(fwd_act_out, policy_dist))
    fw_params = policy.get_params_internal()
    fw_update = lasagne.updates.adam(fw_loss,
                                     fw_params,
                                     learning_rate=fw_learning_rate)
    fw_func = theano.function([fwd_obs, fwd_act_out],
                              fw_loss,
                              updates=fw_update,
                              allow_input_downcast=True)
    log_dir = v['yaml_file']
    print('Logging Tensorboard to: %s' % log_dir)
    hist_logger = hist_logging(log_dir)

    optimizer_params = dict(base_eps=1e-5)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        os.makedirs(save_dir + '/losses')
        os.makedirs(save_dir + '/models')
        os.makedirs(save_dir + '/saved_forwardsim')
        os.makedirs(save_dir + '/saved_trajfollow')
        os.makedirs(save_dir + '/training_data')

    x_index, y_index, z_index, yaw_index,\
    joint1_index, joint2_index, frontleg_index,\
    frontshin_index, frontfoot_index, xvel_index, orientation_index = get_indices(v['which_agent'])
    dyn_model = Bw_Trans_Model(inputSize, outputSize, env, v, lr, batchsize,
                               v['which_agent'], x_index, y_index,
                               num_fc_layers, depth_fc_layers, print_minimal)

    for outer_iter in range(1, v['outer_iters']):

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=v["batch_size"],
            max_path_length=v["steps_per_rollout"],
            n_itr=v["num_trpo_iters"],
            discount=0.995,
            optimizer=v["ConjugateGradientOptimizer"](
                hvp_approach=v["FiniteDifferenceHvp"](**optimizer_params)),
            step_size=0.05,
            plot_true=True)
        all_paths = algo.train()

        # Collect the trajectories; using the trajectories that lead to high-value states,
        # learn a backwards model.
        observations_list = []
        actions_list = []
        rewards_list = []
        returns_list = []
        for indexing in all_paths:
            for paths in indexing:
                observations = []
                actions = []
                returns = []
                reward_for_rollout = 0
                for i_ in range(len(paths['observations'])):
                    # Since we are building a backwards model from trajectories,
                    # reverse the trajectories.
                    index_ = len(paths['observations']) - i_ - 1
                    observations.append(paths['observations'][index_])
                    actions.append(paths['actions'][index_])
                    returns.append(paths['returns'][index_])
                    reward_for_rollout += paths['rewards'][index_]
                    #if something_ == 1:
                    #    actions_bw.append(path['actions'][::-1])
                    #    observations_bw.append(path['observations'][::-1])
                observations_list.append(observations)
                actions_list.append(actions)
                rewards_list.append(reward_for_rollout)
                returns_list.append(returns)

        hist_logger.log_scalar(save_dir,
                               np.sum(rewards_list) / len(rewards_list),
                               outer_iter * v["num_trpo_iters"])
        selected_observations_list = []
        selected_observations_list_for_state_selection = []
        selected_actions_list = []
        selected_returns_list = []

        #Figure out how to build the backwards model.
        #Conjecture_1
        #------- Take a quantile sample of the trajectories that receive the highest cumulative rewards.

        number_of_trajectories = int(
            np.floor(v['top_k_trajectories'] * len(rewards_list) / 100))
        rewards_list_np = np.asarray(rewards_list)
        trajectory_indices = rewards_list_np.argsort(
        )[-number_of_trajectories:][::-1]
        for index_ in range(len(trajectory_indices)):
            selected_observations_list.append(
                observations_list[trajectory_indices[index_]])
            selected_actions_list.append(
                actions_list[trajectory_indices[index_]])

        selected_observations_list_for_state_selection = []
        number_of_trajectories = int(
            np.floor(v['top_k_trajectories_state_selection'] *
                     len(rewards_list) / 100))
        rewards_list_np = np.asarray(rewards_list)
        trajectory_indices = rewards_list_np.argsort(
        )[-number_of_trajectories:][::-1]
        for index_ in range(len(trajectory_indices)):
            selected_observations_list_for_state_selection.append(
                observations_list[trajectory_indices[index_]])
            selected_returns_list.append(
                returns_list[trajectory_indices[index_]])

        #Figure out from where to start the backwards model.
        #Conjecture_1
        #------ Take a quantile sample of high-value states and start the backwards model from them,
        # which amounts to keeping a non-parametric buffer of high-value states.

        if v['use_good_trajectories'] == 1:
            returns_list = selected_returns_list
            observations_list = selected_observations_list_for_state_selection

        flatten_ret_list = np.asarray(returns_list).flatten()
        flatten_obs_list = np.vstack(np.asarray(observations_list))
        number_of_bw_samples = int(
            np.floor(v['top_k_bw_samples'] * len(flatten_ret_list) / 100))
        samples_indices = flatten_ret_list.argsort(
        )[-number_of_bw_samples:][::-1]
        bw_samples = []
        for bw_index in range(len(samples_indices)):
            bw_samples.append(flatten_obs_list[samples_indices[bw_index]])

        #Not all parts of the state are actually used.
        states = from_observation_to_usablestate(selected_observations_list,
                                                 v["which_agent"], False)
        controls = selected_actions_list
        dataX, dataY = generate_training_data_inputs(states, controls)
        states = np.asarray(states)
        dataZ = generate_training_data_outputs(states, v['which_agent'])

        # Every component (e.g. the x position) should become mean 0, std 1.
        dataX, mean_x, std_x = zero_mean_unit_std(dataX)
        dataY, mean_y, std_y = zero_mean_unit_std(dataY)
        dataZ, mean_z, std_z = zero_mean_unit_std(dataZ)

        ## concatenate state and action, to be used for training dynamics
        inputs = np.concatenate((dataX, dataY), axis=1)
        outputs = np.copy(dataZ)
        assert inputs.shape[0] == outputs.shape[0]

        if v['num_imagination_steps'] == 10:
            nEpoch = 20
        elif v['num_imagination_steps'] == 50:
            nEpoch = 20
        elif v['num_imagination_steps'] == 100:
            nEpoch = 30
        else:
            nEpoch = 20

        # The variant value takes precedence over the schedule above.
        nEpoch = v['nEpoch']

        training_loss = dyn_model.train(inputs, outputs, inputs, outputs,
                                        nEpoch, save_dir, 1)
        print("Training Loss for Backwards model", training_loss)

        if v['running_baseline'] == False:
            for goal_ind in range(min(v['fw_iter'], len(bw_samples))):
                #train the backwards model
                #Given an initial state, perform rollouts from the backwards model. Right now the state is
                #random, but it should be selected from some particular list.
                forwardsim_x_true = bw_samples[goal_ind]
                state_list, action_list = dyn_model.do_forward_sim(
                    forwardsim_x_true, v['num_imagination_steps'], False, env,
                    v['which_agent'], mean_x, mean_y, mean_z, std_x, std_y,
                    std_z)

                #Incorporate the backwards trace into model based system.
                fw_func(np.vstack(state_list), np.vstack(action_list))
                #print("Immitation Learning loss", loss)
        else:
            print('running TRPO baseline')
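
A side note on the code above: zero_mean_unit_std is not shown in this snippet. Presumably it z-scores each input dimension and returns the statistics so that later data can be normalized consistently; a minimal sketch of such a helper (an assumption about its behavior, not the project's actual implementation):

import numpy as np


def zero_mean_unit_std(data, eps=1e-8):
    # Normalize each column to mean 0 and (approximately) std 1,
    # returning the statistics so they can be reused on new data.
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    return (data - mean) / (std + eps), mean, std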
Code Example #16
def main():

    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--fw_ratio', type=float, default=0.1)
    parser.add_argument('--init_lr', type=float, default=5e-4)

    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--gpu_ratio', type=float, default=0.99)

    args = parser.parse_args()

    # Param ranges
    seeds = range(2)

    for seed in seeds:
        mdp = TfEnv(normalize(env=GymEnv('Box3dReach-v12',record_video=False, \
        log_dir='/tmp/gym_test',record_log=False)))

        name = 'trpo-state-v12-tf-icm-fw{}-initlr-{}'.format(
            args.fw_ratio, args.init_lr)

        policy = GaussianMLPPolicy(
            "mlp_policy",
            env_spec=mdp.spec,
            hidden_sizes=(64, 64, 32),
            output_nonlinearity=tf.nn.tanh,
            clip_action=False,
        )

        baseline = LinearFeatureBaseline(mdp.spec, )

        batch_size = 50000
        algo = TRPO(
            env=mdp,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            whole_paths=True,
            max_path_length=1000,
            n_itr=2000,
            step_size=0.01,
            subsample_factor=1.0,
            sampler_cls=BatchSampler,
        )

        algorithm = ICM(
            mdp,
            algo,
            args.tfboard_path + "/%s_%d" % (name, seed),
            feature_dim=mdp.spec.observation_space.flat_dim,
            forward_weight=args.fw_ratio,
            external_reward_weight=0.0,
            replay_pool_size=1000000,
            init_learning_rate=args.init_lr,
            n_updates_per_iter=1000,
        )

        run_experiment_lite(algorithm.train(),
                            exp_prefix=name,
                            n_parallel=8,
                            snapshot_mode="gap",
                            snapshot_gap=200,
                            seed=seed,
                            mode="local")
Code Example #17
def main():
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # create the environment
    env = _create_env(args)

    # create expert data
    expert_data_T, expert_data_V = _create_expert_data(args)
    expert_data = dict(
            train = expert_data_T,
            valid = expert_data_V
            )

    # create policy
    policy, init_ops = _create_policy(args, env)

    # create auxiliary networks (invdyn, reward, variational posterior)
    invdyn_model, reward_model, info_model, env = _create_aux_networks(args, env)

    # create baseline
    if args.baseline_type == "linear":
        baseline = LinearFeatureBaseline(env_spec=None)
    else:
        assert False, "unsupported baseline_type: %s" % args.baseline_type

    # use date and time to create new logging directory for each run
    date= calendar.datetime.date.today().strftime('%y-%m-%d')
    if date not in os.listdir(model_path):
        os.mkdir(model_path+'/'+date)

    c = 0
    exp_name = '{}-'.format(args.exp_name) + str(c)

    while exp_name in os.listdir(model_path+'/'+date+'/'):
        c += 1
        exp_name = '{}-'.format(args.exp_name)+str(c)

    exp_dir = date+'/'+exp_name
    log_dir = osp.join(config.LOG_DIR, exp_dir)

    policy.set_log_dir(log_dir)
    if info_model is not None:
        info_model.set_log_dir(log_dir)

    _create_log(args)

    # run GAIL algorithm
    models = {"policy":policy, "info":info_model, "reward":reward_model}
    bpo_args = dict(
        n_itr=args.n_itr,
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.trpo_batch_size,
        max_path_length=args.max_path_length,
        discount=args.discount,
        step_size=args.trpo_step_size,
        force_batch_sampler=True,
        whole_paths=True,
        init_ops=init_ops,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        save_models=[models[model_name] for model_name in args.save_models]
        )
    vae_args = dict(
            kl_weight=args.kl_weight,
            )
    curriculum = dict(
            start = args.curr_start,
            add = args.curr_add,
            step = args.curr_step
            )
    if not args.model_all : curriculum = {}
    kwargs = {k:v for k, v in bpo_args.items() + vae_args.items()}
    algo = GAIL(
                args.exp_name,
                exp_name,
                expert_data,
                reward_model,
                args.gail_batch_size,
                invdyn_model=invdyn_model,
                info_model=info_model,
                debug=args.debug,
                model_all=args.model_all,
                curriculum=curriculum,
                rew_aug=args.rew_aug,
                use_replay_buffer=args.use_replay_buffer,
                **kwargs
                )

    runner = RLLabRunner(algo, args, exp_dir)
    runner.train()
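
One portability note on the code above: the line kwargs = {k:v for k, v in bpo_args.items() + vae_args.items()} only works on Python 2, where dict.items() returns lists that can be concatenated. If this script were ported to Python 3, the same merge could be written as follows (a sketch, not a change to the original):

# Python 3 equivalent of merging the TRPO arguments with the VAE arguments.
kwargs = {**bpo_args, **vae_args}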
Code Example #18
def run_task(*_):
    """Implement the run_task method needed to run experiments with rllab."""
    sim_params = SumoParams(sim_step=0.2, render=True)

    # note that the vehicles are added sequentially by the scenario,
    # so place the merging vehicles after the vehicles in the ring
    vehicles = VehicleParams()
    # Inner ring vehicles
    vehicles.add(
        veh_id="human",
        acceleration_controller=(IDMController, {
            "noise": 0.2
        }),
        lane_change_controller=(SimLaneChangeController, {}),
        routing_controller=(ContinuousRouter, {}),
        num_vehicles=6,
        car_following_params=SumoCarFollowingParams(minGap=0.0, tau=0.5),
        lane_change_params=SumoLaneChangeParams())

    # A single learning agent in the inner ring
    vehicles.add(
        veh_id="rl",
        acceleration_controller=(RLController, {}),
        lane_change_controller=(SimLaneChangeController, {}),
        routing_controller=(ContinuousRouter, {}),
        num_vehicles=1,
        car_following_params=SumoCarFollowingParams(
            minGap=0.01,
            tau=0.5,
            speed_mode="obey_safe_speed"
        ),
        lane_change_params=SumoLaneChangeParams())

    # Outer ring vehicles
    vehicles.add(
        veh_id="merge-human",
        acceleration_controller=(IDMController, {
            "noise": 0.2
        }),
        lane_change_controller=(SimLaneChangeController, {}),
        routing_controller=(ContinuousRouter, {}),
        num_vehicles=10,
        car_following_params=SumoCarFollowingParams(minGap=0.0, tau=0.5),
        lane_change_params=SumoLaneChangeParams())

    env_params = EnvParams(
        horizon=HORIZON,
        additional_params={
            "target_velocity": 10,
            "max_accel": 3,
            "max_decel": 3,
            "sort_vehicles": False
        })

    additional_net_params = ADDITIONAL_NET_PARAMS.copy()
    additional_net_params["ring_radius"] = 50
    additional_net_params["inner_lanes"] = 1
    additional_net_params["outer_lanes"] = 1
    additional_net_params["lane_length"] = 75
    net_params = NetParams(
        no_internal_links=False, additional_params=additional_net_params)

    initial_config = InitialConfig(x0=50, spacing="uniform")

    scenario = TwoLoopsOneMergingScenario(
        name=exp_tag,
        vehicles=vehicles,
        net_params=net_params,
        initial_config=initial_config)

    env_name = "AccelEnv"
    pass_params = (env_name, sim_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=64 * 3 * horizon,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=1000,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
Code Example #19
def run_task(*_):
    """Implement the run_task method needed to run experiments with rllab."""
    v_enter = 10
    inner_length = 300
    long_length = 100
    short_length = 300
    n = 3
    m = 3
    num_cars_left = 1
    num_cars_right = 1
    num_cars_top = 1
    num_cars_bot = 1
    tot_cars = (num_cars_left + num_cars_right) * m \
        + (num_cars_bot + num_cars_top) * n

    grid_array = {
        "short_length": short_length,
        "inner_length": inner_length,
        "long_length": long_length,
        "row_num": n,
        "col_num": m,
        "cars_left": num_cars_left,
        "cars_right": num_cars_right,
        "cars_top": num_cars_top,
        "cars_bot": num_cars_bot
    }

    sumo_params = SumoParams(sim_step=1, render=True)

    vehicles = Vehicles()
    vehicles.add(veh_id="idm",
                 acceleration_controller=(SumoCarFollowingController, {}),
                 sumo_car_following_params=SumoCarFollowingParams(
                     min_gap=2.5, tau=1.1, max_speed=v_enter),
                 routing_controller=(GridRouter, {}),
                 num_vehicles=tot_cars,
                 speed_mode="all_checks")

    tl_logic = TrafficLights(baseline=False)

    additional_env_params = {
        "target_velocity": 50,
        "switch_time": 3.0,
        "num_observed": 2,
        "discrete": False,
        "tl_type": "controlled"
    }
    env_params = EnvParams(additional_params=additional_env_params)

    additional_net_params = {
        "speed_limit": 35,
        "grid_array": grid_array,
        "horizontal_lanes": 1,
        "vertical_lanes": 1
    }

    initial_config, net_params = get_flow_params(10, 300, n, m,
                                                 additional_net_params)

    scenario = SimpleGridScenario(name="grid-intersection",
                                  vehicles=vehicles,
                                  net_params=net_params,
                                  initial_config=initial_config,
                                  traffic_lights=tl_logic)

    env_name = "PO_TrafficLightGridEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=40000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=800,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
Code Example #20
def experiment(variant):

    seed = variant['seed']

    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    fast_learning_rate = variant['flr']

    fast_batch_size = variant[
        'fbs']  # 10 works for [0.1, 0.2], 20 doesn't improve much for [0,0.2]
    meta_batch_size = 20  # 10 also works, but much less stable, 20 is fairly stable, 40 is more stable
    max_path_length = 150
    num_grad_updates = 1
    meta_step_size = variant['mlr']

    regionSize = variant['regionSize']

    if regionSize == '20X20':

        tasksFile = '/root/code/multiworld/multiworld/envs/goals/pickPlace_20X20_6_8.pkl'

    else:
        assert regionSize == '60X30'

        tasksFile = '/root/code/multiworld/multiworld/envs/goals/pickPlace_60X30.pkl'

    tasks = pickle.load(open(tasksFile, 'rb'))

    envType = variant['envType']

    if envType == 'Push':

        baseEnv = SawyerPushEnv(tasks=tasks)
    else:
        assert envType == 'PickPlace'

        baseEnv = SawyerPickPlaceEnv(tasks=tasks)
    env = FinnMamlEnv(
        FlatGoalEnv(baseEnv,
                    obs_keys=['state_observation', 'state_desired_goal']))

    env = TfEnv(NormalizedBoxEnv(env))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLTRPO(
        env=env,
        policy=None,
        load_policy=variant['init_param_file'],
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        n_itr=1000,
        use_maml=True,
        step_size=meta_step_size,
        plot=False,
    )

    import os

    saveDir = variant['saveDir']

    if not os.path.isdir(saveDir):
        os.mkdir(saveDir)

    logger.set_snapshot_dir(saveDir)
    logger.add_tabular_output(saveDir + 'progress.csv')

    algo.train()
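
As with the earlier GMPS example, this experiment() pulls its hyperparameters from a single variant dict. A sketch of the expected shape, with placeholder values (only the keys are taken from the code above):

# Hypothetical variant for the MAMLTRPO fine-tuning run above.
variant = dict(
    seed=1,
    flr=0.1,           # fast (inner) learning rate
    mlr=0.01,          # meta (outer) step size
    fbs=20,            # fast batch size: trajectories per inner gradient update
    regionSize='20X20',             # or '60X30'
    envType='PickPlace',            # or 'Push'
    init_param_file='/path/to/init_params.pkl',  # placeholder
    saveDir='/tmp/maml_pickplace/',
)
experiment(variant)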
Code Example #21
# Param ranges
seeds = range(5)

for seed in seeds:
    mdp = TfEnv(normalize(env=GymEnv('Box3dReach-v11',record_video=False, \
    log_dir='/tmp/gym_test',record_log=False)))
    
    policy = GaussianMLPPolicy(
        "mlp_policy",
        env_spec=mdp.spec,
        hidden_sizes=(64, 64, 32),
        output_nonlinearity=tf.nn.tanh,
    )

    baseline = LinearFeatureBaseline(
        mdp.spec,
    )

    batch_size = 50000
    algo = TRPO(
        env=mdp,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=200,
        n_itr=1000,
        step_size=0.01,
        subsample_factor=1.0,
        sampler_cls=BatchSampler,
    )
Code Example #22
File: local_train_sequence.py  Project: Neo-X/GMPS
def experiment(variant, comet_logger=comet_logger):

    from sandbox.rocky.tf.algos.maml_il import MAMLIL
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.gaussian_mlp_baseline import GaussianMLPBaseline
    from rllab.baselines.maml_gaussian_mlp_baseline import MAMLGaussianMLPBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.envs.normalized_env import normalize
    from rllab.misc.instrument import stub, run_experiment_lite
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy import MAMLGaussianMLPPolicy as basic_policy
    #from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep import MAMLGaussianMLPPolicy as fullAda_basic_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_biastransform import MAMLGaussianMLPPolicy as fullAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_biasonlyadaptivestep_biastransform import MAMLGaussianMLPPolicy as biasAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_conv_gauss_mlp_policy import MAMLGaussianMLPPolicy as conv_policy

    from sandbox.rocky.tf.optimizers.quad_dist_expert_optimizer import QuadDistExpertOptimizer
    from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer
    from sandbox.rocky.tf.envs.base import TfEnv
    import sandbox.rocky.tf.core.layers as L

    from rllab.envs.mujoco.ant_env_rand_goal_ring import AntEnvRandGoalRing
    from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_pick_and_place import SawyerPickPlaceEnv
    from multiworld.envs.mujoco.sawyer_xyz.door.sawyer_door_open import SawyerDoorOpenEnv
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.finn_maml_env import FinnMamlEnv
    from multiworld.core.wrapper_env import NormalizedBoxEnv

    import tensorflow as tf
    import time
    from rllab.envs.gym_env import GymEnv

    from maml_examples.maml_experiment_vars import MOD_FUNC
    import numpy as np
    import random as rd
    import pickle

    import rllab.misc.logger as logger
    from rllab.misc.ext import set_seed
    import os

    seed = variant['seed']
    n_parallel = 1
    log_dir = variant['log_dir']

    def setup(seed, n_parallel, log_dir):

        if seed is not None:
            set_seed(seed)

        if n_parallel > 0:
            from rllab.sampler import parallel_sampler
            parallel_sampler.initialize(n_parallel=n_parallel)
            if seed is not None:
                parallel_sampler.set_seed(seed)

        if not os.path.isdir(log_dir):
            os.makedirs(log_dir, exist_ok=True)

        logger.set_snapshot_dir(log_dir)
        logger.add_tabular_output(log_dir + '/progress.csv')

    setup(seed, n_parallel, log_dir)

    fast_batch_size = variant['fbs']
    meta_batch_size = variant['mbs']
    adam_steps = variant['adam_steps']
    max_path_length = variant['max_path_length']

    dagger = variant['dagger']
    expert_policy_loc = variant['expert_policy_loc']

    ldim = variant['ldim']
    init_flr = variant['init_flr']
    policyType = variant['policyType']
    use_maesn = variant['use_maesn']
    EXPERT_TRAJ_LOCATION = variant['expertDataLoc']
    envType = variant['envType']

    tasksFile = path_to_multiworld + 'multiworld/envs/goals/' + variant[
        'tasksFile'] + '.pkl'

    all_tasks = pickle.load(open(tasksFile, 'rb'))
    assert meta_batch_size <= len(all_tasks)
    tasks = all_tasks[:meta_batch_size]

    use_images = 'conv' in policyType

    if 'Push' == envType:
        baseEnv = SawyerPushEnv(tasks=tasks,
                                image=use_images,
                                mpl=max_path_length)

    elif envType == 'sparsePush':
        baseEnv = SawyerPushEnv(tasks=tasks,
                                image=use_images,
                                mpl=max_path_length,
                                rewMode='l2Sparse')

    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks,
                                     image=use_images,
                                     mpl=max_path_length)

    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks,
                                    image=use_images,
                                    mpl=max_path_length)

    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))

    elif 'claw' in envType:
        env = TfEnv(DClawScrewRandGoal())

    else:
        raise AssertionError('Unknown envType: %s' % envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(
            NormalizedBoxEnv(
                FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys),
                            reset_mode='idx')))

    algoClass = MAMLIL
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_policy = variant['load_policy']

    if load_policy is not None:
        policy = None
        load_policy = variant['load_policy']
        # if 'conv' in load_policy:
        #     baseline = ZeroBaseline(env_spec=env.spec)

    elif 'fullAda_Bias' in policyType:

        policy = fullAda_Bias_policy(name="policy",
                                     env_spec=env.spec,
                                     grad_step_size=init_flr,
                                     hidden_nonlinearity=tf.nn.relu,
                                     hidden_sizes=(100, 100),
                                     init_flr_full=init_flr,
                                     latent_dim=ldim)

    elif 'biasAda_Bias' in policyType:

        policy = biasAda_Bias_policy(name="policy",
                                     env_spec=env.spec,
                                     grad_step_size=init_flr,
                                     hidden_nonlinearity=tf.nn.relu,
                                     hidden_sizes=(100, 100),
                                     init_flr_full=init_flr,
                                     latent_dim=ldim)

    elif 'basic' in policyType:
        policy = basic_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    elif 'conv' in policyType:

        baseline = ZeroBaseline(env_spec=env.spec)

        policy = conv_policy(
            name="policy",
            latent_dim=ldim,
            policyType=policyType,
            env_spec=env.spec,
            init_flr=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    algo = algoClass(
        env=env,
        policy=policy,
        load_policy=load_policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=variant['iterations'],
        make_video=False,
        use_maml=True,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        # metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=1,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=None,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[''],
        post_std_modifier=post_std_modifier,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION,
        expert_trajs_suffix='',
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        plotDirPrefix=None,
        latent_dim=ldim,
        dagger=dagger,
        expert_policy_loc=expert_policy_loc,
        comet_logger=comet_logger)

    algo.train()
    tf.reset_default_graph()
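
The MAMLIL call above also references several names that are not defined anywhere in this snippet (num_grad_updates, meta_step_size, use_corr_term, test_on_training_goals, limit_demos_num, beta_steps, pre_std_modifier, post_std_modifier, l2loss_std_mult, extra_input, extra_input_dim); in the original launcher they are module-level settings. Purely as an illustration, and loosely modeled on the FaReLI launcher shown later in this document, they might look like the following (hypothetical values, not the project's actual configuration):

# Hypothetical module-level settings assumed by the experiment() above.
num_grad_updates = 1            # number of alpha (inner) gradient updates
meta_step_size = 0.01           # beta (outer) step size
use_corr_term = True
test_on_training_goals = False
limit_demos_num = 1             # demonstrations per task
beta_steps = 1
pre_std_modifier = 1.0
post_std_modifier = 0.00001
l2loss_std_mult = 1.0
extra_input = ""                # e.g. "onehot_exploration" to append an exploration one-hot
extra_input_dim = 0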
Code Example #23
def train(num_experiments, thread_id, queue):

    ############ DEFAULT PARAMETERS ############

    env_name = None  #Name of adversarial environment
    path_length = 1000  #Maximum episode length
    layer_size = tuple([100, 100, 100])  #Layer definition
    ifRender = False  #Should we render?
    afterRender = 100  #After how many to animate
    n_exps = 1  #Number of training instances to run
    n_itr = 25  #Number of iterations of the alternating optimization
    n_pro_itr = 1  #Number of iterations for the protagonist
    n_adv_itr = 1  #Number of iterations for the adversary
    batch_size = 4000  #Number of training samples for each iteration
    ifSave = True  #Should we save?
    save_every = 100  #Save checkpoint every save_every iterations
    n_process = 1  #Number of parallel threads for sampling environment
    adv_fraction = 0.25  #Fraction of maximum adversarial force to be applied
    step_size = 0.01  #kl step size for TRPO
    gae_lambda = 0.97  #gae_lambda for learner
    save_dir = './results'  #folder to save result in

    ############ ENV SPECIFIC PARAMETERS ############

    env_name = 'Walker2dAdv-v1'

    layer_size = tuple([64, 64])
    step_size = 0.1
    gae_lambda = 0.97
    batch_size = 25000

    n_exps = num_experiments
    n_itr = 500
    ifSave = False
    n_process = 4

    adv_fraction = 5.0
    adv_strengths = []
    for i in range(0, int(adv_fraction) + 1, 1):
        adv_strengths.append(i)

    save_dir = './../results/AdvWalker'

    args = [
        env_name, path_length, layer_size, ifRender, afterRender, n_exps,
        n_itr, n_pro_itr, n_adv_itr, batch_size, save_every, n_process,
        adv_fraction, step_size, gae_lambda, save_dir
    ]

    ############ ADVERSARIAL POLICY LOAD ############

    filepath = './../initial_results/Walker/env-Walker2dAdv-v1_Exp1_Itr1500_BS25000_Adv0.25_stp0.01_lam0.97_507500.p'
    res_D = pickle.load(open(filepath, 'rb'))
    pretrained_adv_policy = res_D['adv_policy']

    ############ MAIN LOOP ############

    ## Initializing summaries for the tests ##
    const_test_rew_summary = []
    rand_test_rew_summary = []
    step_test_rew_summary = []
    rand_step_test_rew_summary = []
    adv_test_rew_summary = []

    ## Preparing file to save results in ##
    save_prefix = 'static_env-{}_Exp{}_Itr{}_BS{}_Adv{}_stp{}_lam{}_{}'.format(
        env_name, n_exps, n_itr, batch_size, adv_fraction, step_size,
        gae_lambda, random.randint(0, 1000000))
    save_name = save_dir + '/' + save_prefix

    ## Looping over experiments to carry out ##
    for ne in range(n_exps):
        ## Environment definition ##
        ## The second argument in GymEnv defines the relative magnitude of adversary. For testing we set this to 1.0.
        env = normalize(GymEnv(env_name, adv_fraction))
        env_orig = normalize(GymEnv(env_name, 1.0))

        ## Protagonist policy definition ##
        pro_policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=layer_size,
                                       is_protagonist=True)
        pro_baseline = LinearFeatureBaseline(env_spec=env.spec)

        ## Zero Adversary for the protagonist training ##
        zero_adv_policy = ConstantControlPolicy(env_spec=env.spec,
                                                is_protagonist=False,
                                                constant_val=0.0)

        ## Adversary policy definition ##
        adv_policy = pretrained_adv_policy
        adv_baseline = LinearFeatureBaseline(env_spec=env.spec)

        ## Initializing the parallel sampler ##
        parallel_sampler.initialize(n_process)

        ## Setting up summaries for testing for a specific training instance ##
        pro_rews = []
        adv_rews = []
        all_rews = []
        const_testing_rews = []
        const_testing_rews.append(
            test_const_adv(env_orig, pro_policy, path_length=path_length))
        rand_testing_rews = []
        rand_testing_rews.append(
            test_rand_adv(env_orig, pro_policy, path_length=path_length))
        step_testing_rews = []
        step_testing_rews.append(
            test_step_adv(env_orig, pro_policy, path_length=path_length))
        rand_step_testing_rews = []
        rand_step_testing_rews.append(
            test_rand_step_adv(env_orig, pro_policy, path_length=path_length))
        adv_testing_rews = []
        adv_testing_rews.append(
            test_learnt_adv(env,
                            pro_policy,
                            adv_policy,
                            path_length=path_length))

        ## Loops through adversary strength levels
        n_loopsize = int(n_itr / len(adv_strengths))
        for adv_index, adv_strength in enumerate(adv_strengths):

            env = normalize(GymEnv(env_name, adv_strength))

            ## Optimizer for the Protagonist ##
            pro_algo = TRPO(env=env,
                            pro_policy=pro_policy,
                            adv_policy=adv_policy,
                            pro_baseline=pro_baseline,
                            adv_baseline=adv_baseline,
                            batch_size=batch_size,
                            max_path_length=path_length,
                            n_itr=n_pro_itr,
                            discount=0.995,
                            gae_lambda=gae_lambda,
                            step_size=step_size,
                            is_protagonist=True)

            logger.log(
                '\n\nAdversarial Level: {} Adversarial Strength: {}\n'.format(
                    adv_index, adv_strength))

            ## Beginning alternating optimization ##
            for ni in range(n_loopsize):
                logger.log(
                    '\n\nThread: {} Experiment: {} Iteration: {}\n'.format(
                        thread_id,
                        ne,
                        ni + n_loopsize * adv_index,
                    ))

                ## Train Protagonist
                pro_algo.train()
                pro_rews += pro_algo.rews
                all_rews += pro_algo.rews
                logger.log('Protag Reward: {}'.format(
                    np.array(pro_algo.rews).mean()))

                ## Test the learnt policies
                const_testing_rews.append(
                    test_const_adv(env, pro_policy, path_length=path_length))
                rand_testing_rews.append(
                    test_rand_adv(env, pro_policy, path_length=path_length))
                step_testing_rews.append(
                    test_step_adv(env, pro_policy, path_length=path_length))
                rand_step_testing_rews.append(
                    test_rand_step_adv(env,
                                       pro_policy,
                                       path_length=path_length))
                adv_testing_rews.append(
                    test_learnt_adv(env,
                                    pro_policy,
                                    adv_policy,
                                    path_length=path_length))

                if ni % afterRender == 0 and ifRender == True:
                    test_const_adv(env,
                                   pro_policy,
                                   path_length=path_length,
                                   n_traj=1,
                                   render=True)

                if ni != 0 and ni % save_every == 0 and ifSave == True:
                    ## SAVING CHECKPOINT INFO ##
                    pickle.dump(
                        {
                            'args': args,
                            'pro_policy': pro_policy,
                            'adv_policy': adv_policy,
                            'zero_test': [const_testing_rews],
                            'rand_test': [rand_testing_rews],
                            'step_test': [step_testing_rews],
                            'rand_step_test': [rand_step_testing_rews],
                            'iter_save': ni,
                            'exp_save': ne,
                            'adv_test': [adv_testing_rews]
                        },
                        open(
                            save_name + '_' +
                            str(ni + n_loopsize * adv_index) + '.p', 'wb'))

        ## Shutting down the optimizer ##
        pro_algo.shutdown_worker()

        ## Updating the test summaries over all training instances
        const_test_rew_summary.append(const_testing_rews)
        rand_test_rew_summary.append(rand_testing_rews)
        step_test_rew_summary.append(step_testing_rews)
        rand_step_test_rew_summary.append(rand_step_testing_rews)
        adv_test_rew_summary.append(adv_testing_rews)

    queue.put([
        const_test_rew_summary, rand_test_rew_summary, step_test_rew_summary,
        rand_step_test_rew_summary, adv_test_rew_summary
    ])

    ############ SAVING MODEL ############
    '''
Code Example #24
        z.append(y * t)
        t *= discount
    return np.array(z)


load_policy = True
# normalize() makes sure that the actions for the environment lies
# within the range [-1, 1] (only works for environments with continuous actions)
env = normalize(CartpoleEnv())
#env = GymEnv("InvertedPendulum-v1")
# Initialize a neural network policy with a single hidden layer of 8 hidden units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(100, 50, 25))
snap_policy = GaussianMLPPolicy(env.spec, hidden_sizes=(100, 50, 25))
back_up_policy = GaussianMLPPolicy(env.spec, hidden_sizes=(100, 50, 25))
parallel_sampler.populate_task(env, policy)
baseline = LinearFeatureBaseline(env.spec)
baseline_snap = LinearFeatureBaseline(env.spec)

# policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
# distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
# the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
# rllab.distributions.DiagonalGaussian
dist = policy.distribution
snap_dist = snap_policy.distribution
# We will collect 100 trajectories per iteration
N = 100
# Each trajectory will have at most 500 time steps
T = 500
#We will collect M secondary trajectories
M = 20
#Number of sub-iterations
Code Example #25
def run_FaReLI(input_feed=None):
    beta_adam_steps_list = [(1,50)]
    # beta_curve = [250,250,250,250,250,5,5,5,5,1,1,1,1,] # make sure to check maml_experiment_vars
    # beta_curve = [1000] # make sure to check maml_experiment_vars
    adam_curve = [250,249,248,247,245,50,50,10] # make sure to check maml_experiment_vars
    # adam_curve = None

    fast_learning_rates = [1.0]
    baselines = ['linear',]  # linear GaussianMLP MAMLGaussianMLP zero
    env_option = ''
    # mode = "ec2"
    mode = "local"
    extra_input = "onehot_exploration" # "onehot_exploration" "gaussian_exploration"
    # extra_input = None
    extra_input_dim = 5
    # extra_input_dim = None
    goals_suffixes = ["_200_40_1"] #,"_200_40_2", "_200_40_3","_200_40_4"]
    # goals_suffixes = ["_1000_40"]

    fast_batch_size_list = [20]  # 20 # 10 works for [0.1, 0.2], 20 doesn't improve much for [0,0.2]  #inner grad update size
    meta_batch_size_list = [40]  # 40 @ 10 also works, but much less stable, 20 is fairly stable, 40 is more stable
    max_path_length = 100  # 100
    num_grad_updates = 1
    meta_step_size = 0.01
    pre_std_modifier_list = [1.0]
    post_std_modifier_train_list = [0.00001]
    post_std_modifier_test_list = [0.00001]
    l2loss_std_mult_list = [1.0]
    importance_sampling_modifier_list = ['']  #'', 'clip0.5_'
    limit_demos_num_list = [1]  # 40
    test_goals_mult = 1
    bas_lr = 0.01 # baseline learning rate
    momentum=0.5
    bas_hnl = tf.nn.relu
    baslayers_list = [(32,32), ]

    basas = 60 # baseline adam steps
    use_corr_term = True
    seeds = [1] #,2,3,4,5]
    envseeds = [6]
    use_maml = True
    test_on_training_goals = False
    for goals_suffix in goals_suffixes:
        for envseed in envseeds:
            for seed in seeds:
                for baslayers in baslayers_list:
                    for fast_batch_size in fast_batch_size_list:
                        for meta_batch_size in meta_batch_size_list:
                            for ism in importance_sampling_modifier_list:
                                for limit_demos_num in limit_demos_num_list:
                                    for l2loss_std_mult in l2loss_std_mult_list:
                                        for post_std_modifier_train in post_std_modifier_train_list:
                                            for post_std_modifier_test in post_std_modifier_test_list:
                                                for pre_std_modifier in pre_std_modifier_list:
                                                    for fast_learning_rate in fast_learning_rates:
                                                        for beta_steps, adam_steps in beta_adam_steps_list:
                                                            for bas in baselines:
                                                                stub(globals())
                                                                tf.set_random_seed(seed)
                                                                np.random.seed(seed)
                                                                rd.seed(seed)
                                                                env = TfEnv(normalize(Reacher7DofMultitaskEnv(envseed=envseed)))
                                                                exp_name = str(
                                                                    'R7_IL'
                                                                    # +time.strftime("%D").replace("/", "")[0:4]
                                                                    + goals_suffix + "_"
                                                                    + str(seed)
                                                                    # + str(envseed)
                                                                    + ("" if use_corr_term else "nocorr")
                                                                    # + str(int(use_maml))
                                                                    + ('_fbs' + str(fast_batch_size) if fast_batch_size!=20 else "")
                                                                    + ('_mbs' + str(meta_batch_size) if meta_batch_size!=40 else "")
                                                                    + ('_flr' + str(fast_learning_rate) if fast_learning_rate!=1.0 else "")
                                                                    + '_dem' + str(limit_demos_num)
                                                                    + ('_ei' + str(extra_input_dim) if type(
                                                                        extra_input_dim) == int else "")
                                                                    # + '_tgm' + str(test_goals_mult)
                                                                    #     +'metalr_'+str(meta_step_size)
                                                                    #     +'_ngrad'+str(num_grad_updates)
                                                                    + ("_bs" + str(beta_steps) if beta_steps != 1 else "")
                                                                    + "_as" + str(adam_steps)
                                                                    # +"_net" + str(net_size[0])
                                                                    # +"_L2m" + str(l2loss_std_mult)
                                                                    + ("_prsm" + str(
                                                                        pre_std_modifier) if pre_std_modifier != 1 else "")
                                                                    # + "_pstr" + str(post_std_modifier_train)
                                                                    # + "_posm" + str(post_std_modifier_test)
                                                                    #  + "_l2m" + str(l2loss_std_mult)
                                                                    + ("_" + ism if len(ism) > 0 else "")
                                                                    + "_bas" + bas[0]
                                                                    # +"_tfbe" # TF backend for baseline
                                                                    # +"_qdo" # quad dist optimizer
                                                                    + (("_bi" if bas_hnl == tf.identity else (
                                                                        "_brel" if bas_hnl == tf.nn.relu else "_bth"))  # identity or relu or tanh for baseline
                                                                       # + "_" + str(baslayers)  # size
                                                                       + "_baslr" + str(bas_lr)
                                                                       + "_basas" + str(basas) if bas[0] in ["G",
                                                                                                             "M"] else "")  # baseline adam steps
                                                                    + ("r" if test_on_training_goals else "")
                                                                    + "_" + time.strftime("%d%m_%H_%M"))



                                                                policy = MAMLGaussianMLPPolicy(
                                                                    name="policy",
                                                                    env_spec=env.spec,
                                                                    grad_step_size=fast_learning_rate,
                                                                    hidden_nonlinearity=tf.nn.relu,
                                                                    hidden_sizes=(100, 100),
                                                                    std_modifier=pre_std_modifier,
                                                                    # metalearn_baseline=(bas == "MAMLGaussianMLP"),
                                                                    extra_input_dim=(0 if extra_input is None else extra_input_dim),
                                                                )
                                                                if bas == 'zero':
                                                                    baseline = ZeroBaseline(env_spec=env.spec)
                                                                elif bas == 'MAMLGaussianMLP':
                                                                    baseline = MAMLGaussianMLPBaseline(env_spec=env.spec,
                                                                                                       learning_rate=bas_lr,
                                                                                                       hidden_sizes=baslayers,
                                                                                                       hidden_nonlinearity=bas_hnl,
                                                                                                       repeat=basas,
                                                                                                       repeat_sym=basas,
                                                                                                       momentum=momentum,
                                                                                                       extra_input_dim=( 0 if extra_input is None else extra_input_dim),

                                                                                                       # learn_std=False,
                                                                                                       # use_trust_region=False,
                                                                                                       # optimizer=QuadDistExpertOptimizer(
                                                                                                       #      name="bas_optimizer",
                                                                                                       #     #  tf_optimizer_cls=tf.train.GradientDescentOptimizer,
                                                                                                       #     #  tf_optimizer_args=dict(
                                                                                                       #     #      learning_rate=bas_lr,
                                                                                                       #     #  ),
                                                                                                       #     # # tf_optimizer_cls=tf.train.AdamOptimizer,
                                                                                                       #     # max_epochs=200,
                                                                                                       #     # batch_size=None,
                                                                                                       #      adam_steps=basas
                                                                                                       #     )
                                                                                                       )

                                                                elif bas == 'linear':
                                                                    baseline = LinearFeatureBaseline(env_spec=env.spec)
                                                                elif "GaussianMLP" in bas:
                                                                    baseline = GaussianMLPBaseline(env_spec=env.spec,
                                                                                                   regressor_args=dict(
                                                                                                       hidden_sizes=baslayers,
                                                                                                       hidden_nonlinearity=bas_hnl,
                                                                                                       learn_std=False,
                                                                                                       # use_trust_region=False,
                                                                                                       # normalize_inputs=False,
                                                                                                       # normalize_outputs=False,
                                                                                                       optimizer=QuadDistExpertOptimizer(
                                                                                                           name="bas_optimizer",
                                                                                                           #  tf_optimizer_cls=tf.train.GradientDescentOptimizer,
                                                                                                           #  tf_optimizer_args=dict(
                                                                                                           #      learning_rate=bas_lr,
                                                                                                           #  ),
                                                                                                           # # tf_optimizer_cls=tf.train.AdamOptimizer,
                                                                                                           # max_epochs=200,
                                                                                                           # batch_size=None,
                                                                                                           adam_steps=basas,
                                                                                                           use_momentum_optimizer=True,
                                                                                                       )))
                                                                algo = MAMLIL(
                                                                    env=env,
                                                                    policy=policy,
                                                                    baseline=baseline,
                                                                    batch_size=fast_batch_size,  # number of trajs for alpha grad update
                                                                    max_path_length=max_path_length,
                                                                    meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
                                                                    num_grad_updates=num_grad_updates,  # number of alpha grad updates
                                                                    n_itr=800, #100
                                                                    make_video=True,
                                                                    use_maml=use_maml,
                                                                    use_pooled_goals=True,
                                                                    use_corr_term=use_corr_term,
                                                                    test_on_training_goals=test_on_training_goals,
                                                                    metalearn_baseline=(bas=="MAMLGaussianMLP"),
                                                                    # metalearn_baseline=False,
                                                                    limit_demos_num=limit_demos_num,
                                                                    test_goals_mult=test_goals_mult,
                                                                    step_size=meta_step_size,
                                                                    plot=False,
                                                                    beta_steps=beta_steps,
                                                                    adam_curve=adam_curve,
                                                                    adam_steps=adam_steps,
                                                                    pre_std_modifier=pre_std_modifier,
                                                                    l2loss_std_mult=l2loss_std_mult,
                                                                    importance_sampling_modifier=MOD_FUNC[ism],
                                                                    post_std_modifier_train=post_std_modifier_train,
                                                                    post_std_modifier_test=post_std_modifier_test,
                                                                    expert_trajs_dir=EXPERT_TRAJ_LOCATION_DICT[env_option+"."+mode+goals_suffix+("_"+str(extra_input_dim) if type(extra_input_dim) == int else "")],
                                                                    expert_trajs_suffix=("_"+str(extra_input_dim) if type(extra_input_dim) == int else ""),
                                                                    seed=seed,
                                                                    extra_input=extra_input,
                                                                    extra_input_dim=(0 if extra_input is None else extra_input_dim),
                                                                    input_feed=input_feed,
                                                                    run_on_pr2=False,

                                                                )
                                                                run_experiment_lite(
                                                                    algo.train(),
                                                                    n_parallel=1,
                                                                    snapshot_mode="last",
                                                                    python_command='python3',
                                                                    seed=seed,
                                                                    exp_prefix=str('R7_IL_'
                                                                                   +time.strftime("%D").replace("/", "")[0:4]),
                                                                    exp_name=exp_name,
                                                                    plot=False,
                                                                    sync_s3_pkl=True,
                                                                    mode=mode,
                                                                    terminate_machine=True,
                                                                )
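The fifteen nested for loops above enumerate a hyperparameter grid one list at a time. As a minimal, non-authoritative sketch (not part of the original script), the same grid can be flattened with itertools.product; every name below refers to a list already defined at the top of run_FaReLI:

import itertools

# Hypothetical flattening of the grid above, in the same nesting order.
grid = itertools.product(
    goals_suffixes, envseeds, seeds, baslayers_list,
    fast_batch_size_list, meta_batch_size_list,
    importance_sampling_modifier_list, limit_demos_num_list,
    l2loss_std_mult_list, post_std_modifier_train_list,
    post_std_modifier_test_list, pre_std_modifier_list,
    fast_learning_rates, beta_adam_steps_list, baselines,
)
for (goals_suffix, envseed, seed, baslayers, fast_batch_size, meta_batch_size,
     ism, limit_demos_num, l2loss_std_mult, post_std_modifier_train,
     post_std_modifier_test, pre_std_modifier, fast_learning_rate,
     (beta_steps, adam_steps), bas) in grid:
    # build env, policy, baseline and call run_experiment_lite exactly as above
    pass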
Code example #26
File: maze_gan_algo.py Project: shenghuanjie/dcl
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    tf_session = tf.Session()

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'],
                                                    bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0

    logger.log('Generating the Initial Heatmap...')
    plot_policy_means(policy,
                      env,
                      sampling_res=2,
                      report=report,
                      limit=v['start_range'],
                      center=v['start_center'])
    # test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
    #                      itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'])

    # GAN
    logger.log("Instantiating the GAN...")
    gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key}
    for key, value in gan_configs.items():
        if value is tf.train.AdamOptimizer:
            gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key +
                                                                  '_stepSize'])
        if value is tflearn.initializations.truncated_normal:
            gan_configs[key] = tflearn.initializations.truncated_normal(
                stddev=gan_configs[key + '_stddev'])

    gan = StateGAN(
        state_size=v['start_size'],
        evaluater_size=v['num_labels'],
        state_range=v['start_range'],
        state_center=v['start_center'],
        state_noise_level=v['start_noise_level'],
        generator_layers=v['gan_generator_layers'],
        discriminator_layers=v['gan_discriminator_layers'],
        noise_size=v['gan_noise_size'],
        tf_session=tf_session,
        configs=gan_configs,
    )
    logger.log("pretraining the GAN...")
    if v['smart_init']:
        feasible_starts = generate_starts(
            env, starts=[v['ultimate_goal']],
            horizon=50)  # without giving a policy, this performs Brownian motion
        labels = np.ones((feasible_starts.shape[0],
                          2)).astype(np.float32)  # make them all good goals
        plot_labeled_states(feasible_starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        dis_loss, gen_loss = gan.pretrain(states=feasible_starts,
                                          outer_iters=v['gan_outer_iters'])
        print("Loss of Gen and Dis: ", gen_loss, dis_loss)
    else:
        gan.pretrain_uniform(outer_iters=500,
                             report=report)  # v['gan_outer_iters'])

    # log first samples from the GAN
    initial_starts, _ = gan.sample_states_with_noise(v['num_new_starts'])

    logger.log("Labeling the starts")
    labels = label_states(initial_starts,
                          env,
                          policy,
                          v['horizon'],
                          as_goals=False,
                          n_traj=v['n_traj'],
                          key='goal_reached')

    plot_labeled_states(initial_starts,
                        labels,
                        report=report,
                        itr=outer_iter,
                        limit=v['goal_range'],
                        center=v['goal_center'],
                        maze_id=v['maze_id'])
    report.new_row()

    all_starts = StateCollection(distance_threshold=v['coll_eps'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        # Sample GAN
        logger.log("Sampling starts from the GAN")
        raw_starts, _ = gan.sample_states_with_noise(v['num_new_starts'])

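        # Mix freshly sampled GAN starts with a sample of starts replayed from
        # earlier iterations whenever the replay buffer is non-empty.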
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([raw_starts, old_starts])
        else:
            starts = raw_starts

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(
                trpo_paths,
                n_traj=2,
                key='goal_reached',  # using the min n_traj
                as_goal=False,
                env=env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts,
                                         env,
                                         policy,
                                         v['horizon'],
                                         as_goals=False,
                                         n_traj=v['n_traj'],
                                         key='goal_reached',
                                         full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)
        plot_labeled_states(starts,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            maze_id=v['maze_id'])

        logger.log('Generating the Heatmap...')
        plot_policy_means(policy,
                          env,
                          sampling_res=2,
                          report=report,
                          limit=v['start_range'],
                          center=v['start_center'])
        test_and_plot_policy(policy,
                             env,
                             as_goals=False,
                             max_reward=v['max_reward'],
                             sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             limit=v['goal_range'],
                             center=v['goal_center'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

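        # Collapse the two label columns into a single binary label: a start is a
        # positive example for the GAN only if both criteria hold.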
        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Training the GAN")
        if np.any(labels):
            gan.train(
                starts,
                labels,
                v['gan_outer_iters'],
            )

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        filtered_raw_start = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        all_starts.append(filtered_raw_start)
Code example #27
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    tf_session = tf.Session()

    inner_env = normalize(AntEnv())

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'],
                                                   bounds=v['goal_range'],
                                                   center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env,
        goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        append_transformed_obs=v['append_transformed_obs'],
        append_extra_info=v['append_extra_info'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16,
                          16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    if v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0
    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy,
                         env,
                         max_reward=v['max_reward'],
                         sampling_res=sampling_res,
                         n_traj=v['n_traj'],
                         itr=outer_iter,
                         report=report,
                         limit=v['goal_range'],
                         center=v['goal_center'],
                         bounds=v['goal_range'])

    # GAN
    logger.log("Instantiating the GAN...")
    gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key}
    for key, value in gan_configs.items():
        if value is tf.train.AdamOptimizer:
            gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key +
                                                                  '_stepSize'])
        if value is tflearn.initializations.truncated_normal:
            gan_configs[key] = tflearn.initializations.truncated_normal(
                stddev=gan_configs[key + '_stddev'])

    gan = StateGAN(
        state_size=v['goal_size'],
        evaluater_size=v['num_labels'],
        state_range=v['goal_range'],
        state_center=v['goal_center'],
        state_noise_level=v['goal_noise_level'],
        generator_layers=v['gan_generator_layers'],
        discriminator_layers=v['gan_discriminator_layers'],
        noise_size=v['gan_noise_size'],
        tf_session=tf_session,
        configs=gan_configs,
    )

    # log first samples from the GAN
    initial_goals, _ = gan.sample_states_with_noise(v['num_new_goals'])

    logger.log("Labeling the goals")
    labels = label_states(initial_goals,
                          env,
                          policy,
                          v['horizon'],
                          n_traj=v['n_traj'],
                          key='goal_reached')

    plot_labeled_states(initial_goals,
                        labels,
                        report=report,
                        itr=outer_iter,
                        limit=v['goal_range'],
                        center=v['goal_center'])
    report.new_row()

    all_goals = StateCollection(distance_threshold=v['coll_eps'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        feasible_goals = generate_initial_goals(env,
                                                policy,
                                                v['goal_range'],
                                                goal_center=v['goal_center'],
                                                horizon=v['horizon'])
        labels = np.ones((feasible_goals.shape[0],
                          2)).astype(np.float32)  # make them all good goals
        plot_labeled_states(feasible_goals,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            summary_string_base='On-policy Goals:\n')
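        # Either subsample goals directly from the on-policy feasible goals, or fit
        # the GAN to them and sample fresh candidates (optionally mixed with
        # replayed old goals) below.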
        if v['only_on_policy']:
            goals = feasible_goals[np.random.choice(
                feasible_goals.shape[0], v['num_new_goals'], replace=False), :]
        else:
            logger.log("Training the GAN")
            gan.pretrain(feasible_goals, v['gan_outer_iters'])
            # Sample GAN
            logger.log("Sampling goals from the GAN")
            raw_goals, _ = gan.sample_states_with_noise(v['num_new_goals'])

            if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
                old_goals = all_goals.sample(v['num_old_goals'])
                goals = np.vstack([raw_goals, old_goals])
            else:
                goals = raw_goals

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [goals, labels] = label_states_from_paths(
                trpo_paths,
                n_traj=2,
                key='goal_reached',  # using the min n_traj
                as_goal=True,
                env=env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(goals,
                                         env,
                                         policy,
                                         v['horizon'],
                                         as_goals=True,
                                         n_traj=v['n_traj'],
                                         key='goal_reached',
                                         full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy,
                             env,
                             max_reward=v['max_reward'],
                             sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             limit=v['goal_range'],
                             center=v['goal_center'],
                             bounds=v['goal_range'])

        plot_labeled_states(goals,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'])

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        filtered_raw_goals = [
            goal for goal, label in zip(goals, labels) if label[0] == 1
        ]  # this is not used if no replay buffer
        all_goals.append(filtered_raw_goals)

        if v['add_on_policy']:
            logger.log("sampling on policy")
            feasible_goals = generate_initial_goals(
                env,
                policy,
                v['goal_range'],
                goal_center=v['goal_center'],
                horizon=v['horizon'])
            # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']),:]
            all_goals.append(feasible_goals)
Code example #28
def run_task(*_):
    auton_cars = 20

    sumo_params = SumoParams(time_step=0.1,
                             human_speed_mode="no_collide",
                             rl_speed_mode="no_collide",
                             sumo_binary="sumo-gui")

    vehicles = Vehicles()
    vehicles.add_vehicles("idm", (RLController, {}), None, None, 0, 20)

    intensity = .2
    v_enter = 10
    env_params = EnvParams(additional_params={
        "target_velocity": v_enter,
        "control-length": 150,
        "max_speed": v_enter
    })

    additional_net_params = {
        "horizontal_length_in": 400,
        "horizontal_length_out": 800,
        "horizontal_lanes": 1,
        "vertical_length_in": 400,
        "vertical_length_out": 800,
        "vertical_lanes": 1,
        "speed_limit": {
            "horizontal": v_enter,
            "vertical": v_enter
        }
    }
    net_params = NetParams(no_internal_links=False,
                           additional_params=additional_net_params)

    cfg_params = {"start_time": 0, "end_time": 3000, "cfg_path": "debug/cfg/"}

    initial_config = InitialConfig(spacing="custom",
                                   additional_params={
                                       "intensity": intensity,
                                       "enter_speed": v_enter
                                   })

    scenario = TwoWayIntersectionScenario("two-way-intersection",
                                          TwoWayIntersectionGenerator,
                                          vehicles,
                                          net_params,
                                          initial_config=initial_config)

    env = TwoIntersectionEnvironment(env_params, sumo_params, scenario)
    env_name = "TwoIntersectionEnvironment"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)
    logging.info("Experiment Set Up complete")

    print("experiment initialized")

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=30000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=200,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
Code example #29
# The action space is a space object, so compare its class name rather than the
# object itself (comparing the object to the string 'Discrete' is always False).
if type(env.spec.action_space).__name__ == 'Discrete':
    policy = CategoricalMLPPolicy(
        name="policy",
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
    )
else:
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(100, 50, 25),
    )

baseline = LinearFeatureBaseline(env_spec=env.spec)

iters = args.num_iters

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=args.batch_size, # Mujoco tasks need 20000-50000
    max_path_length=env.horizon, # And 500
    n_itr=iters,
    discount=0.99,
    step_size=0.01,
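    # FiniteDifferenceHvp approximates the Hessian-vector products used by the
    # conjugate-gradient step with finite differences of the policy gradient.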
    optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
)
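This snippet stops after constructing the TRPO object; like the other examples in this collection it would then be trained with a call such as the following (a hedged sketch, not part of the original snippet):

# Kick off optimization; blocks until all n_itr iterations are finished.
algo.train()

Under rllab's stubbed experiment mode the same call would instead be passed to run_experiment_lite, as code example #25 does.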
Code example #30
def run_task(*_):
    """Implement the run_task method needed to run experiments with rllab."""
    V_ENTER = 30
    INNER_LENGTH = 300
    LONG_LENGTH = 100
    SHORT_LENGTH = 300
    N_ROWS = 3
    N_COLUMNS = 3
    NUM_CARS_LEFT = 1
    NUM_CARS_RIGHT = 1
    NUM_CARS_TOP = 1
    NUM_CARS_BOT = 1
    tot_cars = (NUM_CARS_LEFT + NUM_CARS_RIGHT) * N_COLUMNS \
        + (NUM_CARS_BOT + NUM_CARS_TOP) * N_ROWS

    grid_array = {
        "short_length": SHORT_LENGTH,
        "inner_length": INNER_LENGTH,
        "long_length": LONG_LENGTH,
        "row_num": N_ROWS,
        "col_num": N_COLUMNS,
        "cars_left": NUM_CARS_LEFT,
        "cars_right": NUM_CARS_RIGHT,
        "cars_top": NUM_CARS_TOP,
        "cars_bot": NUM_CARS_BOT
    }

    sim_params = SumoParams(sim_step=1, render=True)

    vehicles = VehicleParams()
    vehicles.add(veh_id="idm",
                 acceleration_controller=(SimCarFollowingController, {}),
                 car_following_params=SumoCarFollowingParams(
                     min_gap=2.5,
                     tau=1.1,
                     max_speed=V_ENTER,
                     speed_mode="all_checks"),
                 routing_controller=(GridRouter, {}),
                 num_vehicles=tot_cars)

    tl_logic = TrafficLightParams(baseline=False)

    additional_env_params = {
        "target_velocity": 50,
        "switch_time": 3.0,
        "num_observed": 2,
        "discrete": False,
        "tl_type": "controlled"
    }
    env_params = EnvParams(additional_params=additional_env_params)

    additional_net_params = {
        "speed_limit": 35,
        "grid_array": grid_array,
        "horizontal_lanes": 1,
        "vertical_lanes": 1
    }

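    # USE_INFLOWS and EDGE_INFLOW are defined outside this snippet; they choose
    # between inflow-based and fixed ("non-flow") initial vehicle placement.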
    if USE_INFLOWS:
        initial_config, net_params = get_flow_params(
            v_enter=V_ENTER,
            vehs_per_hour=EDGE_INFLOW,
            col_num=N_COLUMNS,
            row_num=N_ROWS,
            add_net_params=additional_net_params)
    else:
        initial_config, net_params = get_non_flow_params(
            V_ENTER, additional_net_params)

    scenario = SimpleGridScenario(name="grid-intersection",
                                  vehicles=vehicles,
                                  net_params=net_params,
                                  initial_config=initial_config,
                                  traffic_lights=tl_logic)

    env_name = "PO_TrafficLightGridEnv"
    pass_params = (env_name, sim_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=40000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=800,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
Code example #31
File: vpg_2.py Project: QuantCollective/maml_rl
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.envs.normalized_env import normalize
import numpy as np
import theano
import theano.tensor as TT
from lasagne.updates import adam

# normalize() makes sure that the actions for the environment lies
# within the range [-1, 1] (only works for environments with continuous actions)
env = normalize(CartpoleEnv())
# Initialize a neural network policy with a single hidden layer of 8 hidden units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,))
# Initialize a linear baseline estimator using default hand-crafted features
baseline = LinearFeatureBaseline(env.spec)

# We will collect 100 trajectories per iteration
N = 100
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 100
# Set the discount factor for the problem
discount = 0.99
# Learning rate for the gradient update
learning_rate = 0.1

# Construct the computation graph

# Create a Theano variable for storing the observations
Code example #32
    def train(self):

        expert_env = TfEnv(
            self.expert_env
        )  #TfEnv(GymEnv("Pusher3DOF-v1", force_reset=True, record_video=False))
        # expert_env = TfEnv(normalize(ReacherEnv()))
        novice_env = TfEnv(
            self.novice_env
        )  #TfEnv(GymEnv("Pusher3DOFNoChange-v1", force_reset=True, record_video=True))

        # novice_env = TfEnv(normalize(ReacherTwoEnv(), normalize_obs=True))
        expert_fail_pol = RandomPolicy(expert_env.spec)

        policy = GaussianMLPPolicy(
            name="novice_policy",
            env_spec=novice_env.spec,
            init_std=10,
            # The neural network policy should have two hidden layers, each with 32 hidden units.
            hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=expert_env.spec)

        algo = TRPO(env=novice_env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=4000,
                    max_path_length=self.horizon,
                    n_itr=self.itrs,
                    discount=0.99,
                    step_size=0.01,
                    optimizer=ConjugateGradientOptimizer(
                        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:

            #What do the n_itr and start_itr mean?
            algo.n_itr = 0
            algo.start_itr = 0
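            # With n_itr = 0, the call below presumably only builds the graph and
            # starts the samplers without running any policy updates; the GAIL
            # trainer constructed further down drives the actual optimization.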
            algo.train(sess=sess)  #TODO: What is happening here?

            im_height = self.imsize[0]
            im_width = self.imsize[1]
            im_channels = 3

            dim_input = [im_height, im_width, im_channels]

            disc = ConvDiscriminator(input_dim=dim_input)

            #data = joblib.load(self.expert_pkl)#"/home/andrewliu/research/viewpoint/rllab-tpil/third_person_im/data/local/experiment/experiment_2017_05_07_20_58_39_0001/itr_123.pkl")#"/home/abhigupta/abhishek_sandbox/viewpoint/third_person_im/data/local/experiment/experiment_2017_05_06_18_07_38_0001/itr_900.pkl")
            #expert_policy = data['policy']
            with open(self.expert_pkl, 'rb') as pfile:
                expert_policy = pickle.load(pfile)
            # expert_policy = load_expert_reacher(expert_env, sess) #Load the expert #TODO: Need to train the expert

            #from rllab.sampler.utils import rollout
            #while True:
            #        t = rollout(env=expert_env, agent=expert_policy, max_path_length=50, animated=True)

            algo.n_itr = self.itrs
            trainer = CyberPunkTrainerGAIL(disc=disc,
                                           novice_policy_env=novice_env,
                                           expert_env=expert_env,
                                           novice_policy=policy,
                                           novice_policy_opt_algo=algo,
                                           expert_success_pol=expert_policy,
                                           im_width=im_width,
                                           im_height=im_height,
                                           im_channels=im_channels,
                                           tf_sess=sess,
                                           horizon=self.horizon)

            iterations = self.itrs
            for iter_step in range(0, iterations):
                logger.record_tabular('Iteration', iter_step)
                trainer.take_iteration(n_trajs_cost=self.trajs,
                                       n_trajs_policy=self.trajs)
                logger.dump_tabular(with_prefix=False)

            trainer.log_and_finish()