Example #1
def evaluate_policy(env, input_pol, exp, params, n_tests=100, render=False):
    '''
    Re-runs every set of policy parameters stored in the experience
    dataset `exp`: each one is loaded into `input_pol` and executed for
    `n_tests` episodes of up to `min_steps` steps. Empty parameter sets
    fall back to a uniformly-random policy.
    '''
    H = params['min_steps']
    angle_dims = params['angle_dims']

    # represent the angle dimensions of the state as sine/cosine pairs
    def gTrig(state):
        return utils.gTrig_np(state, angle_dims).flatten()

    # per-step callback; only used for rendering here
    def step_cb(*args, **kwargs):
        if render:
            env.render()

    results = []
    for i, p in enumerate(exp.policy_parameters):
        utils.print_with_stamp('Evaluating policy at iteration %d' % i)
        if p:
            input_pol.set_params(p)
            pol = input_pol
        else:
            pol = control.RandPolicy(maxU=input_pol.maxU)

        results_i = []
        for it in range(n_tests):
            ret = apply_controller(env,
                                   pol,
                                   H,
                                   preprocess=gTrig,
                                   callback=step_cb)
            results_i.append(ret)
        results.append(results_i)

    return results
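
A minimal usage sketch for the function above. The setup call and the objects it returns are placeholders for whatever your own experiment-construction code provides (they are not part of the library); only the call to evaluate_policy itself follows the signature shown in the example.

# hypothetical setup: env, pol, exp and params come from your own code
env, pol, exp, params = make_my_experiment()  # placeholder, assumed to exist

# run 10 evaluation episodes for each stored set of policy parameters
results = evaluate_policy(env, pol, exp, params, n_tests=10, render=False)

# results[i][j] is whatever apply_controller returned for the j-th test of
# the i-th saved policy; its exact structure depends on the library version
print('%d policy iterations evaluated' % len(results))
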
Example #2
def run_pilco_experiment(exp_setup=mcpilco_cartpole_experiment,
                         params=None,
                         loss_kwargs=None,
                         polopt_kwargs=None,
                         extra_inps=None,
                         step_cb=None,
                         polopt_cb=None,
                         learning_iteration_cb=None,
                         max_dataset_size=0,
                         render=False,
                         debug_plot=0):
    # avoid shared mutable default arguments across repeated calls
    loss_kwargs = {} if loss_kwargs is None else loss_kwargs
    polopt_kwargs = {} if polopt_kwargs is None else polopt_kwargs
    extra_inps = [] if extra_inps is None else extra_inps

    # setup experiment
    exp_objs = exp_setup(params)
    p0, env, pol, dyn, cost, exp, polopt, learner, params = exp_objs
    n_rnd = params.get('n_rnd', 1)
    n_opt = params.get('n_opt', 100)
    return_best = params.get('return_best', False)
    crn_dropout = params.get('crn_dropout', True)
    H = params.get('min_steps', 100)
    gamma = params.get('discount', 1.0)
    angle_dims = params.get('angle_dims', [])

    # init callbacks
    # callback executed after every call to env.step
    def step_cb_internal(state, action, cost, info):
        exp.add_sample(state, action, cost, info)
        if render:
            env.render()
        if callable(step_cb):
            step_cb(state, action, cost, info)

    # callback passed to the policy optimizer (see polopt.minimize below)
    def polopt_cb_internal(*args, **kwargs):
        if not crn_dropout:
            if hasattr(dyn, 'update'):
                dyn.update()
            if hasattr(pol, 'update'):
                pol.update()
        if callable(polopt_cb):
            polopt_cb(*args, **kwargs)

    # function to execute before applying policy
    def gTrig(state):
        return utils.gTrig_np(state, angle_dims).flatten()

    # collect experience with random controls
    randpol = control.RandPolicy(maxU=pol.maxU)
    for i in range(n_rnd):
        exp.new_episode()
        if n_rnd > 1 and i == n_rnd - 1:
            p = pol
            utils.print_with_stamp('Executing initial policy')
        else:
            p = randpol
            utils.print_with_stamp('Executing uniformly-random controls')
        apply_controller(env,
                         p,
                         H,
                         preprocess=gTrig,
                         callback=step_cb_internal)

    # 1. train dynamics once
    train_dynamics(dyn,
                   exp,
                   angle_dims=angle_dims,
                   max_dataset_size=max_dataset_size)

    # build loss function
    loss, inps, updts = learner.get_loss(pol, dyn, cost, angle_dims,
                                         **loss_kwargs)

    if debug_plot > 0:
        # build rollout function for plotting
        loss_kwargs['mm_state'] = False
        rollout_fn = learner.build_rollout(pol, dyn, cost, angle_dims,
                                           **loss_kwargs)
        fig, axarr = None, None

    # set objective of policy optimizer
    inps += extra_inps
    polopt.set_objective(loss, pol.get_params(symbolic=True), inps, updts,
                         **polopt_kwargs)

    # initial call so that the user gets the state before
    # the first learning iteration
    if callable(learning_iteration_cb):
        learning_iteration_cb(exp, dyn, pol, polopt, params)

    if crn_dropout:
        utils.print_with_stamp('using common random numbers for dyn and pol',
                               'experiment_utils')
    else:
        utils.print_with_stamp('resampling weights for dyn and pol',
                               'experiment_utils')
    for i in range(n_opt):
        total_exp = sum([len(st) for st in exp.states])
        msg = '==== Iteration [%d], experience: [%d steps] ===='
        utils.print_with_stamp(msg % (i + 1, total_exp))
        if crn_dropout:
            if hasattr(dyn, 'update'):
                dyn.update()
            if hasattr(pol, 'update'):
                pol.update()

        # get initial state distribution (assumed Gaussian)
        x0 = np.array([st[0] for st in exp.states])
        m0 = x0.mean(0)
        if len(x0) > 10:
            S0 = np.cov(x0, rowvar=False, ddof=1) + 1e-4 * np.eye(x0.shape[1])
        else:
            S0 = p0.cov

        # 2. optimize policy
        polopt_args = [m0, S0, H, gamma]
        if isinstance(polopt, optimizers.SGDOptimizer):
            # check if we have a learning rate parameter
            lr = params.get('learning_rate', 1e-4)
            if callable(lr):
                lr = lr(i)
            polopt_args.append(lr)
        polopt.minimize(*polopt_args,
                        callback=polopt_cb_internal,
                        return_best=return_best)

        # 3. apply controller
        exp.new_episode(policy_params=pol.get_params(symbolic=False))
        apply_controller(env,
                         pol,
                         H,
                         preprocess=gTrig,
                         callback=step_cb_internal)
        # 4. train dynamics once
        train_dynamics(dyn,
                       exp,
                       angle_dims=angle_dims,
                       max_dataset_size=max_dataset_size)

        if callable(learning_iteration_cb):
            # user callback
            learning_iteration_cb(exp, dyn, pol, polopt, params)

        if debug_plot > 0:
            fig, axarr = plot_rollout(rollout_fn,
                                      exp,
                                      m0,
                                      S0,
                                      H,
                                      gamma,
                                      fig=fig,
                                      axarr=axarr)
    env.close()
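
A minimal sketch of how run_pilco_experiment might be invoked, assuming the default mcpilco_cartpole_experiment setup. The parameter values below are illustrative only; the keys match the params.get(...) lookups in the function body, and the setup function may merge or override them with its own defaults. The callback signature mirrors the internal calls to learning_iteration_cb(exp, dyn, pol, polopt, params).

# user callback: invoked once before the first learning iteration and once
# after each iteration, with the objects built by the experiment setup
def on_iteration(exp, dyn, pol, polopt, params):
    total_steps = sum(len(st) for st in exp.states)
    print('collected %d environment steps so far' % total_steps)

# illustrative settings, not recommended values
experiment_params = dict(n_rnd=2,             # episodes with random controls
                         n_opt=50,            # policy-learning iterations
                         min_steps=100,       # control horizon H
                         discount=1.0,        # cost discount factor gamma
                         angle_dims=[3],      # state dims treated as angles
                         learning_rate=1e-3)  # used only by SGD optimizers

run_pilco_experiment(params=experiment_params,
                     learning_iteration_cb=on_iteration,
                     render=False,
                     debug_plot=0)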