Example No. 1

# Assumed imports for this excerpt; the environment modules (cartpole, double_cartpole,
# pendulum) and several project-specific helpers (MultiOutputRegressionWrapper, unpack,
# update_hyperstate, the base Agent class) are defined elsewhere and are not shown here.
import argparse
import sys
from functools import partial

import numpy as np


def setup_double_cartpole_experiment(params=None):
    # get experiment parameters
    if params is None:
        params = double_cartpole.default_params()

    # init environment
    env = double_cartpole.DoubleCartpole(**params['plant'])

    # init cost model
    cost = partial(double_cartpole.double_cartpole_loss, **params['cost'])

    return env, cost, params
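
# A minimal usage sketch (not part of the original excerpt); it only assumes the
# gym-style reset() interface and the batched cost call used elsewhere in this file:
#
#   env, cost, params = setup_double_cartpole_experiment()
#   state = env.reset()
#   initial_cost = cost(state[None, :])   # the loss functions accept a batch of states
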
def main_loop():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, choices=['cartpole', 'double_cartpole', 'pendulum'], default='cartpole')
    parser.add_argument("--discount_factor", type=float, default=.995)
    parser.add_argument("--gather_data_epochs", type=int, default=3, help='Epochs for initial data gather.')
    parser.add_argument("--train_hp_iterations", type=int, default=2000*10)
    parser.add_argument("--train_policy_batch_size", type=int, default=30)
    parser.add_argument("--no_samples", type=int, default=1)
    parser.add_argument("--basis_dim", type=int, default=256)
    parser.add_argument("--hidden_dim", type=int, default=32)
    parser.add_argument("--rffm_seed", type=int, default=1)
    parser.add_argument("--Agent", type=str, choices=['', '2'], default='')
    parser.add_argument("--max_train_hp_datapoints", type=int, default=20000)
    parser.add_argument("--update_hyperstate", type=int, default=1)
    parser.add_argument("--policy_use_hyperstate", type=int, default=1)
    parser.add_argument("--cma_maxiter", type=int, default=1000)
    parser.add_argument("--learn_diff", type=int, choices=[0, 1], default=0)
    parser.add_argument("--dump_model", type=int, choices=[0, 1], default=0)
    args = parser.parse_args()

    print(sys.argv)
    print(args)
    from blr_regression2_sans_hyperstate_kusanagi_multioutput import Agent2

    if args.env == 'cartpole':
        params = cartpole.default_params()
        cost = partial(cartpole.cartpole_loss, **params['cost'])
        env = cartpole.Cartpole(loss_func=cost, **params['plant'])
        max_steps = 25
        maxA = 10.
    elif args.env == 'double_cartpole':
        params = double_cartpole.default_params()
        cost = partial(double_cartpole.double_cartpole_loss, **params['cost'])
        env = double_cartpole.DoubleCartpole(loss_func=cost, **params['plant'])
        max_steps = 30
        maxA = 20.
    elif args.env == 'pendulum':
        params = pendulum.default_params()
        cost = partial(pendulum.pendulum_loss, **params['cost'])
        env = pendulum.Pendulum(loss_func=cost, **params['plant'])
        max_steps = 40
        maxA = 2.5
    else:
        raise Exception('Unknown environment.')


    regression_wrapper_state = MultiOutputRegressionWrapper(input_dim=env.observation_space.shape[0]+env.action_space.shape[0],
                                                            output_dim=env.observation_space.shape[0],
                                                            basis_dim=args.basis_dim,
                                                            length_scale=1.,
                                                            signal_sd=1.,
                                                            noise_sd=5e-4,
                                                            prior_sd=1.,
                                                            rffm_seed=args.rffm_seed,
                                                            train_hp_iterations=args.train_hp_iterations)
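
    # The agent below reuses the regression wrapper's random feature matrix, bias and
    # basis dimension, so its internal model rollouts operate in the same feature space
    # as the learned dynamics model.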
    # NOTE: resolves to Agent or Agent2; only Agent2 is imported in this excerpt, so the
    # base Agent class must be provided elsewhere when --Agent '' (the default) is used.
    agent = eval('Agent'+args.Agent)(env=env,
                                     x_dim=env.observation_space.shape[0]+env.action_space.shape[0],
                                     y_dim=env.observation_space.shape[0],
                                     state_dim=env.observation_space.shape[0],
                                     action_dim=env.action_space.shape[0],
                                     observation_space_low=env.observation_space.low,
                                     observation_space_high=env.observation_space.high,
                                     action_space_low=np.array([-maxA]),
                                     action_space_high=np.array([maxA]),
                                     unroll_steps=max_steps,
                                     no_samples=args.no_samples,
                                     discount_factor=args.discount_factor,

                                     random_matrix_state=regression_wrapper_state.random_matrix,
                                     bias_state=regression_wrapper_state.bias,
                                     basis_dim_state=regression_wrapper_state.basis_dim,

                                     hidden_dim=args.hidden_dim,
                                     update_hyperstate=args.update_hyperstate,
                                     policy_use_hyperstate=args.policy_use_hyperstate,
                                     learn_diff=args.learn_diff,
                                     dump_model=args.dump_model)


    #I have to work on the classes before working on the code below.
    flag = False
    from utils import get_data3
    data_buffer = get_data3(env, trials=args.gather_data_epochs, max_steps=max_steps, maxA=maxA)

    init_states = np.stack([env.reset() for _ in range(args.train_policy_batch_size)], axis=0)


    for epoch in range(1000):
        #Train hyperparameters and update systems model.
        states_actions, states, rewards, next_states = unpack(data_buffer)

        next_states_train = (next_states.copy() - states.copy()) if args.learn_diff else next_states.copy()

        if not flag:
            regression_wrapper_state._train_hyperparameters(states_actions, next_states_train)
            regression_wrapper_state._reset_statistics(states_actions, next_states_train)
        else:
            regression_wrapper_state._update(states_actions, next_states_train)

        if len(data_buffer) >= args.max_train_hp_datapoints: flag = True
        if flag: data_buffer = []
        tmp_data_buffer = []

        #Fit policy network.
        #XX, Xy, hyperparameters = zip(*[[rw.XX, rw.Xy, rw.hyperparameters] for rw in regression_wrappers])
        #eval('agent.'+args.fit_function)(args.cma_maxiter, np.copy(init_states), [np.copy(ele) for ele in XX], [np.copy(ele) for ele in Xy], [np.copy(ele) for ele in hyperparameters], sess)
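        # _fit optimizes the policy parameters (cma_maxiter suggests CMA-ES) by rolling out
        # the learned model from init_states; XX, Xy and hyperparameters are the regression
        # model's sufficient statistics and kernel hyperparameters.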
        agent._fit(args.cma_maxiter,
                   init_states.copy(),
                   regression_wrapper_state.XX.copy(),
                   regression_wrapper_state.Xy.copy(),
                   regression_wrapper_state.hyperparameters.copy())

        #Get hyperstate & hyperparameters
        hyperstate_params = [regression_wrapper_state.Llower.copy()[None, ...],
                             regression_wrapper_state.Xy.copy()[None, ...]]
        total_rewards = 0.
        state = env.reset()
        steps = 0
        while True:
            #env.render()
            action = agent._forward(agent.thetas, state[np.newaxis, ...], hyperstate_params)[0]
            next_state, cost, done, _ = env.step(action)
            reward = -cost
            steps += 1

            # update_hyperstate is a project helper (not shown in this excerpt): it folds the
            # new transition into the running hyperstate (Llower, Xy) used by the policy.
            hyperstate_params = update_hyperstate(agent,
                                                  hyperstate_params,
                                                  regression_wrapper_state.hyperparameters.copy(),
                                                  [state, action, reward, next_state, done],
                                                  args.learn_diff)

            tmp_data_buffer.append([state, action, reward, next_state, done])
            total_rewards += float(reward)
            state = next_state.copy()
            if done or steps >= max_steps:
                print('epoch:', epoch, 'total_rewards:', total_rewards)
                data_buffer.extend(tmp_data_buffer)
                break
Example No. 3

# Assumed imports for this excerpt: argparse, numpy as np, matplotlib.pyplot as plt and
# functools.partial, plus the project's environment modules, get_data2 and the regression
# wrapper classes, which are defined elsewhere.
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--env", type=str, choices=['cartpole', 'double_cartpole', 'pendulum'], default='cartpole')
    parser.add_argument("--train_hp_iterations", type=int, default=2000)
    parser.add_argument("--basis_dim", type=int, default=256)
    parser.add_argument("--basis_dim_reward", type=int, default=600)
    parser.add_argument("--matern_param", type=float, default=np.inf)
    parser.add_argument("--matern_param_reward", type=float, default=np.inf)
    parser.add_argument("--update_hyperstate", type=int, default=0)

    parser.add_argument("--trials", type=int, default=1)

    args = parser.parse_args()
    print(args)

    if args.env == 'cartpole':
        params = cartpole.default_params()
        cost = partial(cartpole.cartpole_loss, **params['cost'])
        env = cartpole.Cartpole(loss_func=cost, **params['plant'])
        max_steps = 25
        maxA = 10.
    elif args.env == 'double_cartpole':
        params = double_cartpole.default_params()
        cost = partial(double_cartpole.double_cartpole_loss, **params['cost'])
        env = double_cartpole.DoubleCartpole(loss_func=cost, **params['plant'])
        max_steps = 30
        maxA = 20.
    elif args.env == 'pendulum':
        params = pendulum.default_params()
        cost = partial(pendulum.pendulum_loss, **params['cost'])
        env = pendulum.Pendulum(loss_func=cost, **params['plant'])
        max_steps = 40
        maxA = 2.5
    else:
        raise Exception('Unknown environment.')


    states, actions, rewards, next_states = get_data2(env, trials=args.trials, max_steps=max_steps, maxA=maxA)
    states_actions = np.concatenate([states, actions], axis=-1)

    predictor = MultiOutputRegressionWrapper(input_dim=env.observation_space.shape[0]+env.action_space.shape[0],
                                             output_dim=env.observation_space.shape[0],
                                             basis_dim=args.basis_dim,
                                             length_scale=1.,
                                             signal_sd=1.,
                                             noise_sd=5e-4,
                                             prior_sd=1.,
                                             rffm_seed=1,
                                             train_hp_iterations=args.train_hp_iterations,
                                             matern_param=args.matern_param)
    predictor._train_hyperparameters(states_actions, next_states)

    '''
    predictors = []
    for i in range(env.observation_space.shape[0]):
        predictors.append(RegressionWrapper2(input_dim=env.observation_space.shape[0]+env.action_space.shape[0], basis_dim=args.basis_dim, length_scale=1.,
                                          signal_sd=1., noise_sd=5e-4, prior_sd=1., rffm_seed=1, train_hp_iterations=args.train_hp_iterations, matern_param=args.matern_param))

    for i in range(env.observation_space.shape[0]):
        predictors[i]._train_hyperparameters(states_actions, next_states[:, i:i+1])
    '''
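    # Demo loop (one figure per iteration): refit the model statistics on the training
    # data, collect one fresh trajectory, then plot per-dimension one-step predictions
    # (top row), sampled multi-step rollouts (middle row) and predicted vs. observed
    # rewards (bottom row).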

    while True:
        '''
        for i in range(env.observation_space.shape[0]):
            predictors[i]._reset_statistics(states_actions, next_states[:, i:i+1], bool(args.update_hyperstate))
        '''
        predictor._reset_statistics(states_actions, next_states)

        states2, actions2, rewards2, next_states2 = get_data2(env, trials=1, max_steps=max_steps, maxA=maxA)
        states_actions2 = np.concatenate([states2, actions2], axis=-1)

        plt.figure()

        predict_mu, predict_sigma = predictor._predict(states_actions2)
        for i in range(env.observation_space.shape[0]):
            plt.subplot(3, env.observation_space.shape[0], i+1)
            plt.plot(np.arange(len(next_states2[:, i:i+1])), next_states2[:, i:i+1])
            plt.errorbar(np.arange(len(predict_mu[:, i:i+1])), predict_mu[:, i:i+1], yerr=np.sqrt(predict_sigma), color='m', ecolor='g')
            plt.grid()

        '''
        for i in range(env.observation_space.shape[0]):
            plt.subplot(3, env.observation_space.shape[0], i+1)

            predict_mu, predict_sigma = predictors[i]._predict(states_actions2, False)

            plt.plot(np.arange(len(next_states2[:, i:i+1])), next_states2[:, i:i+1])
            plt.errorbar(np.arange(len(predict_mu)), predict_mu, yerr=np.sqrt(predict_sigma), color='m', ecolor='g')
            plt.grid()
        '''

        traj_reward = []
        traj = []
        no_lines = 50
        state = np.tile(np.copy(states2[0:1, ...]), [no_lines, 1])
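        # Propagate no_lines particles through the learned model along the recorded action
        # sequence: each step samples the next state from the model's predictive Gaussian.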
        for a in actions2:
            action = np.tile(a[np.newaxis, ...], [no_lines, 1])
            state_action = np.concatenate([state, action], axis=-1)

            predict_mu, predict_sigma = predictor._predict(state_action)
            state = predict_mu + np.sqrt(predict_sigma) * np.random.normal(size=predict_mu.shape)
            '''
            mu_vec = []
            sigma_vec = []
            for i in range(env.observation_space.shape[0]):
                predict_mu, predict_sigma = predictors[i]._predict(state_action, bool(args.update_hyperstate))
                mu_vec.append(predict_mu)
                sigma_vec.append(predict_sigma)
            mu_vec = np.concatenate(mu_vec, axis=-1)
            sigma_vec = np.concatenate(sigma_vec, axis=-1)
            state = np.stack([np.random.multivariate_normal(mu, np.diag(sigma)) for mu, sigma in zip(mu_vec, sigma_vec)], axis=0)
            '''

            state = np.clip(state, env.observation_space.low, env.observation_space.high)
            traj.append(np.copy(state))

            reward = -env.loss_func(state)
            traj_reward.append(reward)

            '''
            for i in range(env.observation_space.shape[0]):
                predictors[i]._update_hyperstate(state_action, state[:, i:i+1], bool(args.update_hyperstate))
            '''

        traj_reward = np.stack(traj_reward, axis=-1)
        traj = np.stack(traj, axis=-1)
        
        plt.subplot(3, 1, 3)
        for j in range(no_lines):
            y = traj_reward[j, :]
            plt.plot(np.arange(len(y)), y, color='r')
        plt.plot(np.arange(len(rewards2)), rewards2)
        plt.grid()

        for i in range(env.observation_space.shape[0]):
            plt.subplot(3, env.observation_space.shape[0], env.observation_space.shape[0]+i+1)
            for j in range(no_lines):
                y = traj[j, i, :]
                plt.plot(np.arange(len(y)), y, color='r')

            plt.plot(np.arange(len(next_states2[..., i])), next_states2[..., i])
            plt.grid()

        plt.show(block=True)
Example No. 4

    # This excerpt begins partway through the script's argument-parser setup; the parser
    # object, the earlier arguments and the imports (argparse, os, functools.partial) are
    # not shown here.
    parser.add_argument(
        '-k', '--kwarg', nargs=2, action='append', default=[],
        help='additional arguments for the experiment [name value]')
    args = parser.parse_args()
    e_id = args.exp
    kwargs = dict(args.kwarg)
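    # kwargs now maps each "-k name value" pair from the command line to a name -> value
    # entry; it is forwarded to get_scenario below to override experiment settings.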

    # prepare experiment parameters
    scenario_params, pol, dyn, learner_setup = get_scenario(e_id, **kwargs)
    params, loss_kwargs, polopt_kwargs, extra_inps = scenario_params
    if args.horizon:
        params['min_steps'] = args.horizon
    # init cost model
    cost = partial(double_cartpole.double_cartpole_loss, **params['cost'])
    # init environment
    env = double_cartpole.DoubleCartpole(loss_func=cost, **params['plant'])

    # initialize output directory
    odir = args.output_folder
    if args.name is not None:
        name = args.name+'_'+str(e_id)
    else:
        name = env.name+'_'+str(e_id)

    output_folder = os.path.join(odir, name)

    try:
        os.makedirs(output_folder)
    except OSError:
        # move the old stuff
        target_dir = output_folder+'_'+str(os.stat(output_folder).st_ctime)