def test_smoke():
    dmp_seq, _ = create_dmp_seq(n_task_dims=1)
    controller = Controller({
        "Controller": {"record_inputs": True},
        "Environment": {
            "type": "bolero.environment.OptimumTrajectory",
            "x0": np.zeros(1), "g": np.ones(1),
            "execution_time": 1.0, "dt": 0.01}})
    controller.episode_with(dmp_seq)

    params = np.random.randn(dmp_seq.get_n_params())
    dmp_seq.set_params(params)
    assert_equal(dmp_seq.get_n_params(), dmp_seq.get_params().size)
    assert_equal(dmp_seq.n_weights, 15)
    assert_equal(dmp_seq.get_n_params(), 21)
    params_copy = dmp_seq.get_params()
    assert_array_almost_equal(params, params_copy)

    new_subgoal = [0.5]
    dmp_seq.set_subgoal(2, new_subgoal)
    subgoal = dmp_seq.get_subgoal(2)
    assert_array_almost_equal(new_subgoal, subgoal)

    new_subgoal_velocity = [0.0]
    dmp_seq.set_subgoal_velocity(2, new_subgoal_velocity)
    subgoal_velocity = dmp_seq.get_subgoal_velocity(2)
    assert_array_almost_equal(new_subgoal_velocity, subgoal_velocity)

    # 1 second of execution time at dt = 0.01 yields 101 recorded
    # inputs, including the step at t = 0
    X = np.array(controller.inputs_[0])
    assert_equal(len(X), 101)
    assert_almost_equal(X[0, 0], 0.0)
    assert_almost_equal(X[20, 0], 0.5, places=2)
    assert_almost_equal(X[50, 0], 1.0, places=2)
    assert_almost_equal(X[100, 0], 2.0, places=2)
def test_missing_behavior_search():
    ctrl = Controller(environment=ObjectiveFunction())
    beh = DummyBehavior(initial_params=np.array([0.0, 0.0]))
    beh.init(0, 2)
    feedback = ctrl.episode_with(beh)
    assert_equal(len(feedback), 1)
    assert_less(feedback[0], ctrl.environment.get_maximum_feedback())
def learn(setup_fun, variance):
    # NOTE: setup_fun is currently unused; the joint-space DMP setup is
    # hard-coded and the Cartesian alternatives are kept as comments.
    #ik, beh, mp_keys, mp_values = cfg.make_approx_cart_dmp(
    #    cfg.x0, cfg.g, cfg.execution_time, cfg.dt)
    #ik, beh, mp_keys, mp_values = cfg.make_exact_cart_dmp(
    #    cfg.x0, cfg.g, cfg.execution_time, cfg.dt)
    ik, beh, mp_keys, mp_values = cfg.make_joint_dmp(
        cfg.x0, cfg.g, cfg.execution_time, cfg.dt)
    env = Pendulum(x0=cfg.x0, g=cfg.g, execution_time=cfg.execution_time,
                   dt=cfg.dt)
    opt = CMAESOptimizer(variance=variance, random_state=0)
    bs = BlackBoxSearch(beh, opt)
    controller = Controller(environment=env, behavior_search=bs,
                            n_episodes=n_episodes, verbose=2)
    rewards = controller.learn(mp_keys, mp_values)

    best = bs.get_best_behavior()
    best_params = best.get_params()
    np.save("best_params_pendulum_joint.npy", best_params)
    reward = controller.episode_with(best)

    ax = env.plot()
    plt.show()
def learn(name, setup_fun, run):
    ik, beh, mp_keys, mp_values = setup_fun(
        cfg.x0, cfg.g, cfg.execution_time, cfg.dt)
    env = ViaPointEnvironment(
        ik, cfg.x0, cfg.via_points, cfg.execution_time, cfg.dt,
        cfg.qlo, cfg.qhi, penalty_vel=cfg.penalty_vel,
        penalty_acc=cfg.penalty_acc,
        penalty_via_point=cfg.penalty_via_point)
    opt = CMAESOptimizer(variance=cfg.variance[name], random_state=run)
    bs = BlackBoxSearch(beh, opt)
    controller = Controller(environment=env, behavior_search=bs,
                            n_episodes=n_episodes, verbose=0)
    rewards = controller.learn(mp_keys, mp_values)
    best = bs.get_best_behavior()
    reward = controller.episode_with(best)
    return name, rewards, reward.sum()
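
# A minimal driver sketch for this learn() variant (an assumption, not part
# of the original script): run several CMA-ES seeds per setup in parallel
# with joblib. The setup dict and the number of runs are illustrative;
# cfg.make_joint_dmp is reused from the pendulum script above.
from joblib import Parallel, delayed

setups = {"joint": cfg.make_joint_dmp}  # illustrative; add more setups here
results = Parallel(n_jobs=-1)(
    delayed(learn)(name, setup_fun, run)
    for name, setup_fun in setups.items()
    for run in range(10))
for name, rewards, final_return in results:
    print("%s: return of evaluation episode with best behavior = %.2f"
          % (name, final_return))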
def test_mc_rl():
    env = OpenAiGym("FrozenLake-v0", render=False, seed=1)
    try:
        env.init()
    except ImportError:
        raise SkipTest("gym is not installed")
    bs = MonteCarloRL(env.get_discrete_action_space(), random_state=1)
    ctrl = Controller(environment=env, behavior_search=bs,
                      n_episodes=10000, finish_after_convergence=True)
    returns = ctrl.learn()
    # convergence should kick in long before the episode budget is used up
    assert_less(len(returns), 1000)
    beh = bs.get_best_behavior()
    rewards = ctrl.episode_with(beh)
    # in FrozenLake, a return of 1.0 means the goal was reached
    assert_equal(sum(rewards), 1.0)
def learn(name, run, setup_fun, variance):
    ik, beh, mp_keys, mp_values = setup_fun(
        cfg.x0, cfg.g, cfg.execution_time, cfg.dt)
    env = Pendulum(x0=cfg.x0, g=cfg.g, execution_time=cfg.execution_time,
                   dt=cfg.dt)
    opt = CMAESOptimizer(variance=variance, random_state=run)
    bs = BlackBoxSearch(beh, opt)
    controller = Controller(environment=env, behavior_search=bs,
                            n_episodes=n_episodes, verbose=2)
    rewards = controller.learn(mp_keys, mp_values)
    best = bs.get_best_behavior()
    reward = controller.episode_with(best)
    return name, rewards, reward.sum()
    log_to_stdout=True)

if os.path.exists("initial_params.txt"):
    initial_params = np.loadtxt("initial_params.txt")
else:
    initial_params = None
opt = CMAESOptimizer(initial_params=initial_params,
                     variance=cfg.variance["approxik"], random_state=0)
bs = BlackBoxSearch(beh, opt)
controller = Controller(environment=env, behavior_search=bs,
                        n_episodes=1000, verbose=2)
rewards = controller.learn(mp_keys, mp_values)

best = bs.get_best_behavior()
best_params = best.get_params()
np.save("best_params_viapoint_joint.npy", best_params)
reward = controller.episode_with(best)
print(reward.sum())

plt.plot(rewards)
ax = env.plot()
ax.view_init(azim=-110, elev=30)
ax.set_xticks((-0.3, 0.0, 0.3))
ax.set_yticks((0.0, -0.3, -0.6))
ax.set_zticks((0.3, 0.6, 0.9))
plt.savefig("viapoints.pdf")
plt.show()
to solve the problem and policy search algorithms usually work very well
in this domain.
"""
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from bolero.environment import OpenAiGym
from bolero.behavior_search import BlackBoxSearch
from bolero.optimizer import CMAESOptimizer
from bolero.representation import LinearBehavior
from bolero.controller import Controller

beh = LinearBehavior()
env = OpenAiGym("CartPole-v0", render=False, seed=0)
opt = CMAESOptimizer(variance=10.0 ** 2, random_state=0)
bs = BlackBoxSearch(beh, opt)
controller = Controller(environment=env, behavior_search=bs,
                        n_episodes=300)
rewards = controller.learn()
controller.episode_with(bs.get_best_behavior())

plt.figure()
ax = plt.subplot(111)
ax.set_title("Optimization progress")
ax.plot(rewards)
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")
ax.set_ylim(-10, 210)
plt.show()
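
# Optional (a sketch, not in the original example): persist the best
# parameters with the same np.save/get_params pattern used by the pendulum
# and via-point scripts, so the policy can be restored later with
# LinearBehavior().set_params(np.load(...)). The filename is illustrative.
np.save("best_params_cartpole.npy", bs.get_best_behavior().get_params())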
    penalty_start_dist=10000.0, penalty_obstacle=1000.0,
    penalty_length=10.0, hide_acc_from_interface=True, use_covar=True)
opt = CMAESOptimizer(variance=0.1 ** 2, random_state=0,
                     initial_params=beh.get_params())
bs = BlackBoxSearch(beh, opt)
controller = Controller(environment=env, behavior_search=bs,
                        n_episodes=n_episodes, record_inputs=True)
rewards = controller.learn(["x0", "g"], [x0, g])
controller.episode_with(bs.get_best_behavior(), ["x0", "g"], [x0, g])
X = np.asarray(controller.inputs_[-1])
X_hist = np.asarray(controller.inputs_)

plt.figure(figsize=(8, 5))
ax = plt.subplot(121)
ax.set_title("Optimization progress")
ax.plot(rewards)
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")

ax = plt.subplot(122, aspect="equal")
ax.set_title("Learned trajectory")
plot_covariance(ax, X[:, :2], np.array(X[:, 4:]).reshape(-1, 4, 4))
env.plot(ax)
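
# plot_covariance is assumed to be defined earlier in this script. Below is
# a minimal sketch of such a helper (name, signature, subsampling, and
# styling are assumptions): it draws one 1-sigma ellipse per subsampled step
# from the 2D position marginal of each covariance matrix.
from matplotlib.patches import Ellipse

def plot_covariance(ax, means, covars, step=10):
    """Draw a 1-sigma ellipse for the 2D marginal of every covariance."""
    for mean, covar in zip(means[::step], covars[::step]):
        vals, vecs = np.linalg.eigh(covar[:2, :2])
        angle = np.degrees(np.arctan2(vecs[1, 0], vecs[0, 0]))
        width, height = 2.0 * np.sqrt(np.maximum(vals, 0.0))
        ax.add_patch(Ellipse(xy=mean, width=width, height=height,
                             angle=angle, alpha=0.2, color="g"))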
best_params = np.load("best_params_viapoint_%s.npy" % name)
beh.set_params(best_params)
controller = Controller(environment=env, verbose=2)

initial_param_value = best_params[modified_parameter]
offset = np.linspace(-param_range / 2.0, param_range / 2.0, n_steps)
returns = np.empty(n_steps)
for i in range(n_steps):
    params = np.copy(best_params)
    params[modified_parameter] = initial_param_value + offset[i]
    beh.set_params(params)
    beh.reset()
    rewards = controller.episode_with(beh, mp_keys, mp_values)
    returns[i] = np.sum(rewards)

plt.plot(offset, returns, label=labels[name], ls=linestyles[name],
         lw=2, c="k")
plt.xlabel("Weight offset")
plt.ylabel("Reward")
plt.xticks(np.linspace(-param_range / 2.0, param_range / 2.0, 5))
plt.yticks(np.linspace(-125, 0, 6))
plt.ylim((-125, 0))
plt.legend(loc="best")
for n_task_dims in test_task_dims:
    dmp_seq, subgoals = create_dmp_seq(n_task_dims=n_task_dims)
    dmp_seq.set_params(dmp_seq.get_params())
    for i in range(len(subgoals)):
        assert_equal(len(dmp_seq.get_subgoal_velocity(i)), n_task_dims)


if __name__ == "__main__":
    import matplotlib.pyplot as plt

    dmp_seq, _ = create_dmp_seq(2)
    controller = Controller({
        "Controller": {"record_outputs": True},
        "Environment": {
            "type": "bolero.environment.OptimumTrajectory",
            "x0": np.zeros(2), "g": np.ones(2),
            "execution_time": 1.0, "dt": 0.01}},
        record_inputs=True)
    controller.episode_with(dmp_seq)
    X = np.array(controller.outputs_)[0]
    plt.figure()
    plt.plot(X[:, 0], X[:, 1])  # 2D trajectory (position components)
    plt.figure()
    plt.plot(X[:, 2])  # presumably the first velocity component
    plt.figure()
    plt.plot(X[:, 3])  # presumably the second velocity component
    plt.show()