def test_pytorch_approximator():
    np.random.seed(88)
    torch.manual_seed(88)

    noise = 1e-3**2

    a = np.random.rand(1000, 4)
    k = np.random.rand(4, 2)
    b = np.sin(a).dot(k) + np.random.randn(1000, 2) * noise

    approximator = Regressor(PyTorchApproximator, input_shape=(4,),
                             output_shape=(2,), network=ExampleNet,
                             optimizer={'class': optim.Adam, 'params': {}},
                             loss=F.mse_loss, n_neurons=100, n_hidden=1,
                             n_epochs=200, batch_size=100, quiet=True)

    approximator.fit(a, b)

    bhat = approximator.predict(a)
    error = np.linalg.norm(b - bhat, 'fro') / 1000
    error_inf = np.max(np.abs(b - bhat))

    print(b[:10])
    print(bhat[:10])
    print(error_inf)

    assert error < 2e-4

    gradient = approximator.diff(a[0])
    assert gradient.shape[1] == 2

    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))

    bhat_random = approximator.predict(a)

    assert not np.array_equal(bhat, bhat_random)

def test_pytorch_approximator():
    np.random.seed(1)
    torch.manual_seed(1)

    n_actions = 2

    s = np.random.rand(1000, 4)
    a = np.random.randint(n_actions, size=(1000, 1))
    q = np.random.rand(1000)

    approximator = Regressor(TorchApproximator, input_shape=(4,),
                             output_shape=(2,), n_actions=n_actions,
                             network=ExampleNet,
                             optimizer={'class': optim.Adam, 'params': {}},
                             loss=F.mse_loss, batch_size=100, quiet=True)

    approximator.fit(s, a, q, n_epochs=20)

    x_s = np.random.rand(2, 4)
    x_a = np.random.randint(n_actions, size=(2, 1))
    y = approximator.predict(x_s, x_a)
    y_test = np.array([0.37191153, 0.5920861])

    assert np.allclose(y, y_test)

    y = approximator.predict(x_s)
    y_test = np.array([[0.47908658, 0.37191153],
                       [0.5920861, 0.27575058]])

    assert np.allclose(y, y_test)

    gradient = approximator.diff(x_s[0], x_a[0])
    gradient_test = np.array([0., 0., 0., 0., 0.02627479, 0.76513696,
                              0.6672573, 0.35979462, 0., 1.])

    assert np.allclose(gradient, gradient_test)

    gradient = approximator.diff(x_s[0])
    gradient_test = np.array([[0.02627479, 0.], [0.76513696, 0.],
                              [0.6672573, 0.], [0.35979462, 0.],
                              [0., 0.02627479], [0., 0.76513696],
                              [0., 0.6672573], [0., 0.35979462],
                              [1., 0.], [0., 1.]])

    assert np.allclose(gradient, gradient_test)

    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))

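# Both torch tests above assume an `ExampleNet` module defined elsewhere in
# their test files. The following is a minimal sketch of such a network,
# assuming (as mushroom's torch approximators do) that `input_shape`,
# `output_shape` and the extra Regressor kwargs (`n_neurons`, `n_hidden`) are
# forwarded to the network constructor. The exact regression values asserted
# above depend on the original definition, which this sketch does not claim
# to reproduce:
import torch.nn as nn


class ExampleNet(nn.Module):
    def __init__(self, input_shape, output_shape, n_neurons=100, n_hidden=1,
                 **kwargs):
        super().__init__()

        n_input = input_shape[-1]
        n_output = output_shape[0]

        # One input layer, n_hidden - 1 additional hidden layers, one output
        # layer.
        layers = [nn.Linear(n_input, n_neurons), nn.ReLU()]
        for _ in range(n_hidden - 1):
            layers += [nn.Linear(n_neurons, n_neurons), nn.ReLU()]
        layers.append(nn.Linear(n_neurons, n_output))

        self.model = nn.Sequential(*layers)

    def forward(self, x, **kwargs):
        return self.model(x.float())
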
def build_high_level_agent(alg, params, mdp, mu, sigma):
    features = Features(basis_list=[PolynomialBasis()])
    approximator = Regressor(LinearApproximator,
                             input_shape=(features.size,),
                             output_shape=(2,))
    approximator.set_weights(mu)

    pi1 = DiagonalGaussianPolicy(mu=approximator, std=sigma)

    lim = mdp.info.observation_space.high[0]
    mdp_info_agent = MDPInfo(observation_space=mdp.info.observation_space,
                             action_space=spaces.Box(0, lim, (2,)),
                             gamma=1.0, horizon=100)
    agent = alg(pi1, mdp_info_agent, features=features, **params)

    return agent

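# Hedged usage sketch for the builder above. The algorithm choice, import
# paths, learning rate and initial parameters are illustrative assumptions,
# not values from the original experiment:
def _make_high_level_agent(mdp):
    from mushroom.algorithms.policy_search import GPOMDP
    from mushroom.utils.parameters import AdaptiveParameter

    params = dict(learning_rate=AdaptiveParameter(value=10))
    mu = np.zeros(2)              # initial mean weights, one per action dim
    sigma = np.array([40., 40.])  # fixed exploration std per action dim

    return build_high_level_agent(GPOMDP, params, mdp, mu, sigma)
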
def build_high_level_agent(alg, params, mdp, mu, std):
    tilings = Tiles.generate(n_tilings=1, n_tiles=[10, 10],
                             low=mdp.info.observation_space.low[:2],
                             high=mdp.info.observation_space.high[:2])
    features = Features(tilings=tilings)

    input_shape = (features.size,)

    mu_approximator = Regressor(LinearApproximator, input_shape=input_shape,
                                output_shape=(1,))
    std_approximator = Regressor(LinearApproximator, input_shape=input_shape,
                                 output_shape=(1,))

    w_mu = mu * np.ones(mu_approximator.weights_size)
    mu_approximator.set_weights(w_mu)

    # The std weights must go on std_approximator; setting them on
    # mu_approximator would overwrite the mean and leave the std
    # uninitialized.
    w_std = std * np.ones(std_approximator.weights_size)
    std_approximator.set_weights(w_std)

    pi = StateLogStdGaussianPolicy(mu=mu_approximator,
                                   log_std=std_approximator)

    obs_low = np.array([mdp.info.observation_space.low[0],
                        mdp.info.observation_space.low[1]])
    obs_high = np.array([mdp.info.observation_space.high[0],
                         mdp.info.observation_space.high[1]])
    mdp_info_agent1 = MDPInfo(observation_space=spaces.Box(obs_low, obs_high,
                                                           shape=(2,)),
                              action_space=spaces.Box(
                                  mdp.info.observation_space.low[2],
                                  mdp.info.observation_space.high[2],
                                  shape=(1,)),
                              gamma=1, horizon=10)
    agent = alg(policy=pi, mdp_info=mdp_info_agent1, features=features,
                **params)

    return agent

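# What the tile-coding features above produce: with one tiling of 10x10 tiles
# over the first two state dimensions, phi(s) is a length-100 binary vector
# with a single active entry. A quick illustrative check (a sketch; it
# assumes mushroom's Tiles/Features semantics and uses a unit square in place
# of the MDP's observation bounds):
def _check_tile_features():
    tilings = Tiles.generate(n_tilings=1, n_tiles=[10, 10],
                             low=np.zeros(2), high=np.ones(2))
    phi = Features(tilings=tilings)

    assert phi.size == 100
    # Exactly one tile is active for any in-range state.
    assert np.isclose(phi(np.array([0.15, 0.85])).sum(), 1.)
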
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = alg(policy, mdp.info, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

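# Hedged driver sketch for the LQR experiment above. The algorithm list,
# import path and epoch counts are illustrative assumptions, not values from
# the original script:
if __name__ == '__main__':
    from mushroom.algorithms.policy_search import REINFORCE, GPOMDP

    for alg in (REINFORCE, GPOMDP):
        print('Algorithm:', alg.__name__)
        experiment(alg, n_epochs=10, n_iterations=4, ep_per_run=25)
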
def build_mid_level_agent(alg, params, mdp, mu, std):
    mu_approximator = Regressor(LinearApproximator, input_shape=(1,),
                                output_shape=(2,))
    w_mu = mu * np.ones(mu_approximator.weights_size)
    mu_approximator.set_weights(w_mu)

    pi = DiagonalGaussianPolicy(mu=mu_approximator, std=std * np.ones(2))

    lim = mdp.info.observation_space.high[0]
    basis = PolynomialBasis()
    features = BasisFeatures(basis=[basis])
    mdp_info_agent1 = MDPInfo(observation_space=spaces.Box(0, 1, (1,)),
                              action_space=spaces.Box(0, lim, (2,)),
                              gamma=1, horizon=10)
    agent = alg(policy=pi, mdp_info=mdp_info_agent1, features=features,
                **params)

    return agent

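# With a single degree-zero PolynomialBasis(), the feature vector is the
# constant 1, so the linear mean above reduces to the weight vector itself.
# A quick check of that reduction (a sketch; it assumes only the weight
# layout exercised by test_linear_approximator below):
def _check_constant_feature_mean():
    reg = Regressor(LinearApproximator, input_shape=(1,), output_shape=(2,))
    reg.set_weights(np.array([3., 7.]))

    # phi(s) = [1.]  ->  the predicted mean equals the weights.
    assert np.allclose(reg.predict(np.ones(1)), [3., 7.])
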
def server_experiment_small(alg_high, alg_low, params, subdir, i):
    # Note: n_runs, n_iterations, ep_per_run and eval_run are read from
    # module scope; they must be defined before this function is called.
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)',
                             phi=pos_ref_angle_difference)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    # Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([255, 255])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size,),
                              output_shape=(2,))
    approximator1.set_weights(np.array([500, 500]))

    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=sigma1)

    # Policy 2
    pi2 = DeterministicControlPolicy(weights=np.array([0]))
    mu2 = np.zeros(pi2.weights_size)
    sigma2 = 1e-3 * np.ones(pi2.weights_size)
    distribution2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 1
    learning_rate1 = params.get('learning_rate_high')
    lim = 1000
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, lim, (2,)),
                              gamma=mdp.info.gamma, horizon=100)
    agent1 = alg_high(policy=pi1, mdp_info=mdp_info_agent1,
                      learning_rate=learning_rate1, features=features)

    # Agent 2
    learning_rate2 = params.get('learning_rate_low')
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(-np.pi, np.pi,
                                                           (1,)),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma, horizon=100)
    agent2 = alg_low(distribution=distribution2, policy=pi2,
                     mdp_info=mdp_info_agent2, learning_rate=learning_rate2)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1', agent=agent1,
                                  n_eps_per_fit=ep_per_run,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(distribution2)
    control_block2 = ControlBlock(name='Control Block 2', agent=agent2,
                                  n_eps_per_fit=10,
                                  callbacks=[parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block1,
              control_block2, function_block1, function_block2, reward_acc]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=eval_run)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        low_level_dataset_eval += control_block2.dataset.get()

    # Save
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset_file',
            low_level_dataset_eval)
    np.save(subdir + str(i) + '/parameter_dataset1_file', parameter_dataset1)
    np.save(subdir + str(i) + '/parameter_dataset2_file', parameter_dataset2)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)

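# Hedged driver sketch for server_experiment_small. The experiment globals it
# reads are defined here at module scope; the algorithm choices (a policy
# gradient method for the high level, a black-box method taking a
# distribution for the low level), import paths, learning rates and counts
# are all illustrative assumptions:
if __name__ == '__main__':
    from mushroom.algorithms.policy_search import GPOMDP, PGPE
    from mushroom.utils.parameters import AdaptiveParameter

    n_runs = 25
    n_iterations = 10
    ep_per_run = 20
    eval_run = 50

    params = {'learning_rate_high': AdaptiveParameter(value=50),
              'learning_rate_low': AdaptiveParameter(value=5e-4)}

    server_experiment_small(GPOMDP, PGPE, params, subdir='latest/', i=0)
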
def experiment():
    np.random.seed()

    # Model Block
    mdp = ShipSteeringMultiGate()

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)', phi=phi)

    # Function Block 2
    function_block2 = squarednormBlock(name='f2 (squared norm)')

    # Function Block 3
    function_block3 = addBlock(name='f3 (summation)')

    # Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([38, 38])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size,),
                              output_shape=(2,))
    approximator1.set_weights(np.array([75, 75]))

    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=sigma1)

    # Policy 2
    sigma2 = Parameter(value=.01)
    approximator2 = Regressor(LinearApproximator, input_shape=(1,),
                              output_shape=mdp.info.action_space.shape)
    pi2 = GaussianPolicy(mu=approximator2, sigma=sigma2)

    # Agent 1
    learning_rate = AdaptiveParameter(value=10)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, 150, (2,)),
                              gamma=mdp.info.gamma, horizon=50)
    agent1 = GPOMDP(policy=pi1, mdp_info=mdp_info_agent1,
                    params=agent_params, features=features)

    # Agent 2
    learning_rate = AdaptiveParameter(value=.001)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(-np.pi, np.pi,
                                                           (1,)),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma, horizon=100)
    agent2 = GPOMDP(policy=pi2, mdp_info=mdp_info_agent2,
                    params=agent_params, features=None)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1', agent=agent1,
                                  n_eps_per_fit=5,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    dataset_callback = CollectDataset()
    parameter_callback2 = CollectPolicyParameter(pi2)
    control_block2 = ControlBlock(name='Control Block 2', agent=agent2,
                                  n_eps_per_fit=10,
                                  callbacks=[dataset_callback,
                                             parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [state_ph, reward_ph, control_block1, control_block2,
              function_block1, function_block2, function_block3, reward_acc]
    # order = [0, 1, 7, 2, 4, 5, 6, 3]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    function_block3.add_input(function_block2)
    function_block3.add_input(reward_ph)
    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block3)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    # dataset_learn_visual = core.learn(n_episodes=2000)
    dataset_learn_visual = list()
    for n in range(4):
        dataset_learn = core.learn(n_episodes=500)
        last_ep_dataset = pick_last_ep(dataset_learn)
        dataset_learn_visual += last_ep_dataset
        del dataset_learn

    # Evaluate
    dataset_eval = core.evaluate(n_episodes=10)

    # Visualize
    low_level_dataset = dataset_callback.get()
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()

    visualize_policy_params(parameter_dataset1, parameter_dataset2)
    visualize_control_block(low_level_dataset, ep_count=20)
    visualize_ship_steering(dataset_learn_visual, name='learn', n_gates=4)
    visualize_ship_steering(dataset_eval, 'evaluate', n_gates=4)
    plt.show()

    return

def test_linear_approximator():
    np.random.seed(1)

    # Generic regressor
    a = np.random.rand(1000, 3)
    k = np.random.rand(3, 2)
    b = a.dot(k) + np.random.randn(1000, 2)

    approximator = Regressor(LinearApproximator, input_shape=(3,),
                             output_shape=(2,))
    approximator.fit(a, b)

    x = np.random.rand(2, 3)
    y = approximator.predict(x)
    y_test = np.array([[0.57638247, 0.1573216],
                       [0.11388247, 0.24123678]])

    assert np.allclose(y, y_test)

    point = np.random.randn(3,)
    derivative = approximator.diff(point)

    lp = len(point)
    for i in range(derivative.shape[1]):
        assert (derivative[i * lp:(i + 1) * lp, i] == point).all()

    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))

    # Action regressor + Ensemble
    n_actions = 2
    s = np.random.rand(1000, 3)
    a = np.random.randint(n_actions, size=(1000, 1))
    q = np.random.rand(1000)

    approximator = Regressor(LinearApproximator, input_shape=(3,),
                             n_actions=n_actions, n_models=5)
    approximator.fit(s, a, q)

    x_s = np.random.rand(2, 3)
    x_a = np.random.randint(n_actions, size=(2, 1))
    y = approximator.predict(x_s, x_a, prediction='mean')
    y_test = np.array([0.49225698, 0.69660881])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s, x_a, prediction='sum')
    y_test = np.array([2.46128492, 3.48304404])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s, x_a, prediction='min')
    y_test = np.array([[0.49225698, 0.69660881]])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s)
    y_test = np.array([[0.49225698, 0.44154141],
                       [0.69660881, 0.69060195]])
    assert np.allclose(y, y_test)

    approximator = Regressor(LinearApproximator, input_shape=(3,),
                             n_actions=n_actions)
    approximator.fit(s, a, q)

    gradient = approximator.diff(x_s[0], x_a[0])
    gradient_test = np.array([0.88471362, 0.11666548, 0.45466254, 0., 0.,
                              0.])
    assert np.allclose(gradient, gradient_test)

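# The final gradient check above reflects the action-regressor weight layout:
# each discrete action gets its own linear model, and diff places the state
# vector in the block belonging to the selected action, leaving the other
# blocks zero. A self-contained re-statement of that layout (a sketch; it
# assumes only the block structure exercised by the test):
def _action_regressor_gradient_layout():
    n_actions, n_features = 2, 3
    s = np.random.rand(100, n_features)
    a = np.random.randint(n_actions, size=(100, 1))
    q = np.random.rand(100)

    approx = Regressor(LinearApproximator, input_shape=(n_features,),
                       n_actions=n_actions)
    approx.fit(s, a, q)

    state = np.random.rand(n_features)
    action = np.array([1])
    grad = approx.diff(state, action)

    # For a linear model, the gradient w.r.t. the weights is the input
    # itself, placed in the selected action's block.
    expected = np.zeros(n_actions * n_features)
    expected[action[0] * n_features:(action[0] + 1) * n_features] = state
    assert np.allclose(grad, expected)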