def test_PGPE():
    distribution = learn(PGPE, learning_rate=AdaptiveParameter(1.5))
    w = distribution.get_parameters()
    w_test = np.array([0.02489092, 0.31062211, 0.2051433, 0.05959651,
                       -0.78302236, 0.77381954, 0.23676176, -0.29855654])

    assert np.allclose(w, w_test)
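# The test above relies on a `learn` helper that is not part of this snippet.
# A hypothetical sketch of such a fixture for the black-box case is given
# below; it is pieced together from the PGPE and LQR constructions shown in
# the other snippets of this section, and the seed, sizes, and episode counts
# are illustrative assumptions only.
def learn(alg, **algorithm_params):
    np.random.seed(1)

    mdp = LQR.generate(dimensions=1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)
    policy = MultivariateGaussianPolicy(mu=approximator, sigma=.1 * np.eye(1))

    # The distribution over policy weights is what the test inspects.
    mu = np.zeros(policy.weights_size)
    sigma = 1e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(distribution, policy, mdp.info, **algorithm_params)

    core = Core(agent, mdp)
    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return distribution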
def experiment(alg, n_runs, n_iterations, ep_per_run, use_tensorflow):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    if use_tensorflow:
        tensor_list = gaussian_tensor.generate(
            [3, 3, 6, 2],
            [[0., 150.], [0., 150.], [-np.pi, np.pi],
             [-np.pi / 12, np.pi / 12]])

        phi = Features(tensor_list=tensor_list, name='phi',
                       input_dim=mdp.info.observation_space.shape[0])
    else:
        basis = GaussianRBF.generate(
            [3, 3, 6, 2],
            [[0., 150.], [0., 150.], [-np.pi, np.pi],
             [-np.pi / 12, np.pi / 12]])

        phi = Features(basis_list=basis)

    input_shape = (phi.size,)

    approximator_params = dict(input_dim=phi.size)
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = np.array([[.05]])
    policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(policy, mdp.info, agent_params, phi)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_runs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    np.save('ship_steering.npy', dataset_eval)
def test_PGPE():
    distribution = GaussianDiagonalDistribution(mu, sigma)
    agent = PGPE(distribution, policy, mdp.info,
                 learning_rate=AdaptiveParameter(1.5), features=phi)

    agent.episode_start()
    agent.fit(dataset)

    w_1 = .54454343
    w_2 = .5449792

    w = agent.policy.get_weights()

    assert np.allclose(w_1, w[10])
    assert np.allclose(w_2, w[18])
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = alg(policy, mdp.info, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
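# For completeness, a minimal way to invoke the experiment above; the
# algorithm list matches the snippets in this section, but the epoch and
# episode counts are illustrative assumptions, not values from the original
# script.
if __name__ == '__main__':
    algs = [REINFORCE, GPOMDP, eNAC]

    for alg in algs:
        print(alg.__name__)
        experiment(alg, n_epochs=4, n_iterations=10, ep_per_run=25)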
def experiment(alg, n_runs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = .1 * np.eye(1)
    policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(policy, mdp.info, agent_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_runs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    np.save('ship_steering.npy', dataset_eval)
    ep_per_run_hier = ep_per_epoch // n_iterations_hier

    print('High: ', alg_h.__name__, ' Low: ', alg_l.__name__)
    res = Parallel(n_jobs=n_jobs)(delayed(hierarchical_experiment)
                                  (mdp, agent_l, agent_h,
                                   n_epochs, n_iterations_hier,
                                   ep_per_run_hier, ep_per_eval,
                                   ep_per_run_low)
                                  for _ in range(how_many))

    J, L = parse_joblib(res)
    np.save(subdir + '/J_H_' + alg_h.__name__ + '_' + alg_l.__name__, J)
    np.save(subdir + '/L_H_' + alg_h.__name__ + '_' + alg_l.__name__, L)'''

# GHAVAMZADEH
params_high = {'learning_rate': Parameter(value=8e-2), 'lambda_coeff': 0.9}
agent_high = build_high_level_ghavamzadeh(QLambdaDiscrete, params_high, mdp)

params_low = {'learning_rate': AdaptiveParameter(value=1e-2)}
agent_cross = build_low_level_ghavamzadeh(GPOMDP, params_low, mdp)
agent_plus = build_low_level_ghavamzadeh(GPOMDP, params_low, mdp)

print('ghavamzadeh')
res = Parallel(n_jobs=n_jobs)(delayed(ghavamzadeh_experiment)(
    mdp, agent_plus, agent_cross, agent_high, n_epochs, ep_per_epoch,
    ep_per_eval, ep_per_run_low_ghavamzadeh) for _ in range(how_many))

J, L = parse_joblib(res)
np.save(subdir + '/J_ghavamzadeh', J)
np.save(subdir + '/L_ghavamzadeh', L)
                   name='phi', input_dim=mdp.info.observation_space.shape[0])

    input_shape = (phi.size,)

    approximator_params = dict(input_dim=phi.size)
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = np.eye(2) * 1e-1
    policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=5)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = REINFORCE(policy, mdp.info, agent_params, phi)

    # Train
    core = Core(agent, mdp)
    print('Initial evaluation')
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_runs):
        print('iteration', i)
        print('learn')
    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset_file',
            low_level_dataset_eval)
    np.save(subdir + str(i) + '/parameter_dataset1_file', parameter_dataset1)
    np.save(subdir + str(i) + '/parameter_dataset2_file', parameter_dataset2)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)


if __name__ == '__main__':
    subdir = datetime.datetime.now().strftime(
        '%Y-%m-%d_%H-%M-%S') + '_big_hierarchical/'
    alg_high = GPOMDP
    alg_low = PGPE
    learning_rate_high = AdaptiveParameter(value=50)
    learning_rate_low = AdaptiveParameter(value=5e-4)
    n_jobs = -1
    how_many = 1
    n_runs = 25
    n_iterations = 20
    ep_per_run = 40
    eval_run = 50

    mk_dir_recursive('./' + subdir)
    force_symlink('./' + subdir, 'latest')

    params = {'learning_rate_high': learning_rate_high,
              'learning_rate_low': learning_rate_low}
    np.save(subdir + '/algorithm_params_dictionary', params)
        print('ITERATION', n)

        if n == 2:
            control_block1.unset_mask()

        core.learn(n_episodes=n_iterations * n_ep_per_fit, skip=True)

        dataset_eval_run = core.evaluate(n_episodes=eval_run, render=False)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        print('dist H:', dist1.get_parameters())
        print('dist L mu:', dist2.get_parameters()[:3])
        print('dist L sigma:', dist2.get_parameters()[3:])


if __name__ == '__main__':
    learning_rate_high = Parameter(value=1e-5)
    learning_rate_low = AdaptiveParameter(value=1e-1)
    eps_high = 0.05
    eps_low = 0.05
    beta_high = 0.01
    beta_low = 2e-3

    algs_params = [
        #(REPS, REPS, {'eps': eps_high}, {'eps': eps_low}),
        (RWR, RWR, {'beta': beta_high}, {'beta': beta_low}),
        #(PGPE, PGPE, {'learning_rate': learning_rate_high},
        # {'learning_rate': learning_rate_low}),
        #(PGPE, RWR, {'learning_rate': learning_rate_high},
        # {'beta': beta_low})
    ]

    n_jobs = 1
def experiment():
    np.random.seed()

    # Model Block
    mdp = ShipSteeringMultiGate()

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)', phi=phi)

    # Function Block 2
    function_block2 = squarednormBlock(name='f2 (squared norm)')

    # Function Block 3
    function_block3 = addBlock(name='f3 (summation)')

    # Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([38, 38])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size, ),
                              output_shape=(2, ))
    approximator1.set_weights(np.array([75, 75]))

    pi1 = DiagonalGaussianPolicy(mu=approximator1, sigma=sigma1)

    # Policy 2
    sigma2 = Parameter(value=.01)
    approximator2 = Regressor(LinearApproximator, input_shape=(1, ),
                              output_shape=mdp.info.action_space.shape)
    pi2 = GaussianPolicy(mu=approximator2, sigma=sigma2)

    # Agent 1
    learning_rate = AdaptiveParameter(value=10)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, 150, (2, )),
                              gamma=mdp.info.gamma, horizon=50)
    agent1 = GPOMDP(policy=pi1, mdp_info=mdp_info_agent1, params=agent_params,
                    features=features)

    # Agent 2
    learning_rate = AdaptiveParameter(value=.001)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(-np.pi, np.pi,
                                                           (1, )),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma, horizon=100)
    agent2 = GPOMDP(policy=pi2, mdp_info=mdp_info_agent2, params=agent_params,
                    features=None)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1', agent=agent1,
                                  n_eps_per_fit=5,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    dataset_callback = CollectDataset()
    parameter_callback2 = CollectPolicyParameter(pi2)
    control_block2 = ControlBlock(name='Control Block 2', agent=agent2,
                                  n_eps_per_fit=10,
                                  callbacks=[dataset_callback,
                                             parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [state_ph, reward_ph, control_block1, control_block2,
              function_block1, function_block2, function_block3, reward_acc]
    #order = [0, 1, 7, 2, 4, 5, 6, 3]
    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    function_block3.add_input(function_block2)
    function_block3.add_input(reward_ph)
    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block3)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    #dataset_learn_visual = core.learn(n_episodes=2000)
    dataset_learn_visual = list()
    for n in range(4):
        dataset_learn = core.learn(n_episodes=500)
        last_ep_dataset = pick_last_ep(dataset_learn)
        dataset_learn_visual += last_ep_dataset
        del dataset_learn

    # Evaluate
    dataset_eval = core.evaluate(n_episodes=10)

    # Visualize
    low_level_dataset = dataset_callback.get()
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()

    visualize_policy_params(parameter_dataset1, parameter_dataset2)
    visualize_control_block(low_level_dataset, ep_count=20)
    visualize_ship_steering(dataset_learn_visual, name='learn', n_gates=4)
    visualize_ship_steering(dataset_eval, 'evaluate', n_gates=4)

    plt.show()

    return
if __name__ == '__main__':
    how_many = 100
    n_jobs = -1

    n_runs = 25
    n_iterations = 10
    ep_per_run = 20

    algs_and_params = [
        (REPS, {'eps': 1.0}),
        (RWR, {'beta': 0.7}),
        (PGPE, {'learning_rate': AdaptiveParameter(value=1.5)}),
    ]

    base_dir = datetime.datetime.now().strftime(
        '%Y-%m-%d_%H-%M-%S') + '_small_bbo/'
    mk_dir_recursive('./' + base_dir)
    force_symlink(base_dir, './latest')

    for alg, params in algs_and_params:
        subdir = base_dir + alg.__name__ + '/'
        mk_dir_recursive('./' + subdir)
        np.save(subdir + '/algorithm_params_dictionary', params)

        experiment_params = {'how_many': how_many,
def test_GPOMDP():
    params = dict(learning_rate=AdaptiveParameter(value=.01))
    policy = learn(GPOMDP, params)
    w = np.array([-0.07623939, 2.05232858])

    assert np.allclose(w, policy.get_weights())
    print(alg.__name__)
    res = Parallel(n_jobs=n_jobs)(delayed(flat_experiment)(
        mdp, agent, n_epochs, n_iterations_bbo, ep_per_run_bbo, ep_per_eval)
        for _ in range(how_many))

    J, L = parse_joblib(res)
    np.save(subdir + '/J_' + alg.__name__, J)
    np.save(subdir + '/L_' + alg.__name__, L)'''

# HIERARCHICAL
algs_and_params_hier = [
    (GPOMDP, {'learning_rate': AdaptiveParameter(value=10)},
     PGPE, {'learning_rate': AdaptiveParameter(value=5e-4)})
]

mu = np.array([75, 75])
sigma = np.array([40, 40])

for alg_h, params_h, alg_l, params_l in algs_and_params_hier:
    agent_h = build_high_level_agent(alg_h, params_h, mdp, mu, sigma)
    agent_l = build_low_level_agent(alg_l, params_l, mdp)

    ep_per_run_hier = ep_per_epoch // n_iterations_hier

    print('High: ', alg_h.__name__, ' Low: ', alg_l.__name__)
    res = Parallel(n_jobs=n_jobs)(delayed(hierarchical_experiment)
                                  (mdp, agent_l, agent_h,
def test_REINFORCE():
    params = dict(learning_rate=AdaptiveParameter(value=.01))
    policy = learn(REINFORCE, params)
    w = np.array([-0.0084793, 2.00536528])

    assert np.allclose(w, policy.get_weights())
# MDP
mdp = ShipSteeringMultiGate(n_steps_action=3, small=True)

# directory
name = 'multigate_ship_steering'
subdir = name + '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')\
    + '/'
mk_dir_recursive('./' + subdir)
force_symlink('./' + subdir, name + '_latest')

# Hierarchical
algs_and_params_hier = [
    (GPOMDP, {'learning_rate': Parameter(value=1e-5)},
     PGPE, {'learning_rate': AdaptiveParameter(value=5e-4)})
]

for alg_h, params_h, alg_l, params_l in algs_and_params_hier:
    mu = 0
    sigma = 0.15
    agent_h = build_high_level_agent(alg_h, params_h, mdp, mu, sigma)
    agent_l = build_low_level_agent(alg_l, params_l, mdp)

    ep_per_run_hier = ep_per_epoch_train // n_iterations

    print('High: ', alg_h.__name__, ' Low: ', alg_l.__name__)
mdp = ShipSteeringMultiGate(n_steps_action=3, small=True)

# directory
name = 'multigate_ship_steering'
subdir = name + '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')\
    + '/'
mk_dir_recursive('./' + subdir)
force_symlink('./' + subdir, name + '_latest')

# Hierarchical
algs_and_params_hier = [
    (QLearning, {'learning_rate': Parameter(value=0.6)},
     GPOMDP, {'learning_rate': AdaptiveParameter(value=25) if mdp.small
              else AdaptiveParameter(value=50)},
     PGPE, {'learning_rate': AdaptiveParameter(value=5e-4)})
]

for alg_h, params_h, alg_m, params_m, alg_l, params_l in algs_and_params_hier:
    epsilon = Parameter(value=1)
    agent_h = build_high_level_agent(alg_h, params_h, mdp, epsilon)

    mu = 250 if mdp.small else 500
    sigma = 125 if mdp.small else 250
    agent_m1 = build_mid_level_agent(alg_m, params_m, mdp, mu, sigma)
    agent_m2 = build_mid_level_agent(alg_m, params_m, mdp, mu, sigma)
    agent_m3 = build_mid_level_agent(alg_m, params_m, mdp, mu, sigma)
def test_eNAC():
    params = dict(learning_rate=AdaptiveParameter(value=.01))
    policy = learn(eNAC, params)
    w = np.array([-0.03668018, 2.05112355])

    assert np.allclose(w, policy.get_weights())
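# The policy-gradient tests above (REINFORCE, GPOMDP, eNAC) share a `learn`
# helper that is not included in the snippets. A plausible sketch of it,
# assembled from the LQR experiment earlier in this section, is given below;
# the seed and training lengths are illustrative assumptions, and the real
# test fixture may differ.
def learn(alg, params):
    np.random.seed(1)

    mdp = LQR.generate(dimensions=1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)
    sigma.set_weights(2 * np.ones(sigma.weights_size))

    policy = StateStdGaussianPolicy(approximator, sigma)

    agent = alg(policy, mdp.info, **params)

    core = Core(agent, mdp)
    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return policy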