def server_experiment_small(alg_high, alg_low, params, subdir, i):
    # NB: despite the name, this runs the big ship steering map (small=False).
    # Reseed from OS entropy so that parallel experiment instances differ
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)',
                             phi=pos_ref_angle_difference)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    # Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([255, 255])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size,),
                              output_shape=(2,))
    approximator1.set_weights(np.array([500, 500]))
    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=sigma1)

    # Policy 2
    pi2 = DeterministicControlPolicy(weights=np.array([0]))
    mu2 = np.zeros(pi2.weights_size)
    sigma2 = 1e-3 * np.ones(pi2.weights_size)
    distribution2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 1
    learning_rate1 = params.get('learning_rate_high')
    lim = 1000
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, lim, (2,)),
                              gamma=mdp.info.gamma, horizon=100)
    agent1 = alg_high(policy=pi1, mdp_info=mdp_info_agent1,
                      learning_rate=learning_rate1, features=features)

    # Agent 2
    learning_rate2 = params.get('learning_rate_low')
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(-np.pi, np.pi, (1,)),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma, horizon=100)
    agent2 = alg_low(distribution=distribution2, policy=pi2,
                     mdp_info=mdp_info_agent2, learning_rate=learning_rate2)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1', agent=agent1,
                                  n_eps_per_fit=ep_per_run,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(distribution2)
    control_block2 = ControlBlock(name='Control Block 2', agent=agent2,
                                  n_eps_per_fit=10,
                                  callbacks=[parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block1,
              control_block2, function_block1, function_block2, reward_acc]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train (ep_per_run, eval_run, n_runs and n_iterations are module-level
    # experiment settings)
    low_level_dataset_eval = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=eval_run)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start: ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        low_level_dataset_eval += control_block2.dataset.get()

    # Save
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()

    mk_dir_recursive('./' + subdir + str(i))
    np.save(subdir + str(i) + '/low_level_dataset_file',
            low_level_dataset_eval)
    np.save(subdir + str(i) + '/parameter_dataset1_file', parameter_dataset1)
    np.save(subdir + str(i) + '/parameter_dataset2_file', parameter_dataset2)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)
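
# Hypothetical driver for the ship steering experiment above (not part of the
# original file; each experiment function lives in its own script, hence the
# __main__ guard). agent1 is built with the policy-gradient signature
# (policy, mdp_info, learning_rate, features) and agent2 with the episodic one
# (distribution, policy, mdp_info, learning_rate), so mushroom's GPOMDP and
# PGPE fit; the algorithm choice and every numeric setting below are
# assumptions. ep_per_run, eval_run, n_runs and n_iterations are the
# module-level globals the function reads.
if __name__ == '__main__':
    from mushroom.algorithms.policy_search import GPOMDP, PGPE
    from mushroom.utils.parameters import AdaptiveParameter

    ep_per_run = 20
    eval_run = 10
    n_runs = 25
    n_iterations = 10

    params = {'learning_rate_high': AdaptiveParameter(value=50),
              'learning_rate_low': AdaptiveParameter(value=5e-4)}
    server_experiment_small(alg_high=GPOMDP, alg_low=PGPE, params=params,
                            subdir='ship_steering_big/', i=0)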

def build_computational_graph(mdp, agent_low, agent_high,
                              ep_per_fit_low, ep_per_fit_high):
    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (pick distance to goal state var)',
                             phi=pick_first_state)

    # Function Block 2
    function_block2 = fBlock(name='f2 (build state)',
                             phi=angle_to_angle_diff_complete_state)

    # Function Block 3
    function_block3 = fBlock(name='f3 (reward low level)',
                             phi=lqr_cost_segway)

    # Function Block 4
    function_block4 = addBlock(name='f4 (add block)')

    # Function Block 5
    function_block5 = fBlock(name='f5 (fall punish low level)',
                             phi=fall_reward)

    # Control Block High
    parameter_callback1 = CollectDistributionParameter(agent_high.distribution)
    control_block_h = ControlBlock(name='Control Block High',
                                   agent=agent_high,
                                   n_eps_per_fit=ep_per_fit_high,
                                   callbacks=[parameter_callback1])

    # Control Block Low
    parameter_callback2 = CollectDistributionParameter(agent_low.distribution)
    control_block_l = ControlBlock(name='Control Block Low',
                                   agent=agent_low,
                                   n_eps_per_fit=ep_per_fit_low,
                                   callbacks=[parameter_callback2])

    control_block_h.set_mask()

    # Graph
    blocks = [state_ph, reward_ph, lastaction_ph, control_block_h,
              control_block_l, function_block1, function_block2,
              function_block3, function_block4, function_block5]

    state_ph.add_input(control_block_l)
    reward_ph.add_input(control_block_l)
    lastaction_ph.add_input(control_block_l)
    control_block_h.add_input(function_block1)
    control_block_h.add_reward(reward_ph)
    control_block_l.add_input(function_block2)
    control_block_l.add_reward(function_block4)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_block_h)
    function_block2.add_input(state_ph)
    function_block3.add_input(function_block2)
    function_block5.add_input(state_ph)
    function_block4.add_input(function_block3)
    function_block4.add_input(function_block5)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph, control_block_h
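
# Sketch (assumed usage, not from the original file) of how the graph and the
# returned high-level block handle are consumed: wrap the graph in a
# HierarchicalCore, keep the high level masked for a few warm-up epochs, then
# lift the mask. run_with_warmup, its defaults and the ep_per_fit values are
# hypothetical.
def run_with_warmup(mdp, agent_low, agent_high, n_epochs=10, warmup_epochs=2,
                    n_episodes=100, n_episodes_eval=10):
    graph, control_block_h = build_computational_graph(
        mdp, agent_low, agent_high, ep_per_fit_low=25, ep_per_fit_high=50)
    core = HierarchicalCore(graph)

    J_history = []
    for epoch in range(n_epochs):
        if epoch == warmup_epochs:
            # The low level has warmed up; start fitting the high level too
            control_block_h.unset_mask()
        core.learn(n_episodes=n_episodes, skip=True)
        dataset = core.evaluate(n_episodes=n_episodes_eval)
        J_history.append(np.mean(compute_J(dataset, gamma=mdp.info.gamma)))

    return J_history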

def segway_experiment(alg_high, alg_low, params_high, params_low):
    np.random.seed()

    # Model Block
    mdp = SegwayLinearMotion(goal_distance=1.0)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (pick distance to goal state var)',
                             phi=pick_first_state)

    # Function Block 2
    function_block2 = fBlock(name='f2 (build state)',
                             phi=angle_to_angle_diff_complete_state)

    # Function Block 3
    function_block3 = fBlock(name='f3 (reward low level)',
                             phi=lqr_cost_segway)

    # Function Block 4
    function_block4 = addBlock(name='f4 (add block)')

    # Function Block 5
    function_block5 = fBlock(name='f5 (fall punish low level)',
                             phi=fall_reward)

    # Approximator H
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(1,), output_shape=(1,))

    # Policy H
    n_weights = approximator1.weights_size
    mu1 = np.zeros(n_weights)
    sigma1 = 2.0e-2 * np.ones(n_weights)
    pi1 = DeterministicPolicy(approximator1)
    dist1 = GaussianDiagonalDistribution(mu1, sigma1)

    # Agent H
    lim = np.pi / 2
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(-lim, lim, (1,)),
                              gamma=mdp.info.gamma, horizon=mdp.info.horizon)
    agent_high = alg_high(dist1, pi1, mdp_info_agent1, **params_high)

    # Policy L
    approximator2 = Regressor(LinearApproximator,
                              input_shape=(3,), output_shape=(1,))
    n_weights2 = approximator2.weights_size
    mu2 = np.zeros(n_weights2)
    sigma2 = 2.0 * np.ones(n_weights2)
    pi2 = DeterministicControlPolicy(approximator2)
    dist2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent L
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
                                  low=mdp.info.observation_space.low[1:],  # FIXME FALSE
                                  high=mdp.info.observation_space.high[1:],  # FIXME FALSE
                                  shape=(3,)),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma, horizon=mdp.info.horizon)
    agent_low = alg_low(dist2, pi2, mdp_info_agent2, **params_low)

    # Control Block 1
    parameter_callback1 = CollectDistributionParameter(dist1)
    control_block1 = ControlBlock(name='Control Block High', agent=agent_high,
                                  n_eps_per_fit=n_ep_per_fit * 2,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(dist2)
    control_block2 = ControlBlock(name='Control Block Low', agent=agent_low,
                                  n_eps_per_fit=n_ep_per_fit,
                                  callbacks=[parameter_callback2])

    # Keep the high-level agent masked (not learning) during the first epochs
    control_block1.set_mask()

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block1,
              control_block2, function_block1, function_block2,
              function_block3, function_block4, function_block5]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(function_block1)
    control_block1.add_reward(reward_ph)
    control_block2.add_input(function_block2)
    control_block2.add_reward(function_block4)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_block1)
    function_block2.add_input(state_ph)
    function_block3.add_input(function_block2)
    function_block5.add_input(state_ph)
    function_block4.add_input(function_block3)
    function_block4.add_input(function_block5)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train (eval_run, n_epochs, n_iterations and n_ep_per_fit are
    # module-level experiment settings)
    dataset_eval_run = core.evaluate(n_episodes=eval_run, render=False)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start: ' + str(np.mean(J)))

    for n in range(n_epochs):
        print('ITERATION', n)
        if n == 2:
            # Enable high-level learning after two warm-up epochs
            control_block1.unset_mask()
        core.learn(n_episodes=n_iterations * n_ep_per_fit, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run, render=False)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        print('dist H:', dist1.get_parameters())
        print('dist L mu:', dist2.get_parameters()[:3])
        print('dist L sigma:', dist2.get_parameters()[3:])
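
# Hypothetical driver for segway_experiment, in the style of the ship steering
# driver above. Both agents are constructed with the episodic signature
# alg(distribution, policy, mdp_info, **params), so any of mushroom's
# black-box optimizers fits; REPS and every numeric setting below are
# assumptions.
if __name__ == '__main__':
    from mushroom.algorithms.policy_search import REPS

    eval_run = 10
    n_epochs = 25
    n_iterations = 10
    n_ep_per_fit = 20

    segway_experiment(alg_high=REPS, alg_low=REPS,
                      params_high={'eps': 0.05}, params_low={'eps': 0.05})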

def server_experiment_small(alg_high, alg_low, params, subdir, i):
    # Reseed from OS entropy so that parallel experiment instances differ
    np.random.seed()

    # Model Block
    mdp = SegwayLinearMotion(goal_distance=1.0)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (pick distance to goal state var)',
                             phi=pick_first_state)

    # Function Block 2
    function_block2 = fBlock(name='f2 (build state)',
                             phi=angle_to_angle_diff_complete_state)

    # Function Block 3
    function_block3 = fBlock(name='f3 (reward low level)',
                             phi=lqr_cost_segway)

    # Function Block 4
    function_block4 = addBlock(name='f4 (add block)')

    # Integrator Block
    error_acc = ErrorAccumulatorBlock(name='error acc')

    # Features
    features1 = Features(basis_list=[PolynomialBasis()])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features1.size,),
                              output_shape=(1,))

    # Policy 1
    n_weights = approximator1.weights_size
    mu1 = np.zeros(n_weights)
    sigma1 = 2e-0 * np.ones(n_weights)
    pi1 = DeterministicPolicy(approximator1)
    dist1 = GaussianDiagonalDistribution(mu1, sigma1)

    # Agent 1
    eps1 = params.get('eps')
    lim = 2 * np.pi
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, lim, (1,)),
                              gamma=mdp.info.gamma, horizon=20)
    agent1 = alg_high(distribution=dist1, policy=pi1, features=features1,
                      mdp_info=mdp_info_agent1, eps=eps1)

    # Policy 2
    basis = PolynomialBasis.generate(1, 3)
    features2 = Features(basis_list=basis)
    approximator2 = Regressor(LinearApproximator,
                              input_shape=(features2.size,),
                              output_shape=(1,))
    n_weights2 = approximator2.weights_size
    mu2 = np.zeros(n_weights2)
    sigma2 = 2e-0 * np.ones(n_weights2)
    pi2 = DeterministicPolicy(approximator2)
    dist2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 2
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
                                  low=np.array([-np.pi, -np.pi, -np.pi]),
                                  high=np.array([np.pi, np.pi, np.pi]),
                                  shape=(3,)),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma, horizon=30)
    agent2 = alg_low(distribution=dist2, policy=pi2, features=features2,
                     mdp_info=mdp_info_agent2, eps=eps1)

    # Control Block 1
    parameter_callback1 = CollectDistributionParameter(dist1)
    control_block1 = ControlBlock(name='Control Block 1', agent=agent1,
                                  n_eps_per_fit=ep_per_run,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(dist2)
    control_block2 = ControlBlock(name='Control Block 2', agent=agent2,
                                  n_eps_per_fit=20,
                                  callbacks=[parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block1,
              control_block2, function_block1, function_block2,
              function_block3, function_block4, error_acc, reward_acc]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_input(function_block1)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    control_block2.add_input(function_block2)
    control_block2.add_reward(function_block4)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_block1)
    function_block2.add_input(state_ph)
    function_block3.add_input(function_block2)
    error_acc.add_input(function_block3)
    error_acc.add_alarm_connection(control_block2)
    function_block4.add_input(function_block3)
    function_block4.add_input(error_acc)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train (ep_per_run, eval_run, n_runs and n_iterations are module-level
    # experiment settings)
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=eval_run, render=True)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start: ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run, render=True)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
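
    # Save. Unlike the ship steering variant, this function originally
    # persisted nothing (and collects no low-level dataset); the step below is
    # an assumed completion mirroring that variant's save block (same
    # callbacks, same mk_dir_recursive helper), not part of the original.
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()

    mk_dir_recursive('./' + subdir + str(i))
    np.save(subdir + str(i) + '/parameter_dataset1_file', parameter_dataset1)
    np.save(subdir + str(i) + '/parameter_dataset2_file', parameter_dataset2)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)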