def experiment_ghavamzade(alg_high, alg_low, params, subdir, i):
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # FeaturesH
    low_hi = 0
    lim_hi = 1000 + 1e-8
    n_tiles_high = [20, 20]
    n_tilings = 1

    # Discretization Block
    discretization_block = DiscretizationBlock(low=low_hi, high=lim_hi,
                                               n_tiles=n_tiles_high)

    # PolicyH
    epsilon = Parameter(value=0.1)
    piH = EpsGreedy(epsilon=epsilon)

    # AgentH
    learning_rate = params.get('learning_rate_high')
    mdp_info_agentH = MDPInfo(
        observation_space=spaces.Discrete(n_tiles_high[0] * n_tiles_high[1]),
        action_space=spaces.Discrete(8), gamma=1, horizon=10000)

    agentH = alg_high(policy=piH, mdp_info=mdp_info_agentH,
                      learning_rate=learning_rate, lambda_coeff=0.9)

    epsilon_update = EpsilonUpdate(piH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H', agent=agentH,
                                  n_steps_per_fit=1)

    # FeaturesL
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 10]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 3

    tilingsL = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles,
                              low=low, high=high)

    featuresL = Features(tilings=tilingsL)

    mdp_info_agentL = MDPInfo(
        observation_space=spaces.Box(low=np.array([0, 0]),
                                     high=np.array([150, 150]), shape=(2,)),
        action_space=mdp.info.action_space, gamma=0.99, horizon=10000)

    # Approximators
    input_shape = (featuresL.size,)
    approximator_params = dict(input_dim=input_shape[0])
    approximator1 = Regressor(LinearApproximator, input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)
    approximator2 = Regressor(LinearApproximator, input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)

    # Policy1
    std1 = np.array([3e-2])
    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=std1)

    # Policy2
    std2 = np.array([3e-2])
    pi2 = DiagonalGaussianPolicy(mu=approximator2, std=std2)

    # Agent1
    learning_rate1 = params.get('learning_rate_low')
    agent1 = alg_low(pi1, mdp_info_agentL, learning_rate1, featuresL)

    # Agent2
    learning_rate2 = params.get('learning_rate_low')
    agent2 = alg_low(pi2, mdp_info_agentL, learning_rate2, featuresL)

    # Termination Conds
    termination_condition1 = TerminationCondition(active_dir='+')
    termination_condition2 = TerminationCondition(active_dir='x')

    low_ep_per_fit = params.get('low_ep_per_fit')

    # Control Block +
    control_block_plus = ControlBlock(
        name='control block 1', agent=agent1, n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition1)

    # Control Block x
    control_block_cross = ControlBlock(
        name='control block 2', agent=agent2, n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition2)

    # Function Block 1: picks state for hi lev ctrl
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the env to low lev ctrl state
    function_block2 = fBlock(phi=rototranslate, name='f2 rotot')

    # Function Block 3: holds curr state as ref
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds hi lev rew
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds low lev rew
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: ext rew of hi lev ctrl
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: ext rew of low lev ctrl
    function_block7 = fBlock(phi=G_low, name='f7 G_lo')

    # Reward Accumulator H
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    # Selector Block
    function_block8 = fBlock(phi=selector_function, name='f8 selector')

    # Mux_Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block_plus])
    mux_block.add_block_list([control_block_cross])

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
              function_block1, function_block2, function_block3,
              function_block4, function_block5, function_block6,
              function_block7, function_block8, reward_acc_H,
              discretization_block]

    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block_plus)
    reward_acc_H.add_alarm_connection(control_block_cross)

    control_blockH.add_input(discretization_block)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block_plus)
    control_blockH.add_alarm_connection(control_block_cross)

    mux_block.add_input(function_block8)
    mux_block.add_input(function_block2)

    control_block_plus.add_reward(function_block5)
    control_block_cross.add_reward(function_block5)

    function_block1.add_input(state_ph)

    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)

    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block_plus)
    function_block3.add_alarm_connection(control_block_cross)

    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)

    function_block5.add_input(function_block7)

    function_block6.add_input(reward_ph)

    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    function_block8.add_input(control_blockH)

    discretization_block.add_input(function_block1)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    # print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    dataset_eval += dataset_eval_run
    print('J at start : ' + str(np.mean(J)))

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

        dataset_plus = control_block_plus.dataset.get()
        J_plus = compute_J(dataset_plus, mdp.info.gamma)
        dataset_cross = control_block_cross.dataset.get()
        J_cross = compute_J(dataset_cross, mdp.info.gamma)

        low_level_dataset_eval1.append(dataset_plus)
        low_level_dataset_eval2.append(dataset_cross)

        print('J ll PLUS at iteration ' + str(n) + ': '
              + str(np.mean(J_plus)))
        print('J ll CROSS at iteration ' + str(n) + ': '
              + str(np.mean(J_cross)))

        if n == 4:
            control_blockH.callbacks = [epsilon_update]

    # Tile data
    hi_lev_params = agentH.Q.table
    max_q_val = np.zeros(n_tiles_high[0] ** 2)
    act_max_q_val = np.zeros(n_tiles_high[0] ** 2)
    for n in range(n_tiles_high[0] ** 2):
        max_q_val[n] = np.amax(hi_lev_params[n])
        act_max_q_val[n] = np.argmax(hi_lev_params[n])

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset1_file',
            low_level_dataset_eval1)
    np.save(subdir + str(i) + '/low_level_dataset2_file',
            low_level_dataset_eval2)
    np.save(subdir + str(i) + '/max_q_val_tiled_file', max_q_val)
    np.save(subdir + str(i) + '/act_max_q_val_tiled_file', act_max_q_val)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)

    return
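
# --- Illustration (not part of the original experiment) ---
# A minimal sketch, assuming the DiscretizationBlock above bins the ship
# position in [0, 1000]^2 into a 20x20 grid, of how a continuous position
# maps to the discrete state index that agentH's spaces.Discrete(400)
# observation space expects. The flattening order and edge handling of the
# real DiscretizationBlock may differ.
def discretize_position_sketch(position, low=0.0, high=1000.0 + 1e-8,
                               n_tiles=(20, 20)):
    position = np.asarray(position, dtype=float)
    # Bin each coordinate into its tile index, clipping to the grid
    idx = np.floor((position - low) / (high - low) * np.array(n_tiles))
    idx = np.clip(idx.astype(int), 0, np.array(n_tiles) - 1)
    # Row-major flattening: (x_tile, y_tile) -> single discrete state
    return int(idx[0] * n_tiles[1] + idx[1])

# Example: discretize_position_sketch([450., 980.]) -> tile (9, 19) -> 199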
def build_computational_graph(mdp, agent_low, agent_m0, agent_m1, agent_m2,
                              agent_m3, agent_high, ep_per_fit_low,
                              ep_per_fit_mid):
    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 0
    function_block0 = fBlock(name='f0 (state build for high level)',
                             phi=hi_lev_state)

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)',
                             phi=pos_ref_angle_difference)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    # External reward blocks
    reward_m0 = MidReward(gate_no=0)
    reward_m1 = MidReward(gate_no=1)
    reward_m2 = MidReward(gate_no=2)
    reward_m3 = MidReward(gate_no=3)
    reward_blockm0 = fBlock(name='rm0 (reward m0)', phi=reward_m0)
    reward_blockm1 = fBlock(name='rm1 (reward m1)', phi=reward_m1)
    reward_blockm2 = fBlock(name='rm2 (reward m2)', phi=reward_m2)
    reward_blockm3 = fBlock(name='rm3 (reward m3)', phi=reward_m3)

    # Control Block H
    control_block_h = ControlBlock(name='Control Block H', agent=agent_high,
                                   n_steps_per_fit=1)

    # Control Block M0
    termination_condition_m0 = TerminationCondition(gate_no=0)
    control_block_m0 = ControlBlock(
        name='Control Block M0', agent=agent_m0,
        n_eps_per_fit=ep_per_fit_mid,
        termination_condition=termination_condition_m0)

    # Control Block M1
    termination_condition_m1 = TerminationCondition(gate_no=1)
    control_block_m1 = ControlBlock(
        name='Control Block M1', agent=agent_m1,
        n_eps_per_fit=ep_per_fit_mid,
        termination_condition=termination_condition_m1)

    # Control Block M2
    termination_condition_m2 = TerminationCondition(gate_no=2)
    control_block_m2 = ControlBlock(
        name='Control Block M2', agent=agent_m2,
        n_eps_per_fit=ep_per_fit_mid,
        termination_condition=termination_condition_m2)

    # Control Block M3
    termination_condition_m3 = TerminationCondition(gate_no=3)
    control_block_m3 = ControlBlock(
        name='Control Block M3', agent=agent_m3,
        n_eps_per_fit=ep_per_fit_mid,
        termination_condition=termination_condition_m3)

    # Control Block L
    termination_condition_low = TerminationConditionLow(mdp.small)
    control_block_l = ControlBlock(
        name='Control Block L', agent=agent_low,
        n_eps_per_fit=ep_per_fit_low,
        termination_condition=termination_condition_low)

    # Selector Block
    mux_block = MuxBlock(name='Mux Block')
    mux_block.add_block_list([control_block_m0])
    mux_block.add_block_list([control_block_m1])
    mux_block.add_block_list([control_block_m2])
    mux_block.add_block_list([control_block_m3])

    # Reward Accumulators
    reward_acc = mean_reward_block(name='reward_acc_h')
    reward_acc_m0 = reward_accumulator_block(gamma=mdp.info.gamma,
                                             name='reward_acc_m0')
    reward_acc_m1 = reward_accumulator_block(gamma=mdp.info.gamma,
                                             name='reward_acc_m1')
    reward_acc_m2 = reward_accumulator_block(gamma=mdp.info.gamma,
                                             name='reward_acc_m2')
    reward_acc_m3 = reward_accumulator_block(gamma=mdp.info.gamma,
                                             name='reward_acc_m3')

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block_h,
              reward_acc, control_block_l, function_block0, function_block1,
              function_block2, reward_blockm0, reward_blockm1,
              reward_blockm2, reward_blockm3, reward_acc_m0, reward_acc_m1,
              reward_acc_m2, reward_acc_m3, mux_block]

    state_ph.add_input(control_block_l)
    reward_ph.add_input(control_block_l)
    lastaction_ph.add_input(control_block_l)

    control_block_h.add_input(function_block0)
    control_block_h.add_reward(reward_acc)
    control_block_h.add_alarm_connection(control_block_m0)
    control_block_h.add_alarm_connection(control_block_m1)
    control_block_h.add_alarm_connection(control_block_m2)
    control_block_h.add_alarm_connection(control_block_m3)

    mux_block.add_input(control_block_h)
    mux_block.add_input(state_ph)

    control_block_m0.add_reward(reward_acc_m0)
    control_block_m0.add_alarm_connection(control_block_l)
    control_block_m1.add_reward(reward_acc_m1)
    control_block_m1.add_alarm_connection(control_block_l)
    control_block_m2.add_reward(reward_acc_m2)
    control_block_m2.add_alarm_connection(control_block_l)
    control_block_m3.add_reward(reward_acc_m3)
    control_block_m3.add_alarm_connection(control_block_l)

    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block_m0)
    reward_acc.add_alarm_connection(control_block_m1)
    reward_acc.add_alarm_connection(control_block_m2)
    reward_acc.add_alarm_connection(control_block_m3)

    reward_acc_m0.add_input(reward_blockm0)
    reward_acc_m0.add_alarm_connection(control_block_l)
    reward_acc_m1.add_input(reward_blockm1)
    reward_acc_m1.add_alarm_connection(control_block_l)
    reward_acc_m2.add_input(reward_blockm2)
    reward_acc_m2.add_alarm_connection(control_block_l)
    reward_acc_m3.add_input(reward_blockm3)
    reward_acc_m3.add_alarm_connection(control_block_l)

    reward_blockm3.add_input(state_ph)
    reward_blockm2.add_input(state_ph)
    reward_blockm1.add_input(state_ph)
    reward_blockm0.add_input(state_ph)

    function_block0.add_input(state_ph)

    function_block1.add_input(mux_block)
    function_block1.add_input(state_ph)

    function_block2.add_input(function_block1)

    control_block_l.add_input(function_block1)
    control_block_l.add_reward(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph, control_block_h
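
# --- Usage sketch (assumption, mirroring the experiment loops in this file) ---
# How the graph returned by build_computational_graph above could be driven
# with a HierarchicalCore. The agents, episode counts and epoch numbers here
# are placeholders supplied by the caller, not values from the original code.
def run_gate_experiment_sketch(mdp, agent_low, agents_mid, agent_high,
                               ep_per_fit_low, ep_per_fit_mid,
                               n_epochs=10, n_train_episodes=100,
                               n_eval_episodes=10):
    # agents_mid is a 4-tuple (agent_m0, agent_m1, agent_m2, agent_m3)
    graph, control_block_h = build_computational_graph(
        mdp, agent_low, *agents_mid, agent_high,
        ep_per_fit_low, ep_per_fit_mid)
    core = HierarchicalCore(graph)
    for epoch in range(n_epochs):
        core.learn(n_episodes=n_train_episodes, skip=True)
        dataset = core.evaluate(n_episodes=n_eval_episodes)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        print('J at epoch ' + str(epoch) + ': ' + str(np.mean(J)))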
def server_experiment_small(alg_high, alg_low, params, subdir, i):
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)',
                             phi=pos_ref_angle_difference)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    # Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([255, 255])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size,),
                              output_shape=(2,))
    approximator1.set_weights(np.array([500, 500]))

    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=sigma1)

    # Policy 2
    pi2 = DeterministicControlPolicy(weights=np.array([0]))
    mu2 = np.zeros(pi2.weights_size)
    sigma2 = 1e-3 * np.ones(pi2.weights_size)
    distribution2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 1
    learning_rate1 = params.get('learning_rate_high')
    lim = 1000
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, lim, (2,)),
                              gamma=mdp.info.gamma, horizon=100)
    agent1 = alg_high(policy=pi1, mdp_info=mdp_info_agent1,
                      learning_rate=learning_rate1, features=features)

    # Agent 2
    learning_rate2 = params.get('learning_rate_low')
    mdp_info_agent2 = MDPInfo(
        observation_space=spaces.Box(-np.pi, np.pi, (1,)),
        action_space=mdp.info.action_space,
        gamma=mdp.info.gamma, horizon=100)
    agent2 = alg_low(distribution=distribution2, policy=pi2,
                     mdp_info=mdp_info_agent2, learning_rate=learning_rate2)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1', agent=agent1,
                                  n_eps_per_fit=ep_per_run,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(distribution2)
    control_block2 = ControlBlock(name='Control Block 2', agent=agent2,
                                  n_eps_per_fit=10,
                                  callbacks=[parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block1,
              control_block2, function_block1, function_block2, reward_acc]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=eval_run)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        low_level_dataset_eval += control_block2.dataset.get()

    # Save
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset_file',
            low_level_dataset_eval)
    np.save(subdir + str(i) + '/parameter_dataset1_file', parameter_dataset1)
    np.save(subdir + str(i) + '/parameter_dataset2_file', parameter_dataset2)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)
def build_computational_graph(mdp, agent_low, agent_high, ep_per_fit_low,
                              low_level_callbacks=[]):
    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    function_block1 = fBlock(name='pick position', phi=pick_position)
    hold_block = hold_state(name='holdstate')
    function_block2 = fBlock(name='compute setpoint', phi=compute_stepoint)
    function_block3 = fBlock(name='angle and distance', phi=polar_error)
    function_block4 = fBlock(name='reward low level', phi=reward_low_level)

    reward_acc = mean_reward_block(name='mean reward')

    control_block_h = ControlBlock(name='Control Block H', agent=agent_high,
                                   n_steps_per_fit=1)

    control_block_l = ControlBlock(name='Control Block L', agent=agent_low,
                                   n_eps_per_fit=ep_per_fit_low,
                                   callbacks=low_level_callbacks)

    # hold_block is wired below, so it must be registered with the graph
    # like every other block
    blocks = [state_ph, reward_ph, lastaction_ph, control_block_h,
              control_block_l, function_block1, function_block2,
              function_block3, function_block4, reward_acc, hold_block]

    state_ph.add_input(control_block_l)
    reward_ph.add_input(control_block_l)
    lastaction_ph.add_input(control_block_l)

    function_block1.add_input(state_ph)

    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block_l)

    control_block_h.add_input(function_block1)
    control_block_h.add_reward(reward_acc)
    control_block_h.add_alarm_connection(control_block_l)

    hold_block.add_input(state_ph)
    hold_block.add_alarm_connection(control_block_l)

    function_block2.add_input(state_ph)
    function_block2.add_input(hold_block)
    function_block2.add_input(control_block_h)

    function_block3.add_input(state_ph)
    function_block3.add_input(function_block2)

    function_block4.add_input(function_block3)

    control_block_l.add_input(function_block3)
    control_block_l.add_reward(function_block4)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph
def build_computational_graph(mdp, agent_low, agent_high, ep_per_fit_low,
                              ep_per_fit_high):
    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (pick distance to goal state var)',
                             phi=pick_first_state)

    # Function Block 2
    function_block2 = fBlock(name='f2 (build state)',
                             phi=angle_to_angle_diff_complete_state)

    # Function Block 3
    function_block3 = fBlock(name='f3 (reward low level)',
                             phi=lqr_cost_segway)

    # Function Block 4
    function_block4 = addBlock(name='f4 (add block)')

    # Function Block 5
    function_block5 = fBlock(name='f5 (fall punish low level)',
                             phi=fall_reward)

    # Control Block 1
    parameter_callback1 = CollectDistributionParameter(
        agent_high.distribution)
    control_block_h = ControlBlock(name='Control Block High',
                                   agent=agent_high,
                                   n_eps_per_fit=ep_per_fit_high,
                                   callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(
        agent_low.distribution)
    control_block_l = ControlBlock(name='Control Block Low', agent=agent_low,
                                   n_eps_per_fit=ep_per_fit_low,
                                   callbacks=[parameter_callback2])

    control_block_h.set_mask()

    # Graph
    blocks = [state_ph, reward_ph, lastaction_ph, control_block_h,
              control_block_l, function_block1, function_block2,
              function_block3, function_block4, function_block5]

    state_ph.add_input(control_block_l)
    reward_ph.add_input(control_block_l)
    lastaction_ph.add_input(control_block_l)
    control_block_h.add_input(function_block1)
    control_block_h.add_reward(reward_ph)
    control_block_l.add_input(function_block2)
    control_block_l.add_reward(function_block4)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_block_h)
    function_block2.add_input(state_ph)
    function_block3.add_input(function_block2)
    function_block5.add_input(state_ph)
    function_block4.add_input(function_block3)
    function_block4.add_input(function_block5)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph, control_block_h
def segway_experiment(alg_high, alg_low, params_high, params_low):
    np.random.seed()

    # Model Block
    mdp = SegwayLinearMotion(goal_distance=1.0)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (pick distance to goal state var)',
                             phi=pick_first_state)

    # Function Block 2
    function_block2 = fBlock(name='f2 (build state)',
                             phi=angle_to_angle_diff_complete_state)

    # Function Block 3
    function_block3 = fBlock(name='f3 (reward low level)',
                             phi=lqr_cost_segway)

    # Function Block 4
    function_block4 = addBlock(name='f4 (add block)')

    # Function Block 5
    function_block5 = fBlock(name='f5 (fall punish low level)',
                             phi=fall_reward)

    # Approximator H
    approximator1 = Regressor(LinearApproximator, input_shape=(1,),
                              output_shape=(1,))

    # Policy H
    n_weights = approximator1.weights_size
    mu1 = np.zeros(n_weights)
    sigma1 = 2.0e-2 * np.ones(n_weights)
    pi1 = DeterministicPolicy(approximator1)
    dist1 = GaussianDiagonalDistribution(mu1, sigma1)

    # Agent H
    lim = np.pi / 2
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(-lim, lim, (1,)),
                              gamma=mdp.info.gamma,
                              horizon=mdp.info.horizon)
    agent_high = alg_high(dist1, pi1, mdp_info_agent1, **params_high)

    # Policy L
    approximator2 = Regressor(LinearApproximator, input_shape=(3,),
                              output_shape=(1,))
    n_weights2 = approximator2.weights_size
    mu2 = np.zeros(n_weights2)
    sigma2 = 2.0 * np.ones(n_weights2)
    pi2 = DeterministicControlPolicy(approximator2)
    dist2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent Low
    mdp_info_agent2 = MDPInfo(
        observation_space=spaces.Box(
            low=mdp.info.observation_space.low[1:],    # FIXME FALSE
            high=mdp.info.observation_space.high[1:],  # FIXME FALSE
            shape=(3,)),
        action_space=mdp.info.action_space,
        gamma=mdp.info.gamma, horizon=mdp.info.horizon)
    agent_low = alg_low(dist2, pi2, mdp_info_agent2, **params_low)

    # Control Block 1
    parameter_callback1 = CollectDistributionParameter(dist1)
    control_block1 = ControlBlock(name='Control Block High',
                                  agent=agent_high,
                                  n_eps_per_fit=n_ep_per_fit * 2,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(dist2)
    control_block2 = ControlBlock(name='Control Block Low', agent=agent_low,
                                  n_eps_per_fit=n_ep_per_fit,
                                  callbacks=[parameter_callback2])

    control_block1.set_mask()

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block1,
              control_block2, function_block1, function_block2,
              function_block3, function_block4, function_block5]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(function_block1)
    control_block1.add_reward(reward_ph)
    control_block2.add_input(function_block2)
    control_block2.add_reward(function_block4)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_block1)
    function_block2.add_input(state_ph)
    function_block3.add_input(function_block2)
    function_block5.add_input(state_ph)
    function_block4.add_input(function_block3)
    function_block4.add_input(function_block5)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    dataset_eval_run = core.evaluate(n_episodes=eval_run, render=False)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    mask_done = False
    for n in range(n_epochs):
        print('ITERATION', n)
        if n == 2:
            control_block1.unset_mask()
        core.learn(n_episodes=n_iterations * n_ep_per_fit, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run, render=False)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        print('dist H:', dist1.get_parameters())
        print('dist L mu:', dist2.get_parameters()[:3])
        print('dist L sigma:', dist2.get_parameters()[3:])
def experiment():
    small = True
    print('ENV IS SMALL? ', small)
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=small, hard=True, n_steps_action=3)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # FeaturesH
    lim = 150 if small else 1000
    tilingsH = Tiles.generate(n_tilings=1, n_tiles=[5, 5],
                              low=[0, 0], high=[lim, lim])
    featuresH = Features(tilings=tilingsH)

    # PolicyH
    epsilon = LinearDecayParameter(value=0.1, min_value=0.0, n=10000)
    piH = EpsGreedy(epsilon=epsilon)

    # AgentH
    learning_rate = Parameter(value=1)

    mdp_info_agentH = MDPInfo(
        observation_space=spaces.Box(low=np.array([0, 0]),
                                     high=np.array([lim, lim]), shape=(2,)),
        action_space=spaces.Discrete(8), gamma=1, horizon=10000)

    approximator_paramsH = dict(
        input_shape=(featuresH.size,),
        output_shape=mdp_info_agentH.action_space.size,
        n_actions=mdp_info_agentH.action_space.n)

    agentH = TrueOnlineSARSALambda(policy=piH, mdp_info=mdp_info_agentH,
                                   learning_rate=learning_rate,
                                   lambda_coeff=0.9,
                                   approximator_params=approximator_paramsH,
                                   features=featuresH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H', agent=agentH,
                                  n_steps_per_fit=1)

    # FeaturesL
    featuresL = Features(basis_list=[PolynomialBasis()])

    # Policy1
    input_shape = (featuresL.size,)
    approximator_params = dict(input_dim=input_shape[0])
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             **approximator_params)
    sigma = np.array([[1.3e-2]])
    pi1 = GaussianPolicy(mu=approximator, sigma=sigma)

    # Policy2
    pi2 = GaussianPolicy(mu=approximator, sigma=sigma)

    # Agent1
    learning_rate1 = AdaptiveParameter(value=1e-5)
    agent1 = GPOMDP(pi1, mdp.info, learning_rate1, featuresL)

    # Agent2
    learning_rate2 = AdaptiveParameter(value=1e-5)
    agent2 = GPOMDP(pi2, mdp.info, learning_rate2, featuresL)

    # Termination Conds
    termination_condition1 = TerminationCondition(active_dir=1, small=small)
    termination_condition2 = TerminationCondition(active_dir=5, small=small)

    # Control Block +
    control_block1 = ControlBlock(
        name='control block 1', agent=agent1, n_eps_per_fit=50,
        termination_condition=termination_condition1)

    # Control Block x
    control_block2 = ControlBlock(
        name='control block 2', agent=agent2, n_eps_per_fit=50,
        termination_condition=termination_condition2)

    # Function Block 1: picks state for hi lev ctrl
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the env to low lev ctrl state
    function_block2 = fBlock(phi=rototranslate(small=small), name='f2 rotot')

    # Function Block 3: holds curr state as ref
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds hi lev rew
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds low lev rew
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: ext rew of hi lev ctrl
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: ext rew of low lev ctrl
    function_block7 = fBlock(phi=G_low(small=small), name='f7 G_lo')

    # Reward Accumulator H
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    # Mux_Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block1])
    mux_block.add_block_list([control_block2])

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
              function_block1, function_block2, function_block3,
              function_block4, function_block5, function_block6,
              function_block7, reward_acc_H]

    # state_ph.add_input(mux_block)
    # reward_ph.add_input(mux_block)
    # lastaction_ph.add_input(mux_block)
    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block1)
    reward_acc_H.add_alarm_connection(control_block2)
    control_blockH.add_input(function_block1)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block1)
    control_blockH.add_alarm_connection(control_block2)
    mux_block.add_input(control_blockH)
    mux_block.add_input(function_block2)
    control_block1.add_reward(function_block5)
    control_block2.add_reward(function_block5)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)
    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block1)
    function_block3.add_alarm_connection(control_block2)
    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)
    function_block5.add_input(reward_ph)
    function_block5.add_input(function_block7)
    function_block6.add_input(reward_ph)
    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    dataset_eval_visual = list()
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()
    n_runs = 5

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=1000, skip=True)
        dataset_eval = core.evaluate(n_episodes=10)
        last_ep_dataset = pick_last_ep(dataset_eval)
        dataset_eval_visual += last_ep_dataset
        low_level_dataset_eval1 += control_block1.dataset.get()
        low_level_dataset_eval2 += control_block2.dataset.get()

    # Visualize
    hi_lev_params = agentH.Q.get_weights()
    hi_lev_params = np.reshape(hi_lev_params, (8, 25))
    max_q_val = np.zeros(shape=(25,))
    act_max_q_val = np.zeros(shape=(25,))
    for i in range(25):
        max_q_val[i] = np.amax(hi_lev_params[:, i])
        act_max_q_val[i] = np.argmax(hi_lev_params[:, i])
    max_q_val_tiled = np.reshape(max_q_val, (5, 5))
    act_max_q_val_tiled = np.reshape(act_max_q_val, (5, 5))

    # low_level_dataset1 = dataset_callback1.get()
    # low_level_dataset2 = dataset_callback2.get()

    subdir = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '/'
    mk_dir_recursive('./' + subdir)

    np.save(subdir + '/low_level_dataset1_file', low_level_dataset_eval1)
    np.save(subdir + '/low_level_dataset2_file', low_level_dataset_eval2)
    np.save(subdir + '/max_q_val_tiled_file', max_q_val_tiled)
    np.save(subdir + '/act_max_q_val_tiled_file', act_max_q_val_tiled)
    np.save(subdir + '/dataset_eval_file', dataset_eval_visual)

    return
def server_experiment_small(alg_high, alg_low, params, subdir, i):
    np.random.seed()

    # Model Block
    mdp = SegwayLinearMotion(goal_distance=1.0)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (pick distance to goal state var)',
                             phi=pick_first_state)

    # Function Block 2
    function_block2 = fBlock(name='f2 (build state)',
                             phi=angle_to_angle_diff_complete_state)

    # Function Block 3
    function_block3 = fBlock(name='f3 (reward low level)',
                             phi=lqr_cost_segway)

    # Function Block 4
    function_block4 = addBlock(name='f4 (add block)')

    # Integrator Block
    error_acc = ErrorAccumulatorBlock(name='error acc')

    # Features
    features1 = Features(basis_list=[PolynomialBasis()])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features1.size,),
                              output_shape=(1,))

    # Policy 1
    n_weights = approximator1.weights_size
    mu1 = np.zeros(n_weights)
    sigma1 = 2e-0 * np.ones(n_weights)
    pi1 = DeterministicPolicy(approximator1)
    dist1 = GaussianDiagonalDistribution(mu1, sigma1)

    # Agent 1
    eps1 = params.get('eps')
    lim = 2 * np.pi
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, lim, (1,)),
                              gamma=mdp.info.gamma, horizon=20)
    agent1 = alg_low(distribution=dist1, policy=pi1, features=features1,
                     mdp_info=mdp_info_agent1, eps=eps1)

    # Policy 2
    basis = PolynomialBasis.generate(1, 3)
    features2 = Features(basis_list=basis)
    approximator2 = Regressor(LinearApproximator,
                              input_shape=(features2.size,),
                              output_shape=(1,))
    n_weights2 = approximator2.weights_size
    mu2 = np.zeros(n_weights2)
    sigma2 = 2e-0 * np.ones(n_weights2)
    pi2 = DeterministicPolicy(approximator2)
    dist2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 2
    mdp_info_agent2 = MDPInfo(
        observation_space=spaces.Box(low=np.array([-np.pi, -np.pi, -np.pi]),
                                     high=np.array([np.pi, np.pi, np.pi]),
                                     shape=(3,)),
        action_space=mdp.info.action_space,
        gamma=mdp.info.gamma, horizon=30)
    agent2 = alg_low(distribution=dist2, policy=pi2, features=features2,
                     mdp_info=mdp_info_agent2, eps=eps1)

    # Control Block 1
    parameter_callback1 = CollectDistributionParameter(dist1)
    control_block1 = ControlBlock(name='Control Block 1', agent=agent1,
                                  n_eps_per_fit=ep_per_run,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(dist2)
    control_block2 = ControlBlock(name='Control Block 2', agent=agent2,
                                  n_eps_per_fit=20,
                                  callbacks=[parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block1,
              control_block2, function_block1, function_block2,
              function_block3, function_block4, error_acc, reward_acc]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_input(function_block1)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    control_block2.add_input(function_block2)
    control_block2.add_reward(function_block4)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_block1)
    function_block2.add_input(state_ph)
    function_block3.add_input(function_block2)
    error_acc.add_input(function_block3)
    error_acc.add_alarm_connection(control_block2)
    function_block4.add_input(function_block3)
    function_block4.add_input(error_acc)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=eval_run, render=True)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run, render=True)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
def build_computational_graph(mdp, agent_low, agent_high, ep_per_fit_low,
                              ep_per_fit_high):
    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)',
                             phi=pos_ref_angle_difference)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    # Control Block H
    control_block_h = ControlBlock(name='Control Block H', agent=agent_high,
                                   n_eps_per_fit=ep_per_fit_high)

    # Termination condition
    termination_condition = TerminationCondition(mdp._small)

    # Control Block 2
    control_block_l = ControlBlock(
        name='Control Block L', agent=agent_low,
        n_eps_per_fit=ep_per_fit_low,
        termination_condition=termination_condition)

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp.info.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block_h,
              control_block_l, function_block1, function_block2, reward_acc]

    state_ph.add_input(control_block_l)
    reward_ph.add_input(control_block_l)
    lastaction_ph.add_input(control_block_l)
    control_block_h.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block_l)
    control_block_h.add_reward(reward_acc)
    control_block_h.add_alarm_connection(control_block_l)
    function_block1.add_input(control_block_h)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    # control_block_l.add_input(control_block_h)
    # control_block_l.add_input(state_ph)
    control_block_l.add_input(function_block1)
    control_block_l.add_reward(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph
def build_computational_graph(mdp, agent_low, agent_high, ep_per_fit_low,
                              ep_per_fit_high):
    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # FeaturesH
    low_hi = 0
    lim_hi = mdp.field_size + 1e-8
    n_tiles_high = [20, 20]

    # Discretization Block
    discretization_block = DiscretizationBlock(low=low_hi, high=lim_hi,
                                               n_tiles=n_tiles_high)

    # Function Block 0
    function_block0 = fBlock(name='f0 (state build for high level)',
                             phi=hi_lev_state)

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)',
                             phi=angle_ref_angle_difference)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    # Function Block 3
    function_block3 = fBlock(name='f3 (compute pos ref)',
                             phi=compute_pos_ref)

    # Function Block 4
    function_block4 = fBlock(name='f4 (compute angle ref)',
                             phi=generate_angle_ref)

    # Control Block H
    control_block_h = ControlBlock(name='Control Block H', agent=agent_high,
                                   n_steps_per_fit=1)

    # Control Block L
    term_cond_low = TerminationConditionLow(small=mdp.small)
    control_block_l = ControlBlock(name='Control Block L', agent=agent_low,
                                   n_eps_per_fit=ep_per_fit_low,
                                   termination_condition=term_cond_low)

    # Reward Accumulators
    reward_acc = reward_accumulator_block(gamma=mdp.info.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block_h,
              reward_acc, control_block_l, discretization_block,
              function_block0, function_block1, function_block2,
              function_block3, function_block4]

    state_ph.add_input(control_block_l)
    reward_ph.add_input(control_block_l)
    lastaction_ph.add_input(control_block_l)

    discretization_block.add_input(function_block0)
    function_block4.add_input(control_block_h)

    control_block_h.add_input(discretization_block)
    control_block_h.add_reward(reward_acc)
    control_block_h.add_alarm_connection(control_block_l)

    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block_l)

    function_block0.add_input(state_ph)

    function_block1.add_input(function_block4)
    function_block1.add_input(function_block3)
    function_block1.add_input(state_ph)

    function_block2.add_input(function_block1)

    function_block3.add_input(function_block4)
    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block_l)

    control_block_l.add_input(function_block1)
    control_block_l.add_reward(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph, control_block_h
def build_ghavamzadeh_graph(mdp, agent_plus, agent_cross, agent_high,
                            ep_per_fit_low):
    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # FeaturesH
    low_hi = 0
    lim_hi = 1000 + 1e-8
    n_tiles_high = [20, 20]

    # Discretization Block
    discretization_block = DiscretizationBlock(low=low_hi, high=lim_hi,
                                               n_tiles=n_tiles_high)

    # Control Block H
    control_blockH = ControlBlock(name='control block H', agent=agent_high,
                                  n_steps_per_fit=1)

    # Termination Conds
    termination_condition1 = TerminationCondition(active_dir='+')
    termination_condition2 = TerminationCondition(active_dir='x')

    # Control Block +
    control_block_plus = ControlBlock(
        name='control block 1', agent=agent_plus,
        n_eps_per_fit=ep_per_fit_low,
        termination_condition=termination_condition1)

    # Control Block x
    control_block_cross = ControlBlock(
        name='control block 2', agent=agent_cross,
        n_eps_per_fit=ep_per_fit_low,
        termination_condition=termination_condition2)

    # Function Block 1: picks state for hi lev ctrl
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the env to low lev ctrl state
    function_block2 = fBlock(phi=rototranslate, name='f2 rotot')

    # Function Block 3: holds curr state as ref
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds hi lev rew
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds low lev rew
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: ext rew of hi lev ctrl
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: ext rew of low lev ctrl
    function_block7 = fBlock(phi=G_low, name='f7 G_lo')

    # Reward Accumulator H
    reward_acc_H = reward_accumulator_block(gamma=1.0, name='reward_acc_H')

    # Selector Block
    function_block8 = fBlock(phi=selector_function, name='f8 selector')

    # Mux_Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block_plus])
    mux_block.add_block_list([control_block_cross])

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
              function_block1, function_block2, function_block3,
              function_block4, function_block5, function_block6,
              function_block7, function_block8, reward_acc_H,
              discretization_block]

    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block_plus)
    reward_acc_H.add_alarm_connection(control_block_cross)

    control_blockH.add_input(discretization_block)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block_plus)
    control_blockH.add_alarm_connection(control_block_cross)

    mux_block.add_input(function_block8)
    mux_block.add_input(function_block2)

    control_block_plus.add_reward(function_block5)
    control_block_cross.add_reward(function_block5)

    function_block1.add_input(state_ph)

    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)

    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block_plus)
    function_block3.add_alarm_connection(control_block_cross)

    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)

    function_block5.add_input(function_block7)

    function_block6.add_input(reward_ph)

    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    function_block8.add_input(control_blockH)

    discretization_block.add_input(function_block1)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph, control_blockH
def build_computational_graph_discretized(mdp, agent, n_actions):
    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    control_block = ControlBlock(name='Control Block H', agent=agent,
                                 n_steps_per_fit=1)

    function_block = fBlock(name='Action converter',
                            phi=ActionConverter(n_actions, mdp.info))

    blocks = [state_ph, reward_ph, lastaction_ph, control_block,
              function_block]

    state_ph.add_input(function_block)
    reward_ph.add_input(function_block)
    lastaction_ph.add_input(function_block)
    control_block.add_input(state_ph)
    control_block.add_reward(reward_ph)
    function_block.add_input(control_block)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph
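
# --- Sketch of the action converter (assumption) ---
# ActionConverter is referenced above but not defined in this section. One
# plausible shape for it, assuming fBlock passes the discrete action index
# produced by the agent to phi and expects the matching continuous action
# back, is a uniform grid over the MDP's Box action space. The real class
# and the exact phi call convention may differ.
class ActionConverterSketch:
    def __init__(self, n_actions, mdp_info):
        low = mdp_info.action_space.low
        high = mdp_info.action_space.high
        # Pre-compute n_actions evenly spaced points spanning the action box
        self._actions = [low + (high - low) * k / (n_actions - 1)
                         for k in range(n_actions)]

    def __call__(self, action):
        # Interpret the incoming value as a discrete index and return the
        # corresponding continuous action
        k = int(np.asarray(action).ravel()[0])
        return self._actions[k]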
def build_computational_graph_baseline(mdp, agent, ep_per_fit,
                                       low_level_callbacks=[]):
    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    function_block1 = fBlock(name='compute setpoint', phi=compute_stepoint)
    function_block2 = fBlock(name='angle and distance', phi=polar_error)
    function_block3 = fBlock(name='reward low level', phi=reward_low_level)

    control_block = ControlBlock(name='Control Block', agent=agent,
                                 n_eps_per_fit=ep_per_fit,
                                 callbacks=low_level_callbacks)

    blocks = [state_ph, reward_ph, lastaction_ph, function_block1,
              function_block2, function_block3, control_block]

    state_ph.add_input(control_block)
    reward_ph.add_input(control_block)
    lastaction_ph.add_input(control_block)
    function_block1.add_input(state_ph)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block1)
    function_block3.add_input(function_block2)
    control_block.add_input(function_block2)
    control_block.add_reward(function_block3)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph