Example #1
def experiment_ghavamzade(alg_high, alg_low, params, subdir, i):

    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    #Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # FeaturesH
    low_hi = 0
    lim_hi = 1000 + 1e-8
    n_tiles_high = [20, 20]
    n_tilings = 1

    # Discretization Block
    discretization_block = DiscretizationBlock(low=low_hi,
                                               high=lim_hi,
                                               n_tiles=n_tiles_high)

    # PolicyH
    epsilon = Parameter(value=0.1)
    piH = EpsGreedy(epsilon=epsilon)

    # AgentH
    learning_rate = params.get('learning_rate_high')

    mdp_info_agentH = MDPInfo(observation_space=spaces.Discrete(
        n_tiles_high[0] * n_tiles_high[1]),
                              action_space=spaces.Discrete(8),
                              gamma=1,
                              horizon=10000)

    agentH = alg_high(policy=piH,
                      mdp_info=mdp_info_agentH,
                      learning_rate=learning_rate,
                      lambda_coeff=0.9)

    epsilon_update = EpsilonUpdate(piH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H',
                                  agent=agentH,
                                  n_steps_per_fit=1)

    #FeaturesL
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 10]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 3

    tilingsL = Tiles.generate(n_tilings=n_tilings,
                              n_tiles=n_tiles,
                              low=low,
                              high=high)

    featuresL = Features(tilings=tilingsL)

    mdp_info_agentL = MDPInfo(observation_space=spaces.Box(
        low=np.array([0, 0]), high=np.array([150, 150]), shape=(2, )),
                              action_space=mdp.info.action_space,
                              gamma=0.99,
                              horizon=10000)

    # Approximators
    input_shape = (featuresL.size, )

    approximator_params = dict(input_dim=input_shape[0])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)
    approximator2 = Regressor(LinearApproximator,
                              input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)

    # Policy1
    std1 = np.array([3e-2])
    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=std1)

    # Policy2
    std2 = np.array([3e-2])
    pi2 = DiagonalGaussianPolicy(mu=approximator2, std=std2)

    # Agent1
    learning_rate1 = params.get('learning_rate_low')
    agent1 = alg_low(pi1, mdp_info_agentL, learning_rate1, featuresL)

    # Agent2
    learning_rate2 = params.get('learning_rate_low')
    agent2 = alg_low(pi2, mdp_info_agentL, learning_rate2, featuresL)

    #Termination Conds
    termination_condition1 = TerminationCondition(active_dir='+')
    termination_condition2 = TerminationCondition(active_dir='x')

    low_ep_per_fit = params.get('low_ep_per_fit')

    # Control Block +
    control_block_plus = ControlBlock(
        name='control block 1',
        agent=agent1,
        n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition1)

    # Control Block x
    control_block_cross = ControlBlock(
        name='control block 2',
        agent=agent2,
        n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition2)

    # Function Block 1: picks state for hi lev ctrl
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the env to low lev ctrl state
    function_block2 = fBlock(phi=rototranslate, name='f2 rotot')

    # Function Block 3: holds curr state as ref
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds hi lev rew
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds low lev rew
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: ext rew of hi lev ctrl
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: ext rew of low lev ctrl
    function_block7 = fBlock(phi=G_low, name='f7 G_lo')

    #Reward Accumulator H:
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    # Selector Block
    function_block8 = fBlock(phi=selector_function, name='f8 selector')

    #Mux_Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block_plus])
    mux_block.add_block_list([control_block_cross])

    #Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
        function_block1, function_block2, function_block3, function_block4,
        function_block5, function_block6, function_block7, function_block8,
        reward_acc_H, discretization_block
    ]

    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block_plus)
    reward_acc_H.add_alarm_connection(control_block_cross)

    control_blockH.add_input(discretization_block)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block_plus)
    control_blockH.add_alarm_connection(control_block_cross)

    mux_block.add_input(function_block8)
    mux_block.add_input(function_block2)

    control_block_plus.add_reward(function_block5)
    control_block_cross.add_reward(function_block5)

    function_block1.add_input(state_ph)

    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)

    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block_plus)
    function_block3.add_alarm_connection(control_block_cross)

    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)

    function_block5.add_input(function_block7)

    function_block6.add_input(reward_ph)

    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    function_block8.add_input(control_blockH)

    discretization_block.add_input(function_block1)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    # print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    dataset_eval += dataset_eval_run
    print('J at start : ' + str(np.mean(J)))
    for n in range(n_runs):
        print('ITERATION', n)

        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

        dataset_plus = control_block_plus.dataset.get()
        J_plus = compute_J(dataset_plus, mdp.info.gamma)
        dataset_cross = control_block_cross.dataset.get()
        J_cross = compute_J(dataset_cross, mdp.info.gamma)

        low_level_dataset_eval1.append(dataset_plus)
        low_level_dataset_eval2.append(dataset_cross)

        print('J ll PLUS at iteration  ' + str(n) + ': ' +
              str(np.mean(J_plus)))
        print('J ll CROSS at iteration ' + str(n) + ': ' +
              str(np.mean(J_cross)))
        if n == 4:
            control_blockH.callbacks = [epsilon_update]

    # Tile data
    hi_lev_params = agentH.Q.table
    max_q_val = np.zeros(n_tiles_high[0]**2)
    act_max_q_val = np.zeros(n_tiles_high[0]**2)
    for n in range(n_tiles_high[0]**2):
        max_q_val[n] = np.amax(hi_lev_params[n])
        act_max_q_val[n] = np.argmax(hi_lev_params[n])

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset1_file',
            low_level_dataset_eval1)
    np.save(subdir + str(i) + '/low_level_dataset2_file',
            low_level_dataset_eval2)
    np.save(subdir + str(i) + '/max_q_val_tiled_file', max_q_val)
    np.save(subdir + str(i) + '/act_max_q_val_tiled_file', act_max_q_val)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)

    return
Example #2
def build_computational_graph(mdp, agent_low, agent_m0, agent_m1, agent_m2,
                              agent_m3, agent_high, ep_per_fit_low,
                              ep_per_fit_mid):

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 0
    function_block0 = fBlock(name='f0 (state build for high level)',
                             phi=hi_lev_state)

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)',
                             phi=pos_ref_angle_difference)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    # External reward block
    reward_m0 = MidReward(gate_no=0)
    reward_m1 = MidReward(gate_no=1)
    reward_m2 = MidReward(gate_no=2)
    reward_m3 = MidReward(gate_no=3)
    reward_blockm0 = fBlock(name='rm1 (reward m1)', phi=reward_m0)
    reward_blockm1 = fBlock(name='rm2 (reward m2)', phi=reward_m1)
    reward_blockm2 = fBlock(name='rm3 (reward m3)', phi=reward_m2)
    reward_blockm3 = fBlock(name='rm4 (reward m4)', phi=reward_m3)

    # Control Block H
    control_block_h = ControlBlock(name='Control Block H',
                                   agent=agent_high,
                                   n_steps_per_fit=1)
    # Control Block M0
    termination_condition_m1 = TerminationCondition(gate_no=0)
    control_block_m0 = ControlBlock(
        name='Control Block M0',
        agent=agent_m0,
        n_eps_per_fit=ep_per_fit_mid,
        termination_condition=termination_condition_m1)
    # Control Block M1
    termination_condition_m2 = TerminationCondition(gate_no=1)
    control_block_m1 = ControlBlock(
        name='Control Block M1',
        agent=agent_m1,
        n_eps_per_fit=ep_per_fit_mid,
        termination_condition=termination_condition_m2)
    # Control Block M2
    termination_condition_m3 = TerminationCondition(gate_no=2)
    control_block_m2 = ControlBlock(
        name='Control Block M2',
        agent=agent_m2,
        n_eps_per_fit=ep_per_fit_mid,
        termination_condition=termination_condition_m3)
    # Control Block M3
    termination_condition_m4 = TerminationCondition(gate_no=3)
    control_block_m3 = ControlBlock(
        name='Control Block M3',
        agent=agent_m3,
        n_eps_per_fit=ep_per_fit_mid,
        termination_condition=termination_condition_m4)
    # Control Block L
    termination_condition_low = TerminationConditionLow(mdp.small)
    control_block_l = ControlBlock(
        name='Control Block L',
        agent=agent_low,
        n_eps_per_fit=ep_per_fit_low,
        termination_condition=termination_condition_low)
    # Selector Block
    mux_block = MuxBlock(name='Mux Block')
    mux_block.add_block_list([control_block_m0])
    mux_block.add_block_list([control_block_m1])
    mux_block.add_block_list([control_block_m2])
    mux_block.add_block_list([control_block_m3])

    # Reward Accumulators
    reward_acc = mean_reward_block(name='reward_acc_h')
    reward_acc_m0 = reward_accumulator_block(gamma=mdp.info.gamma,
                                             name='reward_acc_m0')
    reward_acc_m1 = reward_accumulator_block(gamma=mdp.info.gamma,
                                             name='reward_acc_m1')
    reward_acc_m2 = reward_accumulator_block(gamma=mdp.info.gamma,
                                             name='reward_acc_m2')
    reward_acc_m3 = reward_accumulator_block(gamma=mdp.info.gamma,
                                             name='reward_acc_m3')

    # Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_block_h, reward_acc,
        control_block_l, function_block0, function_block1, function_block2,
        reward_blockm0, reward_blockm1, reward_blockm2, reward_blockm3,
        reward_acc_m0, reward_acc_m1, reward_acc_m2, reward_acc_m3, mux_block
    ]

    state_ph.add_input(control_block_l)
    reward_ph.add_input(control_block_l)
    lastaction_ph.add_input(control_block_l)

    control_block_h.add_input(function_block0)
    control_block_h.add_reward(reward_acc)
    control_block_h.add_alarm_connection(control_block_m0)
    control_block_h.add_alarm_connection(control_block_m1)
    control_block_h.add_alarm_connection(control_block_m2)
    control_block_h.add_alarm_connection(control_block_m3)

    mux_block.add_input(control_block_h)
    mux_block.add_input(state_ph)

    control_block_m0.add_reward(reward_acc_m0)
    control_block_m0.add_alarm_connection(control_block_l)

    control_block_m1.add_reward(reward_acc_m1)
    control_block_m1.add_alarm_connection(control_block_l)

    control_block_m2.add_reward(reward_acc_m2)
    control_block_m2.add_alarm_connection(control_block_l)

    control_block_m3.add_reward(reward_acc_m3)
    control_block_m3.add_alarm_connection(control_block_l)

    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block_m0)
    reward_acc.add_alarm_connection(control_block_m1)
    reward_acc.add_alarm_connection(control_block_m2)
    reward_acc.add_alarm_connection(control_block_m3)

    reward_acc_m0.add_input(reward_blockm0)
    reward_acc_m0.add_alarm_connection(control_block_l)

    reward_acc_m1.add_input(reward_blockm1)
    reward_acc_m1.add_alarm_connection(control_block_l)

    reward_acc_m2.add_input(reward_blockm2)
    reward_acc_m2.add_alarm_connection(control_block_l)

    reward_acc_m3.add_input(reward_blockm3)
    reward_acc_m3.add_alarm_connection(control_block_l)

    reward_blockm3.add_input(state_ph)
    reward_blockm2.add_input(state_ph)
    reward_blockm1.add_input(state_ph)
    reward_blockm0.add_input(state_ph)

    function_block0.add_input(state_ph)

    function_block1.add_input(mux_block)
    function_block1.add_input(state_ph)

    function_block2.add_input(function_block1)

    control_block_l.add_input(function_block1)
    control_block_l.add_reward(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph, control_block_h
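
The builder above only wires the blocks together; a minimal training-loop sketch follows, assuming the HierarchicalCore, compute_J and numpy names already imported by the surrounding examples. The agents, the ep_per_fit values and the epoch/episode counts are illustrative placeholders, not values from the original experiment.

def run_graph_sketch(mdp, agent_low, agent_m0, agent_m1, agent_m2, agent_m3,
                     agent_high, n_epochs, n_episodes_per_epoch, ep_per_eval):
    # Wire the graph and wrap it in a HierarchicalCore, as the full
    # experiments in this file do.
    graph, control_block_h = build_computational_graph(
        mdp, agent_low, agent_m0, agent_m1, agent_m2, agent_m3, agent_high,
        ep_per_fit_low=10, ep_per_fit_mid=5)
    core = HierarchicalCore(graph)

    J = compute_J(core.evaluate(n_episodes=ep_per_eval), gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for n in range(n_epochs):
        core.learn(n_episodes=n_episodes_per_epoch, skip=True)
        J = compute_J(core.evaluate(n_episodes=ep_per_eval),
                      gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))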
Example #3
def server_experiment_small(alg_high, alg_low, params, subdir, i):

    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    #Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)',
                             phi=pos_ref_angle_difference)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    #Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([255, 255])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size, ),
                              output_shape=(2, ))
    approximator1.set_weights(np.array([500, 500]))

    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=sigma1)

    # Policy 2
    pi2 = DeterministicControlPolicy(weights=np.array([0]))
    mu2 = np.zeros(pi2.weights_size)
    sigma2 = 1e-3 * np.ones(pi2.weights_size)
    distribution2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 1
    learning_rate1 = params.get('learning_rate_high')
    lim = 1000
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, lim, (2, )),
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent1 = alg_high(policy=pi1,
                      mdp_info=mdp_info_agent1,
                      learning_rate=learning_rate1,
                      features=features)

    # Agent 2
    learning_rate2 = params.get('learning_rate_low')
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
        -np.pi, np.pi, (1, )),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent2 = alg_low(distribution=distribution2,
                     policy=pi2,
                     mdp_info=mdp_info_agent2,
                     learning_rate=learning_rate2)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1',
                                  agent=agent1,
                                  n_eps_per_fit=ep_per_run,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(distribution2)
    control_block2 = ControlBlock(name='Control Block 2',
                                  agent=agent2,
                                  n_eps_per_fit=10,
                                  callbacks=[parameter_callback2])

    #Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_block1, control_block2,
        function_block1, function_block2, reward_acc
    ]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)

    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block2)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=eval_run)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        low_level_dataset_eval += control_block2.dataset.get()

    # Save
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset_file',
            low_level_dataset_eval)
    np.save(subdir + str(i) + '/parameter_dataset1_file', parameter_dataset1)
    np.save(subdir + str(i) + '/parameter_dataset2_file', parameter_dataset2)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)
def build_computational_graph(mdp,
                              agent_low,
                              agent_high,
                              ep_per_fit_low,
                              low_level_callbacks=[]):

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    function_block1 = fBlock(name='pick position', phi=pick_position)

    hold_block = hold_state(name='holdstate')

    function_block2 = fBlock(name='compute setpoint', phi=compute_stepoint)

    function_block3 = fBlock(name='angle and distance', phi=polar_error)
    function_block4 = fBlock(name='reward low level', phi=reward_low_level)

    reward_acc = mean_reward_block(name='mean reward')

    control_block_h = ControlBlock(name='Control Block H',
                                   agent=agent_high,
                                   n_steps_per_fit=1)

    control_block_l = ControlBlock(name='Control Block L',
                                   agent=agent_low,
                                   n_eps_per_fit=ep_per_fit_low,
                                   callbacks=low_level_callbacks)

    blocks = [
        state_ph, reward_ph, lastaction_ph, control_block_h, control_block_l,
        function_block1, function_block2, function_block3, function_block4,
        reward_acc
    ]

    state_ph.add_input(control_block_l)
    reward_ph.add_input(control_block_l)
    lastaction_ph.add_input(control_block_l)

    function_block1.add_input(state_ph)

    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block_l)

    control_block_h.add_input(function_block1)
    control_block_h.add_reward(reward_acc)
    control_block_h.add_alarm_connection(control_block_l)

    hold_block.add_input(state_ph)
    hold_block.add_alarm_connection(control_block_l)

    function_block2.add_input(state_ph)
    function_block2.add_input(hold_block)
    function_block2.add_input(control_block_h)

    function_block3.add_input(state_ph)
    function_block3.add_input(function_block2)

    function_block4.add_input(function_block3)

    control_block_l.add_input(function_block3)
    control_block_l.add_reward(function_block4)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph
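
The low_level_callbacks argument above is meant for callbacks of the kind used elsewhere in these examples (CollectPolicyParameter, CollectDistributionParameter). Below is a hedged sketch of passing one in and saving its values after training; agent_low.distribution, the free names n_epochs, n_iterations, eval_run and subdir, and the ep_per_fit value are assumptions, not part of the original code.

ep_per_fit_low = 25  # illustrative value
parameter_callback = CollectDistributionParameter(agent_low.distribution)
graph = build_computational_graph(mdp, agent_low, agent_high, ep_per_fit_low,
                                  low_level_callbacks=[parameter_callback])
core = HierarchicalCore(graph)

for n in range(n_epochs):
    core.learn(n_episodes=n_iterations * ep_per_fit_low, skip=True)
    J = compute_J(core.evaluate(n_episodes=eval_run), gamma=mdp.info.gamma)
    print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))

# low-level distribution parameters collected at every fit
parameter_dataset = parameter_callback.get_values()
np.save(subdir + '/parameter_dataset_file', parameter_dataset)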
Example #5
def build_computational_graph(mdp, agent_low, agent_high, ep_per_fit_low,
                              ep_per_fit_high):
    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (pick distance to goal state var)',
                             phi=pick_first_state)

    # Function Block 2
    function_block2 = fBlock(name='f2 (build state)',
                             phi=angle_to_angle_diff_complete_state)

    # Function Block 3
    function_block3 = fBlock(name='f3 (reward low level)', phi=lqr_cost_segway)

    # Function Block 4
    function_block4 = addBlock(name='f4 (add block)')

    # Function Block 5
    function_block5 = fBlock(name='f5 (fall punish low level)',
                             phi=fall_reward)

    # Control Block 1
    parameter_callback1 = CollectDistributionParameter(agent_high.distribution)
    control_block_h = ControlBlock(name='Control Block High',
                                   agent=agent_high,
                                   n_eps_per_fit=ep_per_fit_high,
                                   callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(agent_low.distribution)
    control_block_l = ControlBlock(name='Control Block Low',
                                   agent=agent_low,
                                   n_eps_per_fit=ep_per_fit_low,
                                   callbacks=[parameter_callback2])
    control_block_h.set_mask()

    # Graph
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_block_h, control_block_l,
        function_block1, function_block2, function_block3, function_block4,
        function_block5
    ]

    state_ph.add_input(control_block_l)
    reward_ph.add_input(control_block_l)
    lastaction_ph.add_input(control_block_l)
    control_block_h.add_input(function_block1)
    control_block_h.add_reward(reward_ph)
    control_block_l.add_input(function_block2)
    control_block_l.add_reward(function_block4)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_block_h)

    function_block2.add_input(state_ph)
    function_block3.add_input(function_block2)
    function_block5.add_input(state_ph)
    function_block4.add_input(function_block3)
    function_block4.add_input(function_block5)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph, control_block_h
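
Example #6 below drives a graph like this by keeping the high-level block masked for the first epochs and then releasing it; here is a short sketch of that pattern applied to the builder above. The episode counts are illustrative and the free names (mdp, agent_low, agent_high, n_epochs, n_iterations, eval_run) are assumed to be defined as in the other experiments.

ep_per_fit_low = 25   # illustrative
ep_per_fit_high = 10  # illustrative
graph, control_block_h = build_computational_graph(
    mdp, agent_low, agent_high, ep_per_fit_low, ep_per_fit_high)
core = HierarchicalCore(graph)

for n in range(n_epochs):
    if n == 2:
        # the high-level block starts masked (set_mask() in the builder);
        # release it once the low-level policy has had time to improve
        control_block_h.unset_mask()
    core.learn(n_episodes=n_iterations * ep_per_fit_low, skip=True)
    J = compute_J(core.evaluate(n_episodes=eval_run), gamma=mdp.info.gamma)
    print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))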
Example #6
def segway_experiment(alg_high, alg_low, params_high, params_low):

    np.random.seed()

    # Model Block
    mdp = SegwayLinearMotion(goal_distance=1.0)

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    #Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (pick distance to goal state var)',
                             phi=pick_first_state)

    # Function Block 2
    function_block2 = fBlock(name='f2 (build state)',
                             phi=angle_to_angle_diff_complete_state)

    # Function Block 3
    function_block3 = fBlock(name='f3 (reward low level)',
                             phi=lqr_cost_segway)

    # Function Block 4
    function_block4 = addBlock(name='f4 (add block)')

    # Function Block 5
    function_block5 = fBlock(name='f5 (fall punish low level)', phi=fall_reward)


    # Approximator H
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(1,),
                              output_shape=(1,))

    # Policy H
    n_weights = approximator1.weights_size
    mu1 = np.zeros(n_weights)
    sigma1 = 2.0e-2*np.ones(n_weights)
    pi1 = DeterministicPolicy(approximator1)
    dist1 = GaussianDiagonalDistribution(mu1, sigma1)


    # Agent H
    lim = np.pi/2
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(-lim, lim, (1,)),
                              gamma=mdp.info.gamma,
                              horizon=mdp.info.horizon)
    agent_high = alg_high(dist1, pi1, mdp_info_agent1, **params_high)

    # Policy L
    approximator2 = Regressor(LinearApproximator,
                              input_shape=(3,),
                              output_shape=(1,))
    n_weights2 = approximator2.weights_size
    mu2 = np.zeros(n_weights2)
    sigma2 = 2.0*np.ones(n_weights2)
    pi2 = DeterministicControlPolicy(approximator2)
    dist2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent Low
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
        low=mdp.info.observation_space.low[1:], #FIXME FALSE
        high=mdp.info.observation_space.high[1:], #FIXME FALSE
        shape=(3,)),
        action_space=mdp.info.action_space,
        gamma=mdp.info.gamma, horizon=mdp.info.horizon)

    agent_low = alg_low(dist2, pi2, mdp_info_agent2, **params_low)

    # Control Block 1
    parameter_callback1 = CollectDistributionParameter(dist1)
    control_block1 = ControlBlock(name='Control Block High', agent=agent_high,
                                  n_eps_per_fit=n_ep_per_fit*2,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(dist2)
    control_block2 = ControlBlock(name='Control Block Low', agent=agent_low,
                                  n_eps_per_fit=n_ep_per_fit,
                                  callbacks=[parameter_callback2])
    control_block1.set_mask()

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block1,
              control_block2, function_block1, function_block2,
              function_block3, function_block4, function_block5]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(function_block1)
    control_block1.add_reward(reward_ph)
    control_block2.add_input(function_block2)
    control_block2.add_reward(function_block4)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_block1)

    function_block2.add_input(state_ph)
    function_block3.add_input(function_block2)
    function_block5.add_input(state_ph)
    function_block4.add_input(function_block3)
    function_block4.add_input(function_block5)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    dataset_eval_run = core.evaluate(n_episodes=eval_run, render=False)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    mask_done = False
    for n in range(n_epochs):
        print('ITERATION', n)

        if n == 2:
            control_block1.unset_mask()
        core.learn(n_episodes=n_iterations*n_ep_per_fit, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run, render=False)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        print('dist H:', dist1.get_parameters())
        print('dist L mu:', dist2.get_parameters()[:3])
        print('dist L sigma:', dist2.get_parameters()[3:])
Example #7
def experiment():

    small = True

    print('ENV IS SMALL? ', small)
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=small, hard=True, n_steps_action=3)

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    #Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    #FeaturesH
    lim = 150 if small else 1000

    tilingsH = Tiles.generate(n_tilings=1,
                              n_tiles=[5, 5],
                              low=[0, 0],
                              high=[lim, lim])
    featuresH = Features(tilings=tilingsH)

    # PolicyH
    epsilon = LinearDecayParameter(value=0.1, min_value=0.0, n=10000)
    piH = EpsGreedy(epsilon=epsilon)

    # AgentH
    learning_rate = Parameter(value=1)

    mdp_info_agentH = MDPInfo(observation_space=spaces.Box(
        low=np.array([0, 0]), high=np.array([lim, lim]), shape=(2, )),
                              action_space=spaces.Discrete(8),
                              gamma=1,
                              horizon=10000)
    approximator_paramsH = dict(input_shape=(featuresH.size, ),
                                output_shape=mdp_info_agentH.action_space.size,
                                n_actions=mdp_info_agentH.action_space.n)

    agentH = TrueOnlineSARSALambda(policy=piH,
                                   mdp_info=mdp_info_agentH,
                                   learning_rate=learning_rate,
                                   lambda_coeff=0.9,
                                   approximator_params=approximator_paramsH,
                                   features=featuresH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H',
                                  agent=agentH,
                                  n_steps_per_fit=1)

    #FeaturesL
    featuresL = Features(basis_list=[PolynomialBasis()])

    # Policy1
    input_shape = (featuresL.size, )

    approximator_params = dict(input_dim=input_shape[0])
    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             **approximator_params)
    sigma = np.array([[1.3e-2]])
    pi1 = GaussianPolicy(mu=approximator, sigma=sigma)

    # Policy2
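    # NOTE: pi2 wraps the same approximator (and sigma) as pi1, so the two
    # low-level policies share a single set of weights.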
    pi2 = GaussianPolicy(mu=approximator, sigma=sigma)

    # Agent1
    learning_rate1 = AdaptiveParameter(value=1e-5)
    agent1 = GPOMDP(pi1, mdp.info, learning_rate1, featuresL)

    # Agent2
    learning_rate2 = AdaptiveParameter(value=1e-5)
    agent2 = GPOMDP(pi2, mdp.info, learning_rate2, featuresL)

    #Termination Conds
    termination_condition1 = TerminationCondition(active_dir=1, small=small)
    termination_condition2 = TerminationCondition(active_dir=5, small=small)

    # Control Block +
    control_block1 = ControlBlock(name='control block 1',
                                  agent=agent1,
                                  n_eps_per_fit=50,
                                  termination_condition=termination_condition1)

    # Control Block x
    control_block2 = ControlBlock(name='control block 2',
                                  agent=agent2,
                                  n_eps_per_fit=50,
                                  termination_condition=termination_condition2)

    # Function Block 1: picks state for hi lev ctrl
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the env to low lev ctrl state
    function_block2 = fBlock(phi=rototranslate(small=small), name='f2 rotot')

    # Function Block 3: holds curr state as ref
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds hi lev rew
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds low lev rew
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: ext rew of hi lev ctrl
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: ext rew of low lev ctrl
    function_block7 = fBlock(phi=G_low(small=small), name='f7 G_lo')

    #Reward Accumulator H:
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    #Mux_Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block1])
    mux_block.add_block_list([control_block2])

    #Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
        function_block1, function_block2, function_block3, function_block4,
        function_block5, function_block6, function_block7, reward_acc_H
    ]

    #state_ph.add_input(mux_block)
    #reward_ph.add_input(mux_block)
    #lastaction_ph.add_input(mux_block)
    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block1)
    reward_acc_H.add_alarm_connection(control_block2)
    control_blockH.add_input(function_block1)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block1)
    control_blockH.add_alarm_connection(control_block2)
    mux_block.add_input(control_blockH)
    mux_block.add_input(function_block2)
    control_block1.add_reward(function_block5)
    control_block2.add_reward(function_block5)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)
    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block1)
    function_block3.add_alarm_connection(control_block2)
    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)
    function_block5.add_input(reward_ph)
    function_block5.add_input(function_block7)
    function_block6.add_input(reward_ph)
    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    dataset_eval_visual = list()
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()

    n_runs = 5
    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=1000, skip=True)
        dataset_eval = core.evaluate(n_episodes=10)
        last_ep_dataset = pick_last_ep(dataset_eval)
        dataset_eval_visual += last_ep_dataset
        low_level_dataset_eval1 += control_block1.dataset.get()
        low_level_dataset_eval2 += control_block2.dataset.get()

    # Visualize
    hi_lev_params = agentH.Q.get_weights()
    hi_lev_params = np.reshape(hi_lev_params, (8, 25))
    max_q_val = np.zeros(shape=(25, ))
    act_max_q_val = np.zeros(shape=(25, ))
    for i in range(25):
        max_q_val[i] = np.amax(hi_lev_params[:, i])
        act_max_q_val[i] = np.argmax(hi_lev_params[:, i])
    max_q_val_tiled = np.reshape(max_q_val, (5, 5))
    act_max_q_val_tiled = np.reshape(act_max_q_val, (5, 5))
    #low_level_dataset1 = dataset_callback1.get()
    #low_level_dataset2 = dataset_callback2.get()

    subdir = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '/'
    mk_dir_recursive('./' + subdir)

    np.save(subdir + '/low_level_dataset1_file', low_level_dataset_eval1)
    np.save(subdir + '/low_level_dataset2_file', low_level_dataset_eval2)
    np.save(subdir + '/max_q_val_tiled_file', max_q_val_tiled)
    np.save(subdir + '/act_max_q_val_tiled_file', act_max_q_val_tiled)
    np.save(subdir + '/dataset_eval_file', dataset_eval_visual)

    return
Example #8
def server_experiment_small(alg_high, alg_low, params, subdir, i):

    np.random.seed()

    # Model Block
    mdp = SegwayLinearMotion(goal_distance=1.0)

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    #Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (pick distance to goal state var)',
                             phi=pick_first_state)

    # Function Block 2
    function_block2 = fBlock(name='f2 (build state)',
                             phi=angle_to_angle_diff_complete_state)

    # Function Block 3
    function_block3 = fBlock(name='f3 (reward low level)', phi=lqr_cost_segway)

    # Function Block 4
    function_block4 = addBlock(name='f4 (add block)')

    # Integrator Block
    error_acc = ErrorAccumulatorBlock(name='error acc')

    # Features
    features1 = Features(basis_list=[PolynomialBasis()])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features1.size, ),
                              output_shape=(1, ))

    # Policy 1
    n_weights = approximator1.weights_size
    mu1 = np.zeros(n_weights)
    sigma1 = 2e-0 * np.ones(n_weights)
    pi1 = DeterministicPolicy(approximator1)
    dist1 = GaussianDiagonalDistribution(mu1, sigma1)

    # Agent 1
    eps1 = params.get('eps')
    lim = 2 * np.pi
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, lim, (1, )),
                              gamma=mdp.info.gamma,
                              horizon=20)

    agent1 = alg_low(distribution=dist1,
                     policy=pi1,
                     features=features1,
                     mdp_info=mdp_info_agent1,
                     eps=eps1)

    # Policy 2
    basis = PolynomialBasis.generate(1, 3)
    features2 = Features(basis_list=basis)
    approximator2 = Regressor(LinearApproximator,
                              input_shape=(features2.size, ),
                              output_shape=(1, ))
    n_weights2 = approximator2.weights_size
    mu2 = np.zeros(n_weights2)
    sigma2 = 2e-0 * np.ones(n_weights2)
    pi2 = DeterministicPolicy(approximator2)
    dist2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 2
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
        low=np.array([-np.pi, -np.pi, -np.pi]),
        high=np.array([np.pi, np.pi, np.pi]),
        shape=(3, )),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma,
                              horizon=30)

    agent2 = alg_low(distribution=dist2,
                     policy=pi2,
                     features=features2,
                     mdp_info=mdp_info_agent2,
                     eps=eps1)

    # Control Block 1
    parameter_callback1 = CollectDistributionParameter(dist1)
    control_block1 = ControlBlock(name='Control Block 1',
                                  agent=agent1,
                                  n_eps_per_fit=ep_per_run,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(dist2)
    control_block2 = ControlBlock(name='Control Block 2',
                                  agent=agent2,
                                  n_eps_per_fit=20,
                                  callbacks=[parameter_callback2])

    #Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_block1, control_block2,
        function_block1, function_block2, function_block3, function_block4,
        error_acc, reward_acc
    ]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_input(function_block1)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    control_block2.add_input(function_block2)
    control_block2.add_reward(function_block4)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_block1)
    function_block2.add_input(state_ph)
    function_block3.add_input(function_block2)
    error_acc.add_input(function_block3)
    error_acc.add_alarm_connection(control_block2)
    function_block4.add_input(function_block3)
    function_block4.add_input(error_acc)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=eval_run, render=True)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run, render=True)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
def build_computational_graph(mdp, agent_low, agent_high, ep_per_fit_low,
                              ep_per_fit_high):
    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)',
                             phi=pos_ref_angle_difference)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    # Control Block H
    control_block_h = ControlBlock(name='Control Block H',
                                   agent=agent_high,
                                   n_eps_per_fit=ep_per_fit_high)

    # Termination condition
    termination_condition = TerminationCondition(mdp._small)
    # Control Block 2
    control_block_l = ControlBlock(name='Control Block L',
                                   agent=agent_low,
                                   n_eps_per_fit=ep_per_fit_low,
                                   termination_condition=termination_condition)

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp.info.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_block_h, control_block_l,
        function_block1, function_block2, reward_acc
    ]

    state_ph.add_input(control_block_l)
    reward_ph.add_input(control_block_l)
    lastaction_ph.add_input(control_block_l)
    control_block_h.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block_l)
    control_block_h.add_reward(reward_acc)
    control_block_h.add_alarm_connection(control_block_l)
    function_block1.add_input(control_block_h)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)

    #control_block_l.add_input(control_block_h)
    #control_block_l.add_input(state_ph)
    control_block_l.add_input(function_block1)
    control_block_l.add_reward(function_block2)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph
def build_computational_graph(mdp, agent_low, agent_high, ep_per_fit_low,
                              ep_per_fit_high):

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # FeaturesH
    low_hi = 0
    lim_hi = mdp.field_size + 1e-8
    n_tiles_high = [20, 20]

    # Discretization Block
    discretization_block = DiscretizationBlock(low=low_hi,
                                               high=lim_hi,
                                               n_tiles=n_tiles_high)

    # Function Block 0
    function_block0 = fBlock(name='f0 (state build for high level)',
                             phi=hi_lev_state)

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)',
                             phi=angle_ref_angle_difference)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    # Function Block 3
    function_block3 = fBlock(name='f3 (compute pos ref)', phi=compute_pos_ref)

    #Function Block 4
    function_block4 = fBlock(name='f4 (compute angle ref)',
                             phi=generate_angle_ref)

    # Control Block H
    control_block_h = ControlBlock(name='Control Block H',
                                   agent=agent_high,
                                   n_steps_per_fit=1)
    # Control Block L
    term_cond_low = TerminationConditionLow(small=mdp.small)
    control_block_l = ControlBlock(name='Control Block L',
                                   agent=agent_low,
                                   n_eps_per_fit=ep_per_fit_low,
                                   termination_condition=term_cond_low)

    # Reward Accumulators
    reward_acc = reward_accumulator_block(gamma=mdp.info.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_block_h, reward_acc,
        control_block_l, discretization_block, function_block0,
        function_block1, function_block2, function_block3, function_block4
    ]

    state_ph.add_input(control_block_l)
    reward_ph.add_input(control_block_l)
    lastaction_ph.add_input(control_block_l)

    discretization_block.add_input(function_block0)

    function_block4.add_input(control_block_h)

    control_block_h.add_input(discretization_block)
    control_block_h.add_reward(reward_acc)
    control_block_h.add_alarm_connection(control_block_l)

    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block_l)

    function_block0.add_input(state_ph)

    function_block1.add_input(function_block4)
    function_block1.add_input(function_block3)
    function_block1.add_input(state_ph)

    function_block2.add_input(function_block1)

    function_block3.add_input(function_block4)
    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block_l)

    control_block_l.add_input(function_block1)
    control_block_l.add_reward(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph, control_block_h
def build_ghavamzadeh_graph(mdp, agent_plus, agent_cross, agent_high,
                            ep_per_fit_low):
    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # FeaturesH
    low_hi = 0
    lim_hi = 1000 + 1e-8
    n_tiles_high = [20, 20]

    # Discretization Block
    discretization_block = DiscretizationBlock(low=low_hi,
                                               high=lim_hi,
                                               n_tiles=n_tiles_high)

    # Control Block H
    control_blockH = ControlBlock(name='control block H',
                                  agent=agent_high,
                                  n_steps_per_fit=1)

    # Termination Conds
    termination_condition1 = TerminationCondition(active_dir='+')
    termination_condition2 = TerminationCondition(active_dir='x')

    # Control Block +
    control_block_plus = ControlBlock(
        name='control block 1',
        agent=agent_plus,
        n_eps_per_fit=ep_per_fit_low,
        termination_condition=termination_condition1)

    # Control Block x
    control_block_cross = ControlBlock(
        name='control block 2',
        agent=agent_cross,
        n_eps_per_fit=ep_per_fit_low,
        termination_condition=termination_condition2)

    # Function Block 1: picks state for hi lev ctrl
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the env to low lev ctrl state
    function_block2 = fBlock(phi=rototranslate, name='f2 rotot')

    # Function Block 3: holds curr state as ref
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds hi lev rew
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds low lev rew
    function_block5 = addBlock(name='f5 add')

    # Function Block 6:ext rew of hi lev ctrl
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: ext rew of low lev ctrl
    function_block7 = fBlock(phi=G_low, name='f7 G_lo')

    # Reward Accumulator H:
    reward_acc_H = reward_accumulator_block(gamma=1.0, name='reward_acc_H')

    # Selector Block
    function_block8 = fBlock(phi=selector_function, name='f8 selector')

    # Mux_Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block_plus])
    mux_block.add_block_list([control_block_cross])

    # Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
        function_block1, function_block2, function_block3, function_block4,
        function_block5, function_block6, function_block7, function_block8,
        reward_acc_H, discretization_block
    ]

    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block_plus)
    reward_acc_H.add_alarm_connection(control_block_cross)

    control_blockH.add_input(discretization_block)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block_plus)
    control_blockH.add_alarm_connection(control_block_cross)

    mux_block.add_input(function_block8)
    mux_block.add_input(function_block2)

    control_block_plus.add_reward(function_block5)
    control_block_cross.add_reward(function_block5)

    function_block1.add_input(state_ph)

    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)

    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block_plus)
    function_block3.add_alarm_connection(control_block_cross)

    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)

    function_block5.add_input(function_block7)

    function_block6.add_input(reward_ph)

    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    function_block8.add_input(control_blockH)

    discretization_block.add_input(function_block1)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph, control_blockH
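
A sketch of training the graph above and inspecting the learned high-level behaviour, mirroring the post-processing in Example #1. It assumes agent_high is a tabular TD agent exposing Q.table (as in Example #1), and the agents and episode count are illustrative placeholders.

graph, control_blockH = build_ghavamzadeh_graph(mdp, agent_plus, agent_cross,
                                                agent_high, ep_per_fit_low=10)
core = HierarchicalCore(graph)
core.learn(n_episodes=1000, skip=True)

# greedy value and action for each of the 20x20 high-level tiles
q_table = agent_high.Q.table
n_states = 20 * 20
max_q_val = np.zeros(n_states)
act_max_q_val = np.zeros(n_states)
for s in range(n_states):
    max_q_val[s] = np.amax(q_table[s])
    act_max_q_val[s] = np.argmax(q_table[s])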
Example #12
def build_computational_graph_discretized(mdp, agent, n_actions):

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    control_block = ControlBlock(name='Control Block H',
                                 agent=agent,
                                 n_steps_per_fit=1)

    function_block = fBlock(name='Action converter',
                            phi=ActionConverter(n_actions, mdp.info))

    blocks = [
        state_ph, reward_ph, lastaction_ph, control_block, function_block
    ]

    state_ph.add_input(function_block)
    reward_ph.add_input(function_block)
    lastaction_ph.add_input(function_block)

    control_block.add_input(state_ph)
    control_block.add_reward(reward_ph)

    function_block.add_input(control_block)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph
def build_computational_graph_baseline(mdp,
                                       agent,
                                       ep_per_fit,
                                       low_level_callbacks=[]):

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    function_block1 = fBlock(name='compute setpoint', phi=compute_stepoint)

    function_block2 = fBlock(name='angle and distance', phi=polar_error)
    function_block3 = fBlock(name='reward low level', phi=reward_low_level)

    control_block = ControlBlock(name='Control Block',
                                 agent=agent,
                                 n_eps_per_fit=ep_per_fit,
                                 callbacks=low_level_callbacks)

    blocks = [
        state_ph, reward_ph, lastaction_ph, function_block1, function_block2,
        function_block3, control_block
    ]

    state_ph.add_input(control_block)
    reward_ph.add_input(control_block)
    lastaction_ph.add_input(control_block)

    function_block1.add_input(state_ph)

    function_block2.add_input(state_ph)
    function_block2.add_input(function_block1)

    function_block3.add_input(function_block2)

    control_block.add_input(function_block2)
    control_block.add_reward(function_block3)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)

    return computational_graph