Example #1
    def __init__(self, A, B, Q, R, random_init=False, gamma=0.9, horizon=50):
        """
        Constructor.

            Args:
                A (np.ndarray): the state dynamics matrix;
                B (np.ndarray): the action dynamics matrix;
                Q (np.ndarray): reward weight matrix for state;
                R (np.ndarray): reward weight matrix for action;
                random_init (bool, False): start from a random state;
                gamma (float, 0.9): discount factor;
                horizon (int, 50): horizon of the mdp.

        """
        self.A = A
        self.B = B
        self.Q = Q
        self.R = R
        self.random_init = random_init

        # MDP properties
        high_x = np.inf * np.ones(A.shape[0])
        low_x = -high_x

        high_u = np.inf * np.ones(B.shape[0])
        low_u = -high_u

        observation_space = spaces.Box(low=low_x, high=high_x)
        action_space = spaces.Box(low=low_u, high=high_u)
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super(LQR, self).__init__(mdp_info)
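
A, B, Q and R define the linear dynamics and a quadratic reward on state and action. As a self-contained illustration of what these matrices encode (illustrative values, discount ignored, not code from the library), the optimal linear controller for such a system can be obtained by iterating the discrete-time Riccati equation:

import numpy as np

# Illustrative 2-D system (not taken from the example above).
A = np.eye(2)            # state dynamics
B = np.eye(2)            # action dynamics
Q = np.eye(2)            # state cost weights
R = 0.1 * np.eye(2)      # action cost weights

# Iterate the discrete-time Riccati equation for the undiscounted cost
# sum_t x_t^T Q x_t + u_t^T R u_t subject to x_{t+1} = A x_t + B u_t.
P = Q.copy()
for _ in range(1000):
    BtP = B.T @ P
    K = np.linalg.solve(R + BtP @ B, BtP @ A)   # optimal feedback gain
    P = Q + A.T @ P @ (A - B @ K)

print("optimal linear policy: u = -K x, K =\n", K)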
Example #2
    def __init__(self, small=True):
        self.__name__ = 'ShipSteering'

        # MDP parameters
        self.field_size = 150 if small else 1000
        low = np.array([0, 0, -np.pi, -np.pi / 12.])
        high = np.array([self.field_size, self.field_size, np.pi, np.pi / 12.])
        self.omega_max = np.array([np.pi / 12.])
        self._v = 3.
        self._T = 5.
        self._dt = .2
        self._gate_s = np.empty(2)
        self._gate_e = np.empty(2)
        self._gate_s[0] = 100 if small else 900
        self._gate_s[1] = 120 if small else 920
        self._gate_e[0] = 120 if small else 920
        self._gate_e[1] = 100 if small else 900

        # MDP properties
        observation_space = spaces.Box(low=low, high=high)
        action_space = spaces.Box(low=-self.omega_max, high=self.omega_max)
        horizon = 5000
        gamma = .99
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super(ShipSteering, self).__init__(mdp_info)
Example #3
    def __init__(self, small=True, hard=False):
        """
        Constructor.

        Args:
             small (bool, True): whether to use a small state space or not.
             hard (bool, False): whether to use -100 as reward for going
                                 outside or -10000. With -100 reward the
                                 environment is considerably harder.

        """
        self.__name__ = 'ShipSteeringStraight'

        # MDP parameters
        self.field_size = 150
        low = np.array([0, 0, -np.pi, -np.pi / 12.])
        high = np.array([self.field_size, self.field_size, np.pi, np.pi / 12.])
        self.omega_max = np.array([np.pi / 12.])
        self._v = 3.
        self._T = 5.
        self._dt = .2
        self.goal_pos = np.array([140, 75])
        self._out_reward = -100
        self._success_reward = 100

        # MDP properties
        observation_space = spaces.Box(low=low, high=high)
        action_space = spaces.Box(low=-self.omega_max, high=self.omega_max)
        horizon = 5000
        gamma = .99
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super(ShipSteeringStraight, self).__init__(mdp_info)
Example #4
    def __init__(self, n_steps_action=3, viz_speed=100, small=False):

        self.__name__ = 'ShipSteeringMultiGate'
        self.n_steps_action = n_steps_action
        self.viz_speed = viz_speed

        # MDP parameters
        self.no_of_gates = 4

        self.small = small

        self.field_size = 500 if small else 1000
        low = np.array([0, 0, -np.pi, -np.pi / 12., 0])
        high = np.array([
            self.field_size, self.field_size, np.pi, np.pi / 12.,
            self.no_of_gates
        ])
        self.omega_max = np.array([np.pi / 12.])
        self._v = 3.
        self._T = 5.
        self._dt = .2

        gate_1s = np.array([75, 175]) if small else np.array([150, 350])
        gate_1e = np.array([125, 175]) if small else np.array([250, 350])

        gate_1 = np.array([gate_1s, gate_1e])

        gate_2s = np.array([150, 300]) if small else np.array([300, 600])
        gate_2e = np.array([200, 300]) if small else np.array([400, 600])

        gate_2 = np.array([gate_2s, gate_2e])

        gate_3s = np.array([250, 350]) if small else np.array([500, 700])
        gate_3e = np.array([300, 350]) if small else np.array([600, 700])

        gate_3 = np.array([gate_3s, gate_3e])

        gate_4s = np.array([150, 425]) if small else np.array([300, 850])
        gate_4e = np.array([200, 425]) if small else np.array([400, 850])

        gate_4 = np.array([gate_4s, gate_4e])

        self._gate_list = gate_1, gate_2, gate_3, gate_4

        # MDP properties
        observation_space = spaces.Box(low=low, high=high)
        action_space = spaces.Box(low=-self.omega_max, high=self.omega_max)
        horizon = 5000
        gamma = .99
        self._out_reward = -10000
        self.correct_order = False

        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(self.field_size,
                              self.field_size,
                              background=(66, 131, 237))

        super(ShipSteeringMultiGate, self).__init__(mdp_info)
Example #5
    def __init__(self, **kwargs):

        # Define environment properties
        high_x = np.array([5.0, 5.0, np.pi])
        low_x = -high_x

        high_u = np.array([1.0, 3.0])
        low_u = -high_u

        observation_space = spaces.Box(low=low_x, high=high_x)
        action_space = spaces.Box(low=low_u, high=high_u)

        gamma = 0.9
        horizon = 400

        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        hz = 10.0

        super(TurtlebotGazebo, self).__init__('turtlebot_gazebo', mdp_info, hz, **kwargs)

        # subscribe to /cmd_vel topic to publish the setpoint
        self._pub = rospy.Publisher('/cmd_vel', Twist, queue_size=1)

        # subscribe to /gazebo/model_states to get the position of the turtlebot
        model_state_service_name = '/gazebo/get_model_state'
        rospy.wait_for_service(model_state_service_name)
        self._model_state_service = rospy.ServiceProxy(model_state_service_name, GetModelState)
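
The constructor only creates the /cmd_vel publisher and the model-state service proxy. A hypothetical helper (assumed name, not part of the example) showing how an action from the two-dimensional Box action space above might be packed into the Twist message that the publisher sends:

from geometry_msgs.msg import Twist

# Hypothetical mapping from an action [v, omega] to a /cmd_vel message;
# the first action component is treated as forward velocity and the
# second as yaw rate, matching the [1.0, 3.0] bounds above.
def action_to_twist(action):
    msg = Twist()
    msg.linear.x = float(action[0])    # forward velocity (m/s)
    msg.angular.z = float(action[1])   # yaw rate (rad/s)
    return msg

# e.g., inside a step method: self._pub.publish(action_to_twist(action))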
Example #6
    def __init__(self,
                 A,
                 B,
                 Q,
                 R,
                 max_pos=np.inf,
                 max_action=np.inf,
                 random_init=False,
                 episodic=False,
                 gamma=0.9,
                 horizon=50):
        """
        Constructor.

            Args:
                A (np.ndarray): the state dynamics matrix;
                B (np.ndarray): the action dynamics matrix;
                Q (np.ndarray): reward weight matrix for state;
                R (np.ndarray): reward weight matrix for action;
                max_pos (float, np.inf): maximum value of the state;
                max_action (float, np.inf): maximum value of the action;
                random_init (bool, False): start from a random state;
                episodic (bool, False): end the episode when the state goes over
                the threshold;
                gamma (float, 0.9): discount factor;
                horizon (int, 50): horizon of the mdp.

        """
        self.A = A
        self.B = B
        self.Q = Q
        self.R = R
        self._max_pos = max_pos
        self._max_action = max_action
        self._episodic = episodic
        self.random_init = random_init

        # MDP properties
        high_x = self._max_pos * np.ones(A.shape[0])
        low_x = -high_x

        high_u = self._max_action * np.ones(B.shape[0])
        low_u = -high_u

        observation_space = spaces.Box(low=low_x, high=high_x)
        action_space = spaces.Box(low=low_u, high=high_u)
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super().__init__(mdp_info)
Example #7
    def __init__(self, grid_map_file, height_window=84, width_window=84):
        self.__name__ = 'GridWorldPixelGenerator'

        self.window_size = (width_window, height_window)

        self._symbols = {
            '.': 0.,
            'S': 63.75,
            '*': 127.5,
            '#': 191.25,
            'G': 255.
        }

        self._grid, start, goal = self._generate(grid_map_file)
        self._initial_grid = deepcopy(self._grid)
        height = self._grid.shape[0]
        width = self._grid.shape[1]

        assert height_window % height == 0 and width_window % width == 0

        # MDP properties
        observation_space = spaces.Box(low=0.,
                                       high=255.,
                                       shape=(self.window_size[1],
                                              self.window_size[0]))
        action_space = spaces.Discrete(5)
        horizon = 100
        gamma = .9
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super(GridWorldPixelGenerator, self).__init__(mdp_info, height, width,
                                                      start, goal)
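
The assertion requires the pixel window to be an integer multiple of the symbol grid, so the observation can be produced by plain nearest-neighbour upscaling. A small numpy sketch of that idea (illustrative, not the class's actual rendering code):

import numpy as np

# Toy 3x3 symbol grid using the grey levels from the example above.
grid = np.array([[  0.  ,  63.75, 127.5 ],
                 [191.25, 255.  ,   0.  ],
                 [  0.  ,   0.  ,   0.  ]])

height_window, width_window = 84, 84
scale_h = height_window // grid.shape[0]   # integer thanks to the assertion
scale_w = width_window // grid.shape[1]

# Each grid cell becomes a scale_h x scale_w block of identical pixels.
pixels = np.kron(grid, np.ones((scale_h, scale_w)))
print(pixels.shape)   # (84, 84)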
Example #8
def build_low_level_ghavamzadeh(alg, params, mdp):
    # FeaturesL
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 10]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 3

    tilingsL = Tiles.generate(n_tilings=n_tilings,
                              n_tiles=n_tiles,
                              low=low,
                              high=high)

    featuresL = Features(tilings=tilingsL)

    mdp_info_agentL = MDPInfo(observation_space=spaces.Box(
        low=np.array([0, 0]), high=np.array([150, 150]), shape=(2, )),
                              action_space=mdp.info.action_space,
                              gamma=0.99,
                              horizon=10000)

    input_shape = (featuresL.size, )
    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    std = np.array([3e-2])
    pi = DiagonalGaussianPolicy(mu=approximator, std=std)

    agent = alg(pi, mdp_info_agentL, features=featuresL, **params)

    return agent
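
Tiles.generate builds n_tilings offset tilings over the box [low, high], and Features turns the active tiles into a sparse feature vector. A minimal sketch of a single, un-offset tiling (an illustration of the idea, not the library's Tiles implementation):

import numpy as np

# One tiling over [low, high): every dimension is split into n_tiles bins
# and the active tile is the flattened index of the joint bin.
low = np.array([0., 0., -np.pi])
high = np.array([150., 150., np.pi])
n_tiles = np.array([5, 5, 10])

def tile_index(state):
    ratio = (state - low) / (high - low)
    coords = np.minimum((ratio * n_tiles).astype(int), n_tiles - 1)
    return np.ravel_multi_index(tuple(coords), tuple(n_tiles))

print(tile_index(np.array([75., 30., 0.5])))   # single active tile index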
Example #9
def build_discretized_agent(alg, params, n, optim, loss, mdp, eps, n_features,
                            use_cuda):
    high = mdp.info.observation_space.high
    low = mdp.info.observation_space.low

    observation_space = spaces.Box(low=low, high=high)
    action_space = spaces.Discrete(n)

    mdp_info = MDPInfo(observation_space=observation_space,
                       action_space=action_space,
                       gamma=mdp.info.gamma,
                       horizon=mdp.info.horizon)

    pi = Boltzmann(eps)

    approximator_params = dict(network=Network,
                               optimizer=optim,
                               loss=loss,
                               n_features=n_features,
                               input_shape=mdp_info.observation_space.shape,
                               output_shape=mdp_info.action_space.size,
                               n_actions=mdp_info.action_space.n,
                               use_cuda=use_cuda)

    agent = alg(PyTorchApproximator,
                pi,
                mdp_info,
                approximator_params=approximator_params,
                **params)

    return agent
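
build_discretized_agent keeps the environment's Box observation space but gives the agent a Discrete(n) action space; how the n indices are mapped back to continuous commands is left to the surrounding code. A purely hypothetical illustration of one such mapping for a one-dimensional action:

import numpy as np

# Hypothetical mapping (assumed, not from the example) from a Discrete(n)
# action index to a point in a 1-D Box action space [low, high].
def index_to_action(idx, low, high, n):
    return np.linspace(low, high, n)[idx]

low, high, n = -2.0, 2.0, 5
print([index_to_action(i, low, high, n) for i in range(n)])
# five evenly spaced torques from low to high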
Example #10
    def __init__(self,
                 random_start=False,
                 m=1.,
                 l=1.,
                 g=9.8,
                 mu=1e-2,
                 max_u=5.,
                 horizon=5000,
                 gamma=.99):
        """
        Constructor.

        Args:
            random_start (bool, False): whether to start from a random position
                or from the horizontal one;
            m (float, 1.0): mass of the pendulum;
            l (float, 1.0): length of the pendulum;
            g (float, 9.8): gravity acceleration constant;
            mu (float, 1e-2): friction constant of the pendulum;
            max_u (float, 5.0): maximum allowed input torque;
            horizon (int, 5000): horizon of the problem;
            gamma (int, .99): discount factor.

        """
        # MDP parameters
        self._m = m
        self._l = l
        self._g = g
        self._mu = mu
        self._random = random_start
        self._dt = .01
        self._max_u = max_u
        self._max_omega = 5 / 2 * np.pi
        high = np.array([np.pi, self._max_omega])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([-max_u]),
                                  high=np.array([max_u]))
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(2.5 * l, 2.5 * l)
        self._last_u = None

        super().__init__(mdp_info)
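
The observation space above bounds the angle to [-pi, pi] and the angular velocity to max_omega; a step function for such a pendulum would typically wrap the integrated angle back into that interval. A small sketch of that wrapping (an assumption about the dynamics code, which is not shown here):

import numpy as np

def wrap_angle(theta):
    # Map any angle to the interval [-pi, pi).
    return (theta + np.pi) % (2 * np.pi) - np.pi

print(wrap_angle(3 * np.pi / 2))   # -pi/2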
Example #11
def build_high_level_agent(alg, params, mdp, mu, std):
    tilings = Tiles.generate(n_tilings=1,
                             n_tiles=[10, 10],
                             low=mdp.info.observation_space.low[:2],
                             high=mdp.info.observation_space.high[:2])
    features = Features(tilings=tilings)

    input_shape = (features.size, )

    mu_approximator = Regressor(LinearApproximator,
                                input_shape=input_shape,
                                output_shape=(1, ))
    std_approximator = Regressor(LinearApproximator,
                                 input_shape=input_shape,
                                 output_shape=(1, ))

    w_mu = mu * np.ones(mu_approximator.weights_size)
    mu_approximator.set_weights(w_mu)

    w_std = std * np.ones(std_approximator.weights_size)
    std_approximator.set_weights(w_std)

    pi = StateLogStdGaussianPolicy(mu=mu_approximator,
                                   log_std=std_approximator)

    obs_low = np.array(
        [mdp.info.observation_space.low[0], mdp.info.observation_space.low[1]])
    obs_high = np.array([
        mdp.info.observation_space.high[0], mdp.info.observation_space.high[1]
    ])
    mdp_info_agent1 = MDPInfo(observation_space=spaces.Box(obs_low,
                                                           obs_high,
                                                           shape=(2, )),
                              action_space=spaces.Box(
                                  mdp.info.observation_space.low[2],
                                  mdp.info.observation_space.high[2],
                                  shape=(1, )),
                              gamma=1,
                              horizon=10)
    agent = alg(policy=pi,
                mdp_info=mdp_info_agent1,
                features=features,
                **params)

    return agent
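
The policy built above pairs two linear regressors on the same features: one predicts the mean of the action, the other the log of its standard deviation. A minimal numpy sketch of that functional form (an illustration, not the library's StateLogStdGaussianPolicy implementation):

import numpy as np

rng = np.random.default_rng(0)
n_features = 4
w_mu = rng.normal(size=n_features)        # weights of the mean regressor
w_log_std = rng.normal(size=n_features)   # weights of the log-std regressor

def sample_action(phi):
    mu = w_mu @ phi
    std = np.exp(w_log_std @ phi)          # log-std output, exponentiated
    return rng.normal(mu, std)

print(sample_action(np.ones(n_features)))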
Example #12
    def __init__(self, random_start=False, goal_distance=1.0):
        """
        Constructor.

        Args:
            random_start (bool, False): whether to start from a random position
                or from the horizontal one;
            goal_distance (float, 1.0): distance of the goal position from the
                starting position.

        """
        # MDP parameters

        gamma = 0.99

        self.Mr = 0.3 * 2
        self.Mp = 2.55
        self.Ip = 2.6e-2
        self.Ir = 4.54e-4 * 2
        self.l = 13.8e-2
        self.r = 5.5e-2
        self.dt = 1e-2
        self.g = 9.81
        self.max_u = 5

        self._random = random_start
        self._goal_distance = goal_distance

        high = np.array([2 * self._goal_distance, np.pi, 15, 75])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([-self.max_u]),
                                  high=np.array([self.max_u]))
        horizon = 1500
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        env_width = 4 * goal_distance
        env_height = 2.5 * 2 * self.l
        width = 800
        height = int(width * env_height / env_width)

        self._viewer = Viewer(env_width, env_height, width, height)

        super(SegwayLinearMotion, self).__init__(mdp_info)
Example #13
    def __init__(self, small=True, n_steps_action=3):
        """
        Constructor.

        Args:
             small (bool, True): whether to use a small state space or not.
             n_steps_action (int, 3): number of integration intervals for each
                                      step of the mdp.

        """
        # MDP parameters
        self.field_size = 150 if small else 1000
        low = np.array([0, 0, -np.pi, -np.pi / 12.])
        high = np.array([self.field_size, self.field_size, np.pi, np.pi / 12.])
        self.omega_max = np.array([np.pi / 12.])
        self._v = 3.
        self._T = 5.
        self._dt = .2
        self._gate_s = np.empty(2)
        self._gate_e = np.empty(2)
        self._gate_s[0] = 100 if small else 350
        self._gate_s[1] = 120 if small else 400
        self._gate_e[0] = 120 if small else 450
        self._gate_e[1] = 100 if small else 400
        self._out_reward = -100
        self._success_reward = 0
        self._small = small
        self._state = None
        self.n_steps_action = n_steps_action

        # MDP properties
        observation_space = spaces.Box(low=low, high=high)
        action_space = spaces.Box(low=-self.omega_max, high=self.omega_max)
        horizon = 5000
        gamma = .99
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(self.field_size,
                              self.field_size,
                              background=(66, 131, 237))

        super(ShipSteering, self).__init__(mdp_info)
Example #14
def build_agent_high(alg, params, std, mdp):
    # Features
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(1, ),
                              output_shape=(1, ))

    # Policy H
    n_weights = approximator1.weights_size
    mu = np.zeros(n_weights)
    sigma = std * np.ones(n_weights)
    pi = DeterministicPolicy(approximator1)
    dist = GaussianDiagonalDistribution(mu, sigma)

    lim = np.pi / 2
    low = mdp.info.observation_space.low[0:1]
    high = mdp.info.observation_space.high[0:1]
    mdp_info = MDPInfo(observation_space=spaces.Box(low, high),
                       action_space=spaces.Box(-lim, lim, (1, )),
                       gamma=mdp.info.gamma,
                       horizon=mdp.info.horizon)
    return alg(dist, pi, mdp_info, **params)
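
build_agent_high pairs a DeterministicPolicy with a GaussianDiagonalDistribution, so exploration happens in the space of the policy weights rather than in the action space: each episode the algorithm samples a weight vector, runs the deterministic policy with it, and updates the distribution from the observed returns. A minimal numpy sketch of that sampling step (assumed names, not the library's code):

import numpy as np

rng = np.random.default_rng(0)
n_weights = 1
mu = np.zeros(n_weights)            # mean of the weight distribution
sigma = 0.1 * np.ones(n_weights)    # per-weight standard deviation

def sample_policy_weights():
    # One draw w ~ N(mu, diag(sigma^2)); the deterministic policy is then
    # run with these weights for a whole episode.
    return rng.normal(mu, sigma)

print(sample_policy_weights())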
Example #15
    def __init__(self,
                 random_start=False,
                 m=1.0,
                 l=1.0,
                 g=9.8,
                 mu=1e-2,
                 max_u=2.0):
        """
        Constructor.

        Args:
            random_start (bool, False): whether to start from a random position
                or from the horizontal one;
            m (float, 1.0): mass of the pendulum;
            l (float, 1.0): length of the pendulum;
            g (float, 9.8): gravity acceleration constant;
            mu (float, 1e-2): friction constant of the pendulum;
            max_u (float, 2.0): maximum allowed input torque.

        """
        # MDP parameters
        self._g = g
        self._m = m
        self._l = l
        self._mu = mu
        self._random = random_start
        self._dt = 0.02
        self._max_u = max_u
        self._max_omega = 78.54
        high = np.array([np.pi, self._max_omega])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([-max_u]),
                                  high=np.array([max_u]))
        horizon = 5000
        gamma = .99
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super(InvertedPendulum, self).__init__(mdp_info)
Example #16
    def __init__(self, random_start=False):
        """
        Constructor.

        Args:
            random_start (bool, False): whether to start from a random position
                or from the horizontal one.

        """
        # MDP parameters

        gamma = 0.97

        self._Mr = 0.3 * 2
        self._Mp = 2.55
        self._Ip = 2.6e-2
        self._Ir = 4.54e-4 * 2
        self._l = 13.8e-2
        self._r = 5.5e-2
        self._dt = 1e-2
        self._g = 9.81
        self._max_u = 5

        self._random = random_start

        high = np.array([-np.pi / 2, 15, 75])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([-self._max_u]),
                                  high=np.array([self._max_u]))
        horizon = 300
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(5 * self._l, 5 * self._l)
        self._last_x = 0

        super(Segway, self).__init__(mdp_info)
Example #17
def build_mid_level_agent(alg, params, mdp, mu, std):
    mu_approximator = Regressor(LinearApproximator,
                                input_shape=(1, ),
                                output_shape=(2, ))

    w_mu = mu * np.ones(mu_approximator.weights_size)
    mu_approximator.set_weights(w_mu)

    pi = DiagonalGaussianPolicy(mu=mu_approximator, std=std * np.ones(2))

    lim = mdp.info.observation_space.high[0]
    basis = PolynomialBasis()
    features = BasisFeatures(basis=[basis])
    mdp_info_agent1 = MDPInfo(observation_space=spaces.Box(0, 1, (1, )),
                              action_space=spaces.Box(0, lim, (2, )),
                              gamma=1,
                              horizon=10)
    agent = alg(policy=pi,
                mdp_info=mdp_info_agent1,
                features=features,
                **params)

    return agent
Example #18
    def __init__(self,
                 m=2.,
                 M=8.,
                 l=.5,
                 g=9.8,
                 mu=1e-2,
                 max_u=50.,
                 noise_u=10.,
                 horizon=3000,
                 gamma=.95):
        """
        Constructor.

        Args:
            m (float, 2.0): mass of the pendulum;
            M (float, 8.0): mass of the cart;
            l (float, .5): length of the pendulum;
            g (float, 9.8): gravity acceleration constant;
            mu (float, 1e-2): friction constant of the pendulum;
            max_u (float, 50.): maximum allowed input torque;
            noise_u (float, 10.): maximum noise on the action;
            horizon (int, 3000): horizon of the problem;
            gamma (int, .95): discount factor.

        """
        # MDP parameters
        self._m = m
        self._M = M
        self._l = l
        self._g = g
        self._alpha = 1 / (self._m + self._M)
        self._mu = mu
        self._dt = .1
        self._max_u = max_u
        self._noise_u = noise_u
        high = np.array([np.inf, np.inf])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(3)
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(2.5 * l, 2.5 * l)
        self._last_u = None
        self._state = None

        super().__init__(mdp_info)
Example #19
def build_high_level_agent(alg, params, mdp, mu, sigma):
    features = Features(basis_list=[PolynomialBasis()])
    approximator = Regressor(LinearApproximator,
                             input_shape=(features.size, ),
                             output_shape=(2, ))
    approximator.set_weights(mu)

    pi1 = DiagonalGaussianPolicy(mu=approximator, std=sigma)

    lim = mdp.info.observation_space.high[0]
    mdp_info_agent = MDPInfo(observation_space=mdp.info.observation_space,
                             action_space=spaces.Box(0, lim, (2, )),
                             gamma=1.0,
                             horizon=100)
    agent = alg(pi1, mdp_info_agent, features=features, **params)

    return agent
Example #20
def build_low_level_agent(alg, params, mdp):
    features = Features(
        basis_list=[PolynomialBasis(dimensions=[0], degrees=[1])])

    pi = DeterministicControlPolicy(weights=np.array([0]))
    mu = np.zeros(pi.weights_size)
    sigma = 1e-3 * np.ones(pi.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
        -np.pi, np.pi, (1, )),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent = alg(distribution, pi, mdp_info_agent2, features=features, **params)

    return agent
Example #21
    def __init__(self, horizon=100, gamma=.95):
        """
        Constructor.

        """
        # MDP parameters
        self.max_pos = 1.
        self.max_velocity = 3.
        high = np.array([self.max_pos, self.max_velocity])
        self._g = 9.81
        self._m = 1.
        self._dt = .1
        self._discrete_actions = [-4., 4.]

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(2)
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super().__init__(mdp_info)
Example #22
    def __init__(self):
        self.__name__ = 'CarOnHill'

        # MDP parameters
        self.max_pos = 1.
        self.max_velocity = 3.
        high = np.array([self.max_pos, self.max_velocity])
        self._g = 9.81
        self._m = 1
        self._dt = .1
        self._discrete_actions = [-4., 4.]

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(2)
        horizon = 100
        gamma = .95
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super(CarOnHill, self).__init__(mdp_info)
Example #23
def build_agent_low(alg, params, std, mdp):
    approximator = Regressor(LinearApproximator,
                             input_shape=(3, ),
                             output_shape=(1, ))
    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = std * np.ones(n_weights)
    pi = DeterministicControlPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    # Agent Low
    mdp_info = MDPInfo(
        observation_space=spaces.Box(
            low=mdp.info.observation_space.low[1:],  # FIXME FALSE
            high=mdp.info.observation_space.high[1:],  # FIXME FALSE
        ),
        action_space=mdp.info.action_space,
        gamma=mdp.info.gamma,
        horizon=mdp.info.horizon)

    return alg(dist, pi, mdp_info, **params)
Example #24
def build_low_level_agent(alg, params, mdp, horizon, std):
    rho_max = np.linalg.norm(mdp.info.observation_space.high[:2] -
                             mdp.info.observation_space.low[:2])
    low = np.array([-np.pi, 0])
    high = np.array([np.pi, rho_max])

    basis = FourierBasis.generate(low, high, 10)
    features = Features(basis_list=basis)

    approximator = Regressor(LinearApproximator,
                             input_shape=(features.size, ),
                             output_shape=mdp.info.action_space.shape)

    pi = DiagonalGaussianPolicy(approximator, std)

    mdp_info_agent = MDPInfo(observation_space=spaces.Box(low, high),
                             action_space=mdp.info.action_space,
                             gamma=mdp.info.gamma,
                             horizon=horizon)
    agent = alg(pi, mdp_info_agent, features=features, **params)

    return agent
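
FourierBasis.generate builds cosine features over the box [low, high]. A small numpy sketch of the usual Fourier basis construction (order 2 here for brevity; an illustration, not the library's implementation):

import numpy as np
from itertools import product

low = np.array([-np.pi, 0.])
high = np.array([np.pi, 10.])
order = 2

# All integer coefficient vectors with entries in {0, ..., order}.
coefficients = np.array(list(product(range(order + 1), repeat=2)))

def fourier_features(state):
    x = (state - low) / (high - low)      # normalize to [0, 1]
    return np.cos(np.pi * coefficients @ x)

print(fourier_features(np.array([0.5, 3.0])).shape)   # (order + 1) ** 2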
Example #25
def build_high_level_agent(alg, params, optim, loss, mdp, horizon_low, eps,
                           n_features, use_cuda):
    high = np.ones(4)
    low = np.zeros(4)

    high[:2] = mdp.info.observation_space.high[:2]
    low[:2] = mdp.info.observation_space.low[:2]

    high[2:] = mdp.info.observation_space.high[3:5]
    low[2:] = mdp.info.observation_space.low[3:5]

    n_actions = 9
    observation_space = spaces.Box(low=low, high=high)
    action_space = spaces.Discrete(n_actions)

    mdp_info = MDPInfo(observation_space=observation_space,
                       action_space=action_space,
                       gamma=mdp.info.gamma**horizon_low,
                       horizon=mdp.info.horizon)

    pi = Boltzmann(eps)

    approximator_params = dict(network=Network,
                               optimizer=optim,
                               loss=loss,
                               n_features=n_features,
                               input_shape=mdp_info.observation_space.shape,
                               output_shape=mdp_info.action_space.size,
                               n_actions=mdp_info.action_space.n,
                               use_cuda=use_cuda)

    agent = alg(PyTorchApproximator,
                pi,
                mdp_info,
                approximator_params=approximator_params,
                **params)

    return agent
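
Note the discount passed to the high-level agent: gamma**horizon_low. If every high-level decision lasts horizon_low primitive steps, discounting k high-level steps with gamma**horizon_low matches discounting k*horizon_low primitive steps with gamma, as this small check illustrates:

gamma, horizon_low, k = 0.99, 10, 7

low_level = gamma ** (k * horizon_low)      # k * horizon_low primitive steps
high_level = (gamma ** horizon_low) ** k    # k high-level steps

print(abs(low_level - high_level) < 1e-12)  # True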
Example #26
    def __init__(self):
        self.__name__ = 'InvertedPendulum'

        # MDP parameters
        self.max_degree = np.inf
        self.max_angular_velocity = np.inf
        high = np.array([self.max_degree, self.max_angular_velocity])
        self._g = 9.8
        self._m = 2.
        self._M = 8.
        self._l = .5
        self._alpha = 1. / (self._m + self._M)
        self._dt = .1
        self._discrete_actions = [-50., 0., 50.]

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(3)
        horizon = 3000
        gamma = .95
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super(InvertedPendulum, self).__init__(mdp_info)
Example #27
def experiment():
    np.random.seed()

    # Model Block
    mdp = ShipSteeringMultiGate()

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)', phi=phi)

    # Function Block 2
    function_block2 = squarednormBlock(name='f2 (squared norm)')

    # Function Block 3
    function_block3 = addBlock(name='f3 (summation)')

    #Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([38, 38])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size, ),
                              output_shape=(2, ))
    approximator1.set_weights(np.array([75, 75]))

    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=sigma1)

    # Policy 2
    sigma2 = Parameter(value=.01)
    approximator2 = Regressor(LinearApproximator,
                              input_shape=(1, ),
                              output_shape=mdp.info.action_space.shape)
    pi2 = GaussianPolicy(mu=approximator2, sigma=sigma2)

    # Agent 1
    learning_rate = AdaptiveParameter(value=10)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, 150, (2, )),
                              gamma=mdp.info.gamma,
                              horizon=50)
    agent1 = GPOMDP(policy=pi1,
                    mdp_info=mdp_info_agent1,
                    params=agent_params,
                    features=features)

    # Agent 2
    learning_rate = AdaptiveParameter(value=.001)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
        -np.pi, np.pi, (1, )),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent2 = GPOMDP(policy=pi2,
                    mdp_info=mdp_info_agent2,
                    params=agent_params,
                    features=None)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1',
                                  agent=agent1,
                                  n_eps_per_fit=5,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    dataset_callback = CollectDataset()
    parameter_callback2 = CollectPolicyParameter(pi2)
    control_block2 = ControlBlock(
        name='Control Block 2',
        agent=agent2,
        n_eps_per_fit=10,
        callbacks=[dataset_callback, parameter_callback2])

    #Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [
        state_ph, reward_ph, control_block1, control_block2, function_block1,
        function_block2, function_block3, reward_acc
    ]
    #order = [0, 1, 7, 2, 4, 5, 6, 3]
    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    function_block3.add_input(function_block2)
    function_block3.add_input(reward_ph)
    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block3)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    #dataset_learn_visual = core.learn(n_episodes=2000)
    dataset_learn_visual = list()
    for n in range(4):
        dataset_learn = core.learn(n_episodes=500)
        last_ep_dataset = pick_last_ep(dataset_learn)
        dataset_learn_visual += last_ep_dataset
        del dataset_learn

    # Evaluate
    dataset_eval = core.evaluate(n_episodes=10)

    # Visualize
    low_level_dataset = dataset_callback.get()
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()
    visualize_policy_params(parameter_dataset1, parameter_dataset2)
    visualize_control_block(low_level_dataset, ep_count=20)
    visualize_ship_steering(dataset_learn_visual, name='learn', n_gates=4)

    visualize_ship_steering(dataset_eval, 'evaluate', n_gates=4)
    plt.show()

    return
Example #28
def experiment_ghavamzade(alg_high, alg_low, params, subdir, i):

    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    #Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # FeaturesH
    low_hi = 0
    lim_hi = 1000 + 1e-8
    n_tiles_high = [20, 20]
    n_tilings = 1

    # Discretization Block
    discretization_block = DiscretizationBlock(low=low_hi,
                                               high=lim_hi,
                                               n_tiles=n_tiles_high)

    # PolicyH
    epsilon = Parameter(value=0.1)
    piH = EpsGreedy(epsilon=epsilon)

    # AgentH
    learning_rate = params.get('learning_rate_high')

    mdp_info_agentH = MDPInfo(observation_space=spaces.Discrete(
        n_tiles_high[0] * n_tiles_high[1]),
                              action_space=spaces.Discrete(8),
                              gamma=1,
                              horizon=10000)

    agentH = alg_high(policy=piH,
                      mdp_info=mdp_info_agentH,
                      learning_rate=learning_rate,
                      lambda_coeff=0.9)

    epsilon_update = EpsilonUpdate(piH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H',
                                  agent=agentH,
                                  n_steps_per_fit=1)

    #FeaturesL
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 10]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 3

    tilingsL = Tiles.generate(n_tilings=n_tilings,
                              n_tiles=n_tiles,
                              low=low,
                              high=high)

    featuresL = Features(tilings=tilingsL)

    mdp_info_agentL = MDPInfo(observation_space=spaces.Box(
        low=np.array([0, 0]), high=np.array([150, 150]), shape=(2, )),
                              action_space=mdp.info.action_space,
                              gamma=0.99,
                              horizon=10000)

    # Approximators
    input_shape = (featuresL.size, )

    approximator_params = dict(input_dim=input_shape[0])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)
    approximator2 = Regressor(LinearApproximator,
                              input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)

    # Policy1
    std1 = np.array([3e-2])
    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=std1)

    # Policy2
    std2 = np.array([3e-2])
    pi2 = DiagonalGaussianPolicy(mu=approximator2, std=std2)

    # Agent1
    learning_rate1 = params.get('learning_rate_low')
    agent1 = alg_low(pi1, mdp_info_agentL, learning_rate1, featuresL)

    # Agent2
    learning_rate2 = params.get('learning_rate_low')
    agent2 = alg_low(pi2, mdp_info_agentL, learning_rate2, featuresL)

    #Termination Conds
    termination_condition1 = TerminationCondition(active_dir='+')
    termination_condition2 = TerminationCondition(active_dir='x')

    low_ep_per_fit = params.get('low_ep_per_fit')

    # Control Block +
    control_block_plus = ControlBlock(
        name='control block 1',
        agent=agent1,
        n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition1)

    # Control Block x
    control_block_cross = ControlBlock(
        name='control block 2',
        agent=agent2,
        n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition2)

    # Function Block 1: picks state for hi lev ctrl
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the env to low lev ctrl state
    function_block2 = fBlock(phi=rototranslate, name='f2 rotot')

    # Function Block 3: holds curr state as ref
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds hi lev rew
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds low lev rew
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: ext rew of hi lev ctrl
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: ext rew of low lev ctrl
    function_block7 = fBlock(phi=G_low, name='f7 G_lo')

    #Reward Accumulator H:
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    # Selector Block
    function_block8 = fBlock(phi=selector_function, name='f8 selector')

    #Mux_Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block_plus])
    mux_block.add_block_list([control_block_cross])

    #Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
        function_block1, function_block2, function_block3, function_block4,
        function_block5, function_block6, function_block7, function_block8,
        reward_acc_H, discretization_block
    ]

    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block_plus)
    reward_acc_H.add_alarm_connection(control_block_cross)

    control_blockH.add_input(discretization_block)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block_plus)
    control_blockH.add_alarm_connection(control_block_cross)

    mux_block.add_input(function_block8)
    mux_block.add_input(function_block2)

    control_block_plus.add_reward(function_block5)
    control_block_cross.add_reward(function_block5)

    function_block1.add_input(state_ph)

    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)

    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block_plus)
    function_block3.add_alarm_connection(control_block_cross)

    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)

    function_block5.add_input(function_block7)

    function_block6.add_input(reward_ph)

    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    function_block8.add_input(control_blockH)

    discretization_block.add_input(function_block1)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    # print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    dataset_eval += dataset_eval_run
    print('J at start : ' + str(np.mean(J)))
    for n in range(n_runs):
        print('ITERATION', n)

        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

        dataset_plus = control_block_plus.dataset.get()
        J_plus = compute_J(dataset_plus, mdp.info.gamma)
        dataset_cross = control_block_cross.dataset.get()
        J_cross = compute_J(dataset_cross, mdp.info.gamma)

        low_level_dataset_eval1.append(dataset_plus)
        low_level_dataset_eval2.append(dataset_cross)

        print('J ll PLUS at iteration  ' + str(n) + ': ' +
              str(np.mean(J_plus)))
        print('J ll CROSS at iteration ' + str(n) + ': ' +
              str(np.mean(J_cross)))
        if n == 4:
            control_blockH.callbacks = [epsilon_update]

    # Tile data
    hi_lev_params = agentH.Q.table
    max_q_val = np.zeros(n_tiles_high[0]**2)
    act_max_q_val = np.zeros(n_tiles_high[0]**2)
    for n in range(n_tiles_high[0]**2):
        max_q_val[n] = np.amax(hi_lev_params[n])
        act_max_q_val[n] = np.argmax(hi_lev_params[n])

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset1_file',
            low_level_dataset_eval1)
    np.save(subdir + str(i) + '/low_level_dataset2_file',
            low_level_dataset_eval2)
    np.save(subdir + str(i) + '/max_q_val_tiled_file', max_q_val)
    np.save(subdir + str(i) + '/act_max_q_val_tiled_file', act_max_q_val)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)

    return
Example #29
def server_experiment_small(alg_high, alg_low, params, subdir, i):

    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    #Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)',
                             phi=pos_ref_angle_difference)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    #Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([255, 255])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size, ),
                              output_shape=(2, ))
    approximator1.set_weights(np.array([500, 500]))

    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=sigma1)

    # Policy 2
    pi2 = DeterministicControlPolicy(weights=np.array([0]))
    mu2 = np.zeros(pi2.weights_size)
    sigma2 = 1e-3 * np.ones(pi2.weights_size)
    distribution2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 1
    learning_rate1 = params.get('learning_rate_high')
    lim = 1000
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, lim, (2, )),
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent1 = alg_high(policy=pi1,
                      mdp_info=mdp_info_agent1,
                      learning_rate=learning_rate1,
                      features=features)

    # Agent 2
    learning_rate2 = params.get('learning_rate_low')
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
        -np.pi, np.pi, (1, )),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent2 = alg_low(distribution=distribution2,
                     policy=pi2,
                     mdp_info=mdp_info_agent2,
                     learning_rate=learning_rate2)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1',
                                  agent=agent1,
                                  n_eps_per_fit=ep_per_run,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(distribution2)
    control_block2 = ControlBlock(name='Control Block 2',
                                  agent=agent2,
                                  n_eps_per_fit=10,
                                  callbacks=[parameter_callback2])

    #Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_block1, control_block2,
        function_block1, function_block2, reward_acc
    ]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)

    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block2)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=eval_run)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        low_level_dataset_eval += control_block2.dataset.get()

    # Save
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset_file',
            low_level_dataset_eval)
    np.save(subdir + str(i) + '/parameter_dataset1_file', parameter_dataset1)
    np.save(subdir + str(i) + '/parameter_dataset2_file', parameter_dataset2)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)
Example #30
def segway_experiment(alg_high, alg_low, params_high, params_low):

    np.random.seed()

    # Model Block
    mdp = SegwayLinearMotion(goal_distance=1.0)

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    #Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (pick distance to goal state var)',
                             phi=pick_first_state)

    # Function Block 2
    function_block2 = fBlock(name='f2 (build state)',
                             phi=angle_to_angle_diff_complete_state)

    # Function Block 3
    function_block3 = fBlock(name='f3 (reward low level)',
                             phi=lqr_cost_segway)

    # Function Block 4
    function_block4 = addBlock(name='f4 (add block)')

    # Function Block 5
    function_block5 = fBlock(name='f5 (fall punish low level)', phi=fall_reward)


    # Features
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(1,),
                              output_shape=(1,))

    # Policy H
    n_weights = approximator1.weights_size
    mu1 = np.zeros(n_weights)
    sigma1 = 2.0e-2*np.ones(n_weights)
    pi1 = DeterministicPolicy(approximator1)
    dist1 = GaussianDiagonalDistribution(mu1, sigma1)


    # Agent H
    lim = np.pi/2
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(-lim, lim, (1,)),
                              gamma=mdp.info.gamma,
                              horizon=mdp.info.horizon)
    agent_high = alg_high(dist1, pi1, mdp_info_agent1, **params_high)

    # Policy L
    approximator2 = Regressor(LinearApproximator,
                              input_shape=(3,),
                              output_shape=(1,))
    n_weights2 = approximator2.weights_size
    mu2 = np.zeros(n_weights2)
    sigma2 = 2.0*np.ones(n_weights2)
    pi2 = DeterministicControlPolicy(approximator2)
    dist2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent Low
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
        low=mdp.info.observation_space.low[1:], #FIXME FALSE
        high=mdp.info.observation_space.high[1:], #FIXME FALSE
        shape=(3,)),
        action_space=mdp.info.action_space,
        gamma=mdp.info.gamma, horizon=mdp.info.horizon)

    agent_low = alg_low(dist2, pi2, mdp_info_agent2, **params_low)

    # Control Block 1
    parameter_callback1 = CollectDistributionParameter(dist1)
    control_block1 = ControlBlock(name='Control Block High', agent=agent_high,
                                  n_eps_per_fit=n_ep_per_fit*2,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(dist2)
    control_block2 = ControlBlock(name='Control Block Low', agent=agent_low,
                                  n_eps_per_fit=n_ep_per_fit,
                                  callbacks=[parameter_callback2])
    control_block1.set_mask()

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block1,
              control_block2, function_block1, function_block2,
              function_block3, function_block4, function_block5]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(function_block1)
    control_block1.add_reward(reward_ph)
    control_block2.add_input(function_block2)
    control_block2.add_reward(function_block4)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_block1)

    function_block2.add_input(state_ph)
    function_block3.add_input(function_block2)
    function_block5.add_input(state_ph)
    function_block4.add_input(function_block3)
    function_block4.add_input(function_block5)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    dataset_eval_run = core.evaluate(n_episodes=eval_run, render=False)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    mask_done = False
    for n in range(n_epochs):
        print('ITERATION', n)

        if n == 2:
            control_block1.unset_mask()
        core.learn(n_episodes=n_iterations*n_ep_per_fit, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run, render=False)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        print('dist H:', dist1.get_parameters())
        print('dist L mu:', dist2.get_parameters()[:3])
        print('dist L sigma:', dist2.get_parameters()[3:])