Example #1

import ctypes
import math
import multiprocessing as mp

import numpy as np

# TileCoder, HashingTileCoder, Critic, and PolicyGradientActor come from the
# project's accompanying modules (not shown in this snippet).

class PolicyGradientAgent:
    def __init__(self,
                 simulator,
                 dt=0.5,
                 Lambda=0.75,           # eligibility-trace decay ("lambda" is reserved)
                 alpha_v=0.1,           # critic (value) step size
                 alpha_u=0.1,           # actor (policy) step size
                 num_features=2**20,    # size of the hashed feature vector
                 tile_weight_exponent=0.5,
                 trunc_normal=True,     # bound actions with a truncated normal policy
                 subspaces=[1, 2, 6]):

        self.simulator = simulator
        # The agent cannot act on a finer timestep than the simulator's.
        self.dt = max(dt, self.simulator.dt)

        self.tile_coder = HashingTileCoder(
            self.make_tile_coder(tile_weight_exponent, subspaces),
            num_features)

        initial_thrust_sigma = simulator.max_thrust / 10
        initial_thrust_mu = 0.5
        initial_rcs_sigma = simulator.max_rcs / 6
        initial_rcs_mu = 0.0

        self.critic = Critic(self.tile_coder,
                             Lambda,
                             alpha_v,
                             initial_value=1.0)

        self.thrust_actor = PolicyGradientActor(
            self.tile_coder,
            Lambda,
            alpha_u,
            min_action=0.0,
            max_action=simulator.max_thrust,
            min_sigma=simulator.max_thrust / 64,
            max_sigma=simulator.max_thrust / 2,
            initial_mu=initial_thrust_mu,
            initial_sigma=initial_thrust_sigma,
            trunc_normal=trunc_normal)

        self.rcs_actor = PolicyGradientActor(self.tile_coder,
                                             Lambda,
                                             alpha_u,
                                             min_action=-simulator.max_rcs,
                                             max_action=simulator.max_rcs,
                                             min_sigma=simulator.max_rcs / 32,
                                             max_sigma=simulator.max_rcs,
                                             initial_mu=initial_rcs_mu,
                                             initial_sigma=initial_rcs_sigma,
                                             trunc_normal=trunc_normal)

    def make_tile_coder(self, tile_weight_exponent, subspaces):
        #                          xpos   ypos   xvel   yvel        rot     rotvel
        state_signed  = np.array([False, False,  True,  True,      True,      True])
        state_bounded = np.array([ True,  True,  True,  True,     False,      True])
        tile_size     = np.array([   5.,    5.,    2.,    2., math.pi/2, math.pi/6])
        num_tiles     = np.array([    6,     4,     4,     4,         2,         3])
        num_offsets   = np.array([    2,     2,     4,     4,         8,         4])

        # Largest representable state in each dimension; the epsilon keeps
        # the upper edge strictly inside the last tile.
        self.max_state = (tile_size * num_tiles) - 1e-8

        self.min_state = -self.max_state
        self.min_state[np.logical_not(state_signed)] = 0.0

        # States are clipped into this range before featurizing; unbounded
        # dimensions (here, rotation) are left unclipped.
        self.max_clip_state = self.max_state.copy()
        self.max_clip_state[np.logical_not(state_bounded)] = float('inf')

        self.min_clip_state = -self.max_clip_state
        self.min_clip_state[np.logical_not(state_signed)] = 0.0

        # Signed dimensions cover [-max, max] and need twice the tiles;
        # bounded dimensions get one extra tile so the offset tilings still
        # cover the whole range.
        num_tiles[state_signed] *= 2
        num_tiles[state_bounded] += 1

        return TileCoder(tile_size, num_tiles, num_offsets, subspaces,
                         tile_weight_exponent)

    def compute_action(self, features):

        # An earlier reflection-based clamp, kept for reference; the actors
        # appear to enforce action bounds themselves via min_action/max_action.
        # def clamp (value, low, high):
        #     value = low + math.fmod (abs(value-low), 2*(high-low))
        #     if value > high: value = 2*high - value
        #     return value
        # thrust = clamp (self.thrust_actor.act(features), 0.0, self.simulator.max_thrust)
        # rcs = clamp (self.rcs_actor.act(features), -self.simulator.max_rcs, self.simulator.max_rcs)
        thrust = self.thrust_actor.act(features)
        rcs = self.rcs_actor.act(features)
        return (thrust, rcs)

    def initialize(self, state):

        # Clip the raw state into the coder's range, then featurize it.
        features = self.tile_coder.indices(
            state.clip(self.min_clip_state, self.max_clip_state))

        self.critic.initialize(features)
        self.thrust_actor.initialize()
        self.rcs_actor.initialize()

        return self.compute_action(features)

    def update(self, state, reward, terminal=False, learn=True):

        features = self.tile_coder.indices(
            state.clip(self.min_clip_state, self.max_clip_state))

        if learn:
            # One actor-critic step: the critic's TD error drives both actors.
            td_error = self.critic.evaluate(features, reward, terminal)
            self.thrust_actor.learn(td_error)
            self.rcs_actor.learn(td_error)

        return self.compute_action(features)

    def get_state(self):
        # Stack all learned weight vectors into one (5, num_features) array.
        return np.vstack(
            (self.critic.value.weights, self.thrust_actor.mu.weights,
             self.thrust_actor.sigma.weights, self.rcs_actor.mu.weights,
             self.rcs_actor.sigma.weights))

    def set_state(self, state):
        # Reshape in place and unpack one row per learner; the rows are
        # views, so all five learners share the array's backing storage.
        state.shape = (5, self.tile_coder.num_features)
        (self.critic.value.weights, self.thrust_actor.mu.weights,
         self.thrust_actor.sigma.weights, self.rcs_actor.mu.weights,
         self.rcs_actor.sigma.weights) = state

    def save_state(self, savefile='data/saved_state.npy'):
        np.save(savefile, self.get_state())

    def load_state(self, savefile='data/saved_state.npy', mmap_mode=None):
        # np.asarray keeps the copy-only-if-needed behaviour; np.array's
        # copy=False raises on NumPy 2.x whenever a copy is required.
        state = np.asarray(np.load(savefile, mmap_mode=mmap_mode))
        self.set_state(state)

    def persist_state(self, savefile=None, readonly=False):
        if savefile is None:
            # Move the weights into process-shared memory so worker
            # processes (e.g., after a fork) see the same array.
            state = np.frombuffer(
                mp.RawArray(ctypes.c_double, 5 * self.tile_coder.num_features))
            state[:] = self.get_state().flat
            self.set_state(state)
        else:
            # Otherwise back the weights with a memory-mapped file on disk.
            if not readonly:
                self.save_state(savefile)
            self.load_state(savefile, mmap_mode='r' if readonly else 'r+')
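
For context, a minimal sketch of how this agent could be driven in an episode
loop. The stepping interface (simulator.state, and simulator.step returning a
reward and a terminal flag) is a hypothetical placeholder; the class above only
relies on simulator.dt, simulator.max_thrust, and simulator.max_rcs.

agent = PolicyGradientAgent(simulator)

thrust, rcs = agent.initialize(simulator.state)   # first action of the episode
terminal = False
while not terminal:
    reward, terminal = simulator.step(thrust, rcs)    # hypothetical API
    thrust, rcs = agent.update(simulator.state, reward, terminal=terminal)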
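
The TileCoder and HashingTileCoder classes are not included in this snippet.
As orientation only (this is not the project's API), a hashing tile coder can
be sketched in a few lines: each tiling shifts the grid by a fraction of a
tile, discretizes the state, and hashes the resulting cell coordinates into a
fixed-size feature table.

import numpy as np

def hashed_tile_indices(state, tile_size, num_tilings, num_features):
    # Illustrative sketch, not the TileCoder used above.
    indices = []
    for t in range(num_tilings):
        offset = (t / num_tilings) * tile_size            # shift each tiling
        cell = tuple(np.floor((state + offset) / tile_size).astype(int))
        indices.append(hash((t,) + cell) % num_features)  # hash tiling + cell
    return indices

# Example: a 2-D state activates one feature index per tiling.
active = hashed_tile_indices(np.array([3.2, -1.7]),
                             tile_size=np.array([5.0, 2.0]),
                             num_tilings=8, num_features=2**20)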