Code example #1
    def run_episode(self, Q: Tabular, task: Task, policy: Policy):
        
        # to compute backup
        rewards = np.zeros(self.episode_length, dtype=float)
        
        # initialize state
        state = task.initial_state()
            
        # repeat for each step of episode
        for t in range(self.episode_length):
                
            # choose action from state using policy derived from Q
            action = policy.act(Q, task, state) 
                
            # take action and observe reward and new state
            new_state, reward, done = task.transition(state, action) 
            rewards[t] = reward  
                
            # update Q
            delta = reward + self.gamma * Q.max_value(new_state) - Q.values(state)[action]
            Q.update(state, action, delta)
                
            # update state
            state = new_state
            
            # until state is terminal
            if done:
                break

        # return the number of steps taken and the rewards observed
        return t + 1, rewards[0:t + 1]
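The calls to Q.values, Q.max_value and Q.update above assume a tabular value store whose update applies a learning-rate-scaled increment. The real Tabular class is not part of this excerpt, so the following is only a minimal hypothetical sketch of an interface consistent with that usage:

    import numpy as np
    from collections import defaultdict

    class TabularSketch:
        """Hypothetical stand-in for the Q-table used above (illustration only)."""

        def __init__(self, n_actions, alpha=0.1):
            self.alpha = alpha
            self.table = defaultdict(lambda: np.zeros(n_actions, dtype=float))

        def values(self, state):
            # current action-value estimates Q(s, .)
            return self.table[state]

        def max_value(self, state):
            # greedy bootstrap value max_a Q(s, a)
            return float(np.max(self.table[state]))

        def max_action(self, state):
            # greedy action argmax_a Q(s, a), as used by evaluate in example #6
            return int(np.argmax(self.table[state]))

        def update(self, state, action, delta):
            # Q(s, a) <- Q(s, a) + alpha * delta
            self.table[state][action] += self.alpha * delta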
Code example #2
    def run_episode(self, Q: Tabular, task: Task, policy: Policy):

        # to compute backups
        rewards = np.zeros(self.episode_length, dtype=float)
        states = [None] * self.episode_length
        actions = [None] * self.episode_length

        # initialize state
        state = task.initial_state()

        # repeat for each step of episode
        for t in range(self.episode_length):

            # choose action from state using policy derived from Q
            action = policy.act(Q, task, state)
            states[t], actions[t] = state, action

            # take action and observe reward and new state
            new_state, reward, done = task.transition(state, action)
            rewards[t] = reward

            # update state and action
            state = new_state

            # until state is terminal
            if done:
                break

        # initialize lambda-average of returns with the reward at the final step
        lambda_return = rewards[t]

        # repeat for each step of episode in reverse order
        T = t
        for t in range(T, -1, -1):

            # compute lambda-average of returns
            if t < T:
                lambda_return = rewards[t] + self.gamma * (
                    (1.0 - self.decay) *
                    Q.values(states[t + 1])[actions[t + 1]] +
                    self.decay * lambda_return)

            # update Q
            delta = lambda_return - Q.values(states[t])[actions[t]]
            Q.update(states[t], actions[t], delta)

        return T + 1, rewards[0:T + 1]
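The backward pass above is the offline (end-of-episode) form of the lambda-return: G_T = r_T and, for t < T, G_t = r_t + gamma * ((1 - lambda) * Q(s_{t+1}, a_{t+1}) + lambda * G_{t+1}), with self.decay playing the role of lambda. A small self-contained check of the same recursion on made-up numbers:

    gamma, lam = 0.9, 0.8
    rewards = [1.0, 0.0, 2.0]     # toy rewards r_0, r_1, r_2 (episode ends after step 2)
    next_q = [0.5, 1.5]           # toy Q(s_{t+1}, a_{t+1}) for t = 0, 1

    # G_T = r_T, then recurse backwards exactly as in the update loop above
    G = rewards[-1]
    lambda_returns = [G]
    for t in range(len(rewards) - 2, -1, -1):
        G = rewards[t] + gamma * ((1.0 - lam) * next_q[t] + lam * G)
        lambda_returns.append(G)
    lambda_returns.reverse()
    print(lambda_returns)         # [2.3212, 1.71, 2.0] up to floating-point error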
Code example #3
    def run_episode(self, Q: Tabular, task: Task, policy: Policy):

        # to compute backups
        rewards = np.zeros(self.episode_length, dtype=float)

        # initialize the e(s, a) matrix
    # note: unlike the pseudocode in Sutton and Barto (which arguably contains an
    # error here), e is reset at the start of every episode
        e = defaultdict(lambda: np.zeros(task.valid_actions(), dtype=float))

        # initialize state and action
        state = task.initial_state()
        action = policy.act(Q, task, state)

        # repeat for each step of episode
        for t in range(self.episode_length):

            # take action and observe reward and new state
            new_state, reward, done = task.transition(state, action)
            rewards[t] = reward

            # choose action from state using policy derived from Q
            new_action = policy.act(Q, task, new_state)

            # update e
            e[state][action] += 1.0

            # update trace
            delta = reward + self.gamma * Q.values(
                new_state)[new_action] - Q.values(state)[action]
            for s in e.keys():
                errors = e[s] * delta
                Q.update_all(s, errors)
                e[s] *= self.gamma * self.decay

            # update state and action
            state, action = new_state, new_action

            # until state is terminal
            if done:
                break

        return t + 1, rewards[0:t + 1]
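SARSA(lambda) additionally relies on Q.update_all, which applies a whole vector of per-action errors to one state in a single call. Continuing the hypothetical TabularSketch from example #1 (again an assumption, not the project's actual class):

    class TabularWithTraces(TabularSketch):
        """Adds the update_all call used by the eligibility-trace variant above."""

        def update_all(self, state, errors):
            # Q(s, .) <- Q(s, .) + alpha * errors, one entry per action
            self.table[state] += self.alpha * np.asarray(errors, dtype=float)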
Code example #4
File: MonteCarlo.py  Project: mike-gimelfarb/mfpy
    def run_episode(self, Q: Tabular, task: Task, policy: Policy):

        # to compute backup
        rewards = np.zeros(self.episode_length, dtype=float)

        # simulate a trajectory
        t, episode = self.sample_episode(Q, task, policy, rewards)

        # repeat for each step of episode
        G = 0.0
        for state, action, reward in episode[::-1]:

            # update cumulative reward
            G = reward + self.gamma * G

            # update Q
            delta = G - Q.values(state)[action]
            Q.update(state, action, delta)

        return t, rewards[0:t]
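self.sample_episode is not shown in this excerpt. Judging only from how it is called above and from the other examples on this page, it presumably rolls out one trajectory, fills in the rewards array, and returns the step count together with a list of (state, action, reward) tuples. The reconstruction below is a guess, not the project's code; returning t + 1 keeps the caller's slice rewards[0:t] covering every observed reward:

    def sample_episode(self, Q, task, policy, rewards):
        # hypothetical reconstruction: roll out one episode and record the trajectory
        episode = []
        state = task.initial_state()
        t = 0
        for t in range(self.episode_length):
            action = policy.act(Q, task, state)
            new_state, reward, done = task.transition(state, action)
            rewards[t] = reward
            episode.append((state, action, reward))
            state = new_state
            if done:
                break
        return t + 1, episode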
Code example #5
 def run_episode(self, Q: Tabular, task: Task, epsilon: Epsilon, alpha: LearningRate):
 
     # to compute backup
     rewards = np.zeros(self.episode_length, dtype=np.float32)
     epsilons = np.zeros(self.episode_length, dtype=np.float32)
     
     # initialize state
     state = task.initial_state()
     
     # choose action from state using policy derived from Q
     epsilon_t = epsilon.get_epsilon(state)
     action = self.epsilon_greedy(Q, task, state, epsilon_t) 
     
     # repeat for each step of episode
     for t in range(self.episode_length):
             
         # take action and observe reward and new state
         new_state, reward, done = task.transition(state, action) 
         rewards[t] = reward  
         
         # choose new action from new state using policy derived from Q
         epsilons[t] = epsilon_t = epsilon.get_epsilon(new_state)
         new_action = self.epsilon_greedy(Q, task, new_state, epsilon_t) 
         
         # compute model means for exploration
         G_Q = reward + self.gamma * Q.max_value(new_state)
         G_U = reward + self.gamma * np.mean(Q.values(new_state))
         
         # update Q     
         old_Q = Q.values(state)[action]
         delta = (1.0 - epsilon_t) * G_Q + epsilon_t * G_U - Q.values(state)[action]
         Q.alpha = alpha.get_alpha(state) 
         Q.update(state, action, delta)
         new_Q = Q.values(state)[action]
         
         # update epsilon
         epsilon.update_from_experts(state, data=(G_Q, G_U, new_Q - old_Q)) 
         
         # update learning rate
         alpha.update_alpha(state, None)
         
         # update state and action
         state, action = new_state, new_action
     
         # until state is terminal
         if done:
             break
     
     return t + 1, rewards[0:t + 1], epsilons[0:t]
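The target here blends the greedy (Q-learning) backup G_Q with a uniform-policy backup G_U, weighted by the current exploration rate epsilon_t. A standalone illustration of that combination with made-up numbers:

    import numpy as np

    gamma, eps, reward = 0.95, 0.1, 1.0
    q_next = np.array([0.2, 0.7, 0.4])      # toy Q(s', .) over three actions

    G_Q = reward + gamma * q_next.max()     # greedy backup
    G_U = reward + gamma * q_next.mean()    # uniform (random-policy) backup
    target = (1.0 - eps) * G_Q + eps * G_U  # same convex combination as in delta above
    print(G_Q, G_U, target)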
Code example #6
 def evaluate(self, Q: Tabular, task: Task):
     steps = self.episode_length
     rewards = np.zeros(steps, dtype=np.float32)
     state = task.initial_state(training=False)
     for t in range(steps):
         action = Q.max_action(state)
         new_state, reward, done = task.transition(state, action)
         rewards[t] = reward
         if done:
             break
         state = new_state
     gamma = self.gamma
     result = 0.0
     for s in range(t, -1, -1):
         result = rewards[s] + gamma * result
     return result, t + 1
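The final loop folds the recorded rewards into a single discounted return, result = sum_s gamma**s * rewards[s]. A quick standalone check of that backward recursion against the closed form, on toy numbers:

    import numpy as np

    gamma = 0.9
    rewards = np.array([1.0, 0.0, 2.0])

    # backward recursion, as in evaluate
    result = 0.0
    for r in rewards[::-1]:
        result = r + gamma * result

    # direct discounted sum for comparison
    direct = float(np.sum(rewards * gamma ** np.arange(len(rewards))))
    print(result, direct)                   # both 2.62 up to floating-point error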
Code example #7
def main():

    # set up the domain
    maze = np.array([[0, 0, 0, 0, 5], [0, 2, 0, 0, 0], [0, 0, 0, 0, 0],
                     [3, 0, 0, 0, 1], [0, 0, 0, 0, 4]])
    domain = TourDeFlags(maze, (4, 0))

    # See "Policy invariance under reward transformations:
    # Theory and application to reward shaping" (Ng et al., 1999)
    zero_shape = lambda state: 0.0
    heuristic_shape = lambda state: -22.0 * ((5.0 - state[2] - 0.5) / 5.0)
    bad_shape = lambda state: -heuristic_shape(state)
    random_shape = lambda state: random.random() * 40.0 - 20.0

    def good_shape(state):
        row, col, flags = state
        if flags == 0:
            return -abs(row - 3.0) - abs(col - 4.0) - 17.0
        elif flags == 1:
            return -abs(row - 1.0) - abs(col - 1.0) - 12.0
        elif flags == 2:
            return -abs(row - 3.0) - abs(col - 0.0) - 9.0
        elif flags == 3:
            return -abs(row - 4.0) - abs(col - 4.0) - 4.0
        elif flags == 4:
            return -abs(row - 0.0) - abs(col - 4.0)
        else:
            return 0.0

    # set up the experts
    experts = [
        zero_shape, heuristic_shape, good_shape, random_shape, bad_shape
    ]

    # we will define the success criterion and stopping rule
    def steps_to_goal(tdf, policy, enc):
        _, _, steps = tdf.rewards(tdf.max_steps, policy, 1.0, enc)
        return steps, False

    # decide here whether we will use deep learning or tabular
    use_deep = True

    if use_deep:

        # one-hot encoding
        encoding = domain.default_encoding

        # set up the neural network as the function approximator
        input_dim = domain.width + domain.height + (1 + domain.goals)

        def network_initializer():
            model = Sequential()
            model.add(Dense(25, input_shape=(input_dim, )))
            model.add(LeakyReLU())
            model.add(Dense(25))
            model.add(LeakyReLU())
            model.add(Dense(4, activation='linear'))
            model.compile(optimizer=Adam(0.001), loss='mse')
            return model

        # set up the learning agent
        agent = DeepQ(state_size=input_dim,
                      action_size=4,
                      discount=1.0,
                      build_model=network_initializer,
                      batch_size=16,
                      epochs=5,
                      memory_size=10000)

        # set up the exploration schedule for the greedy epsilon policy
        eps_schedule = DecayingEpsilon(0.08, 0.08, 1.0)

    else:

        # no encoding
        encoding = lambda state: state

        # set up the tabular
        agent = Tabular(state_size=3,
                        action_size=4,
                        method='sarsa',
                        discount=1.0,
                        learning_rate=0.36,
                        lr_decay=1.0)

        # set up the exploration schedule for the greedy epsilon policy
        eps_schedule = DecayingEpsilon(1.0, 0.0, 0.98)

    # finally create the training algorithm
    training = Training(domain, agent, encoding)

    # we start by training for 1 trial on all the experts and sarsa
    _ = training.run_many(trials=1,
                          episodes=200,
                          time_limit=200,
                          exploration=eps_schedule,
                          experts=experts,
                          measure=steps_to_goal,
                          online=1,
                          offline=0)
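The Ng et al. (1999) reference above concerns potential-based shaping: a potential Phi(s) induces a bonus F(s, s') = gamma * Phi(s') - Phi(s) added to the environment reward, which provably leaves the optimal policy unchanged. Whether Training applies the experts exactly this way is not visible in this excerpt; the helper below is only a minimal illustration of that construction, and its name is ours:

    def shaping_reward(potential, state, new_state, gamma=1.0):
        # potential-based shaping (Ng et al., 1999): F(s, s') = gamma * Phi(s') - Phi(s)
        return gamma * potential(new_state) - potential(state)

    # e.g. shaped_r = env_reward + shaping_reward(good_shape, state, new_state)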
Code example #8
def main():

    # create the domain
    domain = CartPole()

    # use default encoding for the states
    encoding = domain.default_encoding

    # let's import the pre-trained networks to use as the experts
    model_good = keras.models.load_model(
        '/home/michael/eclipse-workspace/RewardShaping/domains/cartpole_model_6_by_2.h5'
    )

    def predict(state):
        state = np.reshape(state, (1, -1))
        return model_good.predict(state)[0]

    # now we will define the shaping functions from these networks
    ok_shape = lambda state: df * (1.0 - abs(state[2]) / FIFTEEN_DEGREES)
    good_shape = lambda state: np.amax(predict(state))
    bad_shape = lambda state: -good_shape(state)
    random_shape = lambda state: np.random.random() * 2.0 * df - df
    zero_shape = lambda state: 0.0

    # we will train with all of the experts and sarsa
    experts = [zero_shape, ok_shape, good_shape, bad_shape, random_shape]

    # we will define the success criterion and stopping rule
    def steps_balanced(cartpole, policy, enc):
        _, _, steps = cartpole.rewards(500, policy, discount, enc)
        return steps, False

    # decide here whether we will use deep learning or tabular
    use_deep = True

    if use_deep:

        # set up the neural network as the function approximator
        def build_model():
            model = Sequential()
            model.add(
                Dense(12,
                      input_dim=4,
                      activation='relu',
                      kernel_regularizer=l2(1e-6)))
            model.add(Dense(12, activation='relu',
                            kernel_regularizer=l2(1e-6)))
            model.add(
                Dense(2, activation='linear', kernel_regularizer=l2(1e-6)))
            model.compile(loss='mse', optimizer=Adam(lr=0.0005))
            return model

        # set up the exploration schedule for the greedy epsilon policy
        eps_schedule = DecayingEpsilon(1.0, 0.01, 0.98)

        # set up the learning agent
        agent = DeepQ(state_size=4,
                      action_size=2,
                      discount=discount,
                      build_model=build_model,
                      batch_size=32,
                      epochs=1,
                      memory_size=10000)

        # finally create the training algorithm
        training = Training(domain, agent, encoding)

        # train
        _ = training.run_many(trials=1,
                              episodes=300,
                              time_limit=500,
                              exploration=eps_schedule,
                              experts=experts,
                              measure=steps_balanced,
                              online=0,
                              offline=100)

    else:

        encoding = domain.discretize
        agent = Tabular(state_size=4,
                        action_size=2,
                        discount=discount,
                        method='q',
                        learning_rate=0.5,
                        lr_decay=0.99)

        # set up the exploration schedule for the greedy epsilon policy
        eps_schedule = DecayingEpsilon(1.0, 0.01, 0.98)

        # finally create the training algorithm
        training = Training(domain, agent, encoding)

        # train
        _ = training.run_many(trials=1,
                              episodes=300,
                              time_limit=500,
                              exploration=eps_schedule,
                              experts=experts,
                              measure=steps_balanced,
                              online=1,
                              offline=0)
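df, discount, and FIFTEEN_DEGREES are module-level names that this excerpt uses but never defines. Placeholder definitions such as the following make the shaping lambdas runnable in isolation; the project's real values are not shown here, so these are illustrative guesses only:

    import math

    discount = 0.99                            # discount factor handed to the agent
    df = 20.0                                  # assumed scale of the ok_shape / random_shape signals
    FIFTEEN_DEGREES = 15.0 * math.pi / 180.0   # assumed pole-angle threshold in radians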
Code example #9
    from keras import backend as K
    K.clear_session()

    model = Sequential()
    model.add(Dense(25, input_dim=sum(maze.shape) + 1 + num_goals))
    model.add(Activation('relu'))
    model.add(Dense(25))
    model.add(Activation('relu'))
    model.add(Dense(4))
    model.add(Activation('linear'))
    model.compile(loss='mse', optimizer=Adam(0.001))
    return model


# agent
agent = Tabular(domain.valid_actions(),
                randomizer=lambda n: np.random.randn(n) * 0.1)
# agent = DQN(sum(maze.shape) + 1 + num_goals, 4, model_lambda=model_lambda,
#            batch_size=24, epochs=5, memory_size=2000)

# algorithm
trainer = ExpectedSarsa(discount=0.99, episode_length=200)
# trainer = DeepQLearning(discount=0.99, episode_length=200, encoding=domain.default_encoding)

# run
perf, eps = trainer.train_many(
    agent,
    domain,
    # VDBE(0.5, 1.0 / 4, 0.05),
    BMCRobust(mu=0, tau=1, a=500, b=500, alpha=1, beta=1 + 0.01),
    FixedLearningRate(0.7),
    episodes=500,