def run_episode(self, Q: Tabular, task: Task, policy: Policy):

    # to compute backup
    rewards = np.zeros(self.episode_length, dtype=float)

    # initialize state
    state = task.initial_state()

    # repeat for each step of episode
    for t in range(self.episode_length):

        # choose action from state using policy derived from Q
        action = policy.act(Q, task, state)

        # take action and observe reward and new state
        new_state, reward, done = task.transition(state, action)
        rewards[t] = reward

        # update Q
        delta = reward + self.gamma * Q.max_value(new_state) - Q.values(state)[action]
        Q.update(state, action, delta)

        # update state
        state = new_state

        # until state is terminal
        if done:
            break

    return t, rewards[0:t]
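# For reference, a minimal self-contained sketch of the same tabular Q-learning backup
# (delta = r + gamma * max_a' Q(s', a') - Q(s, a)) on plain NumPy arrays. The toy chain
# MDP, step function, and hyperparameters below are illustrative assumptions and are not
# part of this project's Tabular/Task/Policy API.
import numpy as np

n_states, n_actions = 5, 2          # assumed toy chain: actions 0 = left, 1 = right
gamma, alpha, eps = 0.9, 0.5, 0.1
Q_table = np.zeros((n_states, n_actions))

def step(state, action):
    new_state = min(state + 1, n_states - 1) if action == 1 else max(state - 1, 0)
    done = new_state == n_states - 1
    return new_state, (1.0 if done else 0.0), done

rng = np.random.default_rng(0)
for _ in range(200):
    state = 0
    for _ in range(50):
        # epsilon-greedy action selection
        action = int(rng.integers(n_actions)) if rng.random() < eps else int(np.argmax(Q_table[state]))
        new_state, reward, done = step(state, action)
        # same TD(0) backup as run_episode above
        delta = reward + gamma * np.max(Q_table[new_state]) - Q_table[state, action]
        Q_table[state, action] += alpha * delta
        state = new_state
        if done:
            break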
def run_episode(self, Q: Tabular, task: Task, policy: Policy):

    # to compute backups
    rewards = np.zeros(self.episode_length, dtype=float)
    states = [None] * self.episode_length
    actions = [None] * self.episode_length

    # initialize state
    state = task.initial_state()

    # repeat for each step of episode
    for t in range(self.episode_length):

        # choose action from state using policy derived from Q
        action = policy.act(Q, task, state)
        states[t], actions[t] = state, action

        # take action and observe reward and new state
        new_state, reward, done = task.transition(state, action)
        rewards[t] = reward

        # update state
        state = new_state

        # until state is terminal
        if done:
            break

    # initialize lambda-average of returns
    lambda_return = reward

    # repeat for each step of episode in reverse order
    T = t
    for t in range(T, -1, -1):

        # compute lambda-average of returns
        if t < T:
            lambda_return = rewards[t] + self.gamma * (
                (1.0 - self.decay) * Q.values(states[t + 1])[actions[t + 1]] +
                self.decay * lambda_return)

        # update Q
        delta = lambda_return - Q.values(states[t])[actions[t]]
        Q.update(states[t], actions[t], delta)

    return T, rewards[0:T]
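# The backward pass above implements the recursive lambda-return
#     G_t = r_t + gamma * ((1 - lambda) * Q(s_{t+1}, a_{t+1}) + lambda * G_{t+1}),
# with G_T equal to the final reward. A minimal sketch of just that recursion on plain
# arrays; the trajectory values below are made-up illustrations, not project data.
import numpy as np

gamma, lam = 0.99, 0.8
rewards = np.array([0.0, 0.0, 1.0])   # illustrative episode rewards
q_taken = np.array([0.2, 0.5, 0.9])   # illustrative Q(s_t, a_t) along the trajectory

T = len(rewards) - 1
lambda_return = rewards[T]
for t in range(T - 1, -1, -1):
    # blend the one-step bootstrap with the longer lambda-return, as in the loop above
    lambda_return = rewards[t] + gamma * (
        (1.0 - lam) * q_taken[t + 1] + lam * lambda_return)
print(lambda_return)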
def run_episode(self, Q: Tabular, task: Task, policy: Policy):

    # to compute backups
    rewards = np.zeros(self.episode_length, dtype=float)

    # initialize the e(s, a) matrix
    # note: there is an error in Sutton and Barto since e is reset each episode
    e = defaultdict(lambda: np.zeros(task.valid_actions(), dtype=float))

    # initialize state and action
    state = task.initial_state()
    action = policy.act(Q, task, state)

    # repeat for each step of episode
    for t in range(self.episode_length):

        # take action and observe reward and new state
        new_state, reward, done = task.transition(state, action)
        rewards[t] = reward

        # choose action from state using policy derived from Q
        new_action = policy.act(Q, task, new_state)

        # accumulate the trace for the visited pair
        e[state][action] += 1.0

        # compute the TD error, update Q for all traced pairs, and decay the traces
        delta = reward + self.gamma * Q.values(new_state)[new_action] - Q.values(state)[action]
        for s in e.keys():
            errors = e[s] * delta
            Q.update_all(s, errors)
            e[s] *= self.gamma * self.decay

        # update state and action
        state, action = new_state, new_action

        # until state is terminal
        if done:
            break

    return t, rewards[0:t]
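# A compact, self-contained sketch of the accumulating-trace (SARSA(lambda)) update used
# above, with the learning rate applied explicitly instead of inside Q.update_all. The
# states, actions, and rewards passed in at the bottom are illustrative assumptions.
import numpy as np
from collections import defaultdict

gamma, lam, alpha = 0.99, 0.9, 0.1
n_actions = 2
Q_table = defaultdict(lambda: np.zeros(n_actions))
traces = defaultdict(lambda: np.zeros(n_actions))

def sarsa_lambda_step(state, action, reward, new_state, new_action):
    # accumulate the trace for the visited pair, then propagate the TD error to every
    # traced pair in proportion to its trace, decaying the traces afterwards
    traces[state][action] += 1.0
    delta = reward + gamma * Q_table[new_state][new_action] - Q_table[state][action]
    for s in traces:
        Q_table[s] += alpha * delta * traces[s]
        traces[s] *= gamma * lam

sarsa_lambda_step('s0', 0, 0.0, 's1', 1)
sarsa_lambda_step('s1', 1, 1.0, 's2', 0)
print(dict(Q_table))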
def run_episode(self, Q: Tabular, task: Task, policy: Policy):

    # to compute backup
    rewards = np.zeros(self.episode_length, dtype=float)

    # simulate a trajectory
    t, episode = self.sample_episode(Q, task, policy, rewards)

    # repeat for each step of episode in reverse order
    G = 0.0
    for state, action, reward in episode[::-1]:

        # update cumulative reward
        G = reward + self.gamma * G

        # update Q
        delta = G - Q.values(state)[action]
        Q.update(state, action, delta)

    return t, rewards[0:t]
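# The loop above is the standard backward accumulation of the discounted Monte Carlo
# return G_t = r_t + gamma * G_{t+1}. A tiny sketch on a plain reward list (the reward
# values are illustrative):
gamma = 0.99
rewards = [0.0, 0.0, 1.0]
G, returns = 0.0, []
for r in reversed(rewards):
    G = r + gamma * G                # same recursion as the loop above
    returns.append(G)
returns.reverse()                    # returns[t] is the discounted return from step t
print(returns)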
def run_episode(self, Q: Tabular, task: Task, epsilon: Epsilon, alpha: LearningRate):

    # to compute backup
    rewards = np.zeros(self.episode_length, dtype=np.float32)
    epsilons = np.zeros(self.episode_length, dtype=np.float32)

    # initialize state
    state = task.initial_state()

    # choose action from state using policy derived from Q
    epsilon_t = epsilon.get_epsilon(state)
    action = self.epsilon_greedy(Q, task, state, epsilon_t)

    # repeat for each step of episode
    for t in range(self.episode_length):

        # take action and observe reward and new state
        new_state, reward, done = task.transition(state, action)
        rewards[t] = reward

        # choose new action from new state using policy derived from Q
        epsilons[t] = epsilon_t = epsilon.get_epsilon(new_state)
        new_action = self.epsilon_greedy(Q, task, new_state, epsilon_t)

        # compute model means for exploration
        G_Q = reward + self.gamma * Q.max_value(new_state)
        G_U = reward + self.gamma * np.mean(Q.values(new_state))

        # update Q
        old_Q = Q.values(state)[action]
        delta = (1.0 - epsilon_t) * G_Q + epsilon_t * G_U - Q.values(state)[action]
        Q.alpha = alpha.get_alpha(state)
        Q.update(state, action, delta)
        new_Q = Q.values(state)[action]

        # update epsilon
        epsilon.update_from_experts(state, data=(G_Q, G_U, new_Q - old_Q))

        # update learning rate
        alpha.update_alpha(state, None)

        # update state and action
        state, action = new_state, new_action

        # until state is terminal
        if done:
            break

    return t + 1, rewards[0:t + 1], epsilons[0:t]
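# The mixed target (1 - eps) * G_Q + eps * G_U above equals the expected-SARSA backup
# under an epsilon-greedy behaviour policy, since that policy picks the greedy action
# with probability (1 - eps) and a uniformly random action with probability eps.
# A minimal sketch of just the target; the Q-values and epsilon are illustrative.
import numpy as np

gamma, eps, reward = 0.99, 0.1, 0.0
q_next = np.array([0.2, 0.5, 0.3])          # illustrative Q(s', .) values

G_Q = reward + gamma * np.max(q_next)       # greedy bootstrap
G_U = reward + gamma * np.mean(q_next)      # uniform-random bootstrap
target = (1.0 - eps) * G_Q + eps * G_U
print(target)   # = reward + gamma * E_{a' ~ eps-greedy}[Q(s', a')]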
def evaluate(self, Q: Tabular, task: Task):
    steps = self.episode_length
    rewards = np.zeros(steps, dtype=np.float32)
    state = task.initial_state(training=False)
    # print(state)
    for t in range(steps):
        action = Q.max_action(state)
        new_state, reward, done = task.transition(state, action)
        # print('state = {}, reward = {}'.format(new_state, reward))
        rewards[t] = reward
        if done:
            break
        state = new_state
    gamma = self.gamma
    result = 0.0
    for s in range(t, -1, -1):
        result = rewards[s] + gamma * result
    return result, t + 1
def main():

    # set up the domain
    maze = np.array([[0, 0, 0, 0, 5],
                     [0, 2, 0, 0, 0],
                     [0, 0, 0, 0, 0],
                     [3, 0, 0, 0, 1],
                     [0, 0, 0, 0, 4]])
    domain = TourDeFlags(maze, (4, 0))

    # See "Policy invariance under reward transformations:
    # Theory and application to reward shaping" (Ng et al., 1999)
    zero_shape = lambda state: 0.0
    heuristic_shape = lambda state: -22.0 * ((5.0 - state[2] - 0.5) / 5.0)
    bad_shape = lambda state: -heuristic_shape(state)
    random_shape = lambda state: random.random() * 40.0 - 20.0

    def good_shape(state):
        row, col, flags = state
        if flags == 0:
            return -abs(row - 3.0) - abs(col - 4.0) - 17.0
        elif flags == 1:
            return -abs(row - 1.0) - abs(col - 1.0) - 12.0
        elif flags == 2:
            return -abs(row - 3.0) - abs(col - 0.0) - 9.0
        elif flags == 3:
            return -abs(row - 4.0) - abs(col - 4.0) - 4.0
        elif flags == 4:
            return -abs(row - 0.0) - abs(col - 4.0)
        else:
            return 0.0

    # set up the experts
    experts = [zero_shape, heuristic_shape, good_shape, random_shape, bad_shape]

    # we will define the success criterion and stopping rule
    def steps_to_goal(tdf, policy, enc):
        _, _, steps = tdf.rewards(tdf.max_steps, policy, 1.0, enc)
        return steps, False

    # decide here whether we will use deep learning or tabular
    use_deep = True
    if use_deep:

        # one-hot encoding
        encoding = domain.default_encoding

        # set up the neural network as the function approximator
        input_dim = domain.width + domain.height + (1 + domain.goals)

        def network_initializer():
            model = Sequential()
            model.add(Dense(25, input_shape=(input_dim, )))
            model.add(LeakyReLU())
            model.add(Dense(25))
            model.add(LeakyReLU())
            model.add(Dense(4, activation='linear'))
            model.compile(optimizer=Adam(0.001), loss='mse')
            return model

        # set up the learning agent
        agent = DeepQ(state_size=input_dim,
                      action_size=4,
                      discount=1.0,
                      build_model=network_initializer,
                      batch_size=16,
                      epochs=5,
                      memory_size=10000)

        # set up the exploration schedule for the epsilon-greedy policy
        eps_schedule = DecayingEpsilon(0.08, 0.08, 1.0)
    else:

        # no encoding
        encoding = lambda state: state

        # set up the tabular agent
        agent = Tabular(state_size=3,
                        action_size=4,
                        method='sarsa',
                        discount=1.0,
                        learning_rate=0.36,
                        lr_decay=1.0)

        # set up the exploration schedule for the epsilon-greedy policy
        eps_schedule = DecayingEpsilon(1.0, 0.0, 0.98)

    # finally create the training algorithm
    training = Training(domain, agent, encoding)

    # we start by training for 1 trial on all the experts and sarsa
    _ = training.run_many(trials=1,
                          episodes=200,
                          time_limit=200,
                          exploration=eps_schedule,
                          experts=experts,
                          measure=steps_to_goal,
                          online=1,
                          offline=0)
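# The functions above are shaping potentials Phi(s). In the potential-based scheme of
# Ng et al. (1999) cited above, the shaped reward adds F(s, s') = gamma * Phi(s') - Phi(s)
# to the environment reward, which leaves the optimal policy unchanged. A minimal sketch
# of that wrapping; the Task-style transition call in the usage comment is an assumption
# about this project's API, not a confirmed interface.
def shaped_reward(reward, state, new_state, potential, gamma=1.0):
    # potential-based shaping term: F(s, s') = gamma * Phi(s') - Phi(s)
    return reward + gamma * potential(new_state) - potential(state)

# hypothetical usage with the good_shape potential from main():
#   new_state, reward, done = domain.transition(state, action)
#   reward = shaped_reward(reward, state, new_state, good_shape, gamma=1.0)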
def main():

    # create the domain
    domain = CartPole()

    # use default encoding for the states
    encoding = domain.default_encoding

    # let's import the pre-trained network to use as an expert
    model_good = keras.models.load_model(
        '/home/michael/eclipse-workspace/RewardShaping/domains/cartpole_model_6_by_2.h5')

    def predict(state):
        state = np.reshape(state, (1, -1))
        return model_good.predict(state)[0]

    # now we will define the shaping functions from these networks
    ok_shape = lambda state: df * (1.0 - abs(state[2]) / FIFTEEN_DEGREES)
    good_shape = lambda state: np.amax(predict(state))
    bad_shape = lambda state: -good_shape(state)
    random_shape = lambda state: np.random.random() * 2.0 * df - df
    zero_shape = lambda state: 0.0

    # we will train on all the experts and sarsa
    experts = [zero_shape, ok_shape, good_shape, bad_shape, random_shape]

    # we will define the success criterion and stopping rule
    def steps_balanced(cartpole, policy, enc):
        _, _, steps = cartpole.rewards(500, policy, discount, enc)
        return steps, False

    # decide here whether we will use deep learning or tabular
    use_deep = True
    if use_deep:

        # set up the neural network as the function approximator
        def build_model():
            model = Sequential()
            model.add(Dense(12, input_dim=4, activation='relu',
                            kernel_regularizer=l2(1e-6)))
            model.add(Dense(12, activation='relu', kernel_regularizer=l2(1e-6)))
            model.add(Dense(2, activation='linear', kernel_regularizer=l2(1e-6)))
            model.compile(loss='mse', optimizer=Adam(lr=0.0005))
            return model

        # set up the exploration schedule for the epsilon-greedy policy
        eps_schedule = DecayingEpsilon(1.0, 0.01, 0.98)

        # set up the learning agent
        agent = DeepQ(state_size=4,
                      action_size=2,
                      discount=discount,
                      build_model=build_model,
                      batch_size=32,
                      epochs=1,
                      memory_size=10000)

        # finally create the training algorithm
        training = Training(domain, agent, encoding)

        # train
        _ = training.run_many(trials=1,
                              episodes=300,
                              time_limit=500,
                              exploration=eps_schedule,
                              experts=experts,
                              measure=steps_balanced,
                              online=0,
                              offline=100)
    else:
        encoding = domain.discretize
        agent = Tabular(state_size=4,
                        action_size=2,
                        discount=discount,
                        method='q',
                        learning_rate=0.5,
                        lr_decay=0.99)

        # set up the exploration schedule for the epsilon-greedy policy
        eps_schedule = DecayingEpsilon(1.0, 0.01, 0.98)

        # finally create the training algorithm
        training = Training(domain, agent, encoding)

        # train
        _ = training.run_many(trials=1,
                              episodes=300,
                              time_limit=500,
                              exploration=eps_schedule,
                              experts=experts,
                              measure=steps_balanced,
                              online=1,
                              offline=0)
from keras import backend as K


def model_lambda():
    K.clear_session()
    model = Sequential()
    model.add(Dense(25, input_dim=sum(maze.shape) + 1 + num_goals))
    model.add(Activation('relu'))
    model.add(Dense(25))
    model.add(Activation('relu'))
    model.add(Dense(4))
    model.add(Activation('linear'))
    model.compile(loss='mse', optimizer=Adam(0.001))
    return model


# agent
agent = Tabular(domain.valid_actions(),
                randomizer=lambda n: np.random.randn(n) * 0.1)
# agent = DQN(sum(maze.shape) + 1 + num_goals, 4, model_lambda=model_lambda,
#             batch_size=24, epochs=5, memory_size=2000)

# algorithm
trainer = ExpectedSarsa(discount=0.99, episode_length=200)
# trainer = DeepQLearning(discount=0.99, episode_length=200, encoding=domain.default_encoding)

# run
perf, eps = trainer.train_many(
    agent,
    domain,
    # VDBE(0.5, 1.0 / 4, 0.05),
    BMCRobust(mu=0, tau=1, a=500, b=500, alpha=1, beta=1 + 0.01),
    FixedLearningRate(0.7),
    episodes=500,