    # Get predicted action values from the actor
    predicts = self.actor.predict([obs, advantage_values, predictions])
    return np.argmax(predicts[0])

    def play(self, episode):
        obs = self.environment.reset()
        while True:
            action = self.get_action(obs)
            next_obs, _, is_done = self.environment.step(action)
            if episode == TEST_EPISODE_COUNT - 1:
                self.environment.render()
            obs = next_obs
            if is_done:
                break

    def test(self):
        for episode in range(TEST_EPISODE_COUNT):
            self.play(episode)


if __name__ == '__main__':
    agent = Agent(PongEnvironment(True))
    for test in range(TEST_COUNT):
        agent.test()
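# A minimal illustrative sketch of the dummy inputs this actor needs at
# inference time: `advantage` and `old_prediction` are only consumed inside
# the custom PPO loss, so prediction feeds zero-filled placeholders. The
# shapes and the NUM_ACTIONS value below are assumptions inferred from how
# `actor.predict` is called in the training code, not confirmed values from
# this repository.
import numpy as np

NUM_ACTIONS = 3                            # e.g., Pong's three actions (assumed)
DUMMY_VALUE = np.zeros((1, 1))             # stands in for the advantage input
DUMMY_ACTION = np.zeros((1, NUM_ACTIONS))  # stands in for the old-prediction input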
class Agent:
    def __init__(self):
        self.critic = self.build_critic()
        if CONTINUOUS:
            self.actor = self.build_actor_continuous()
        else:
            self.actor = self.build_actor()
        # self.env = gym.make(ENV)
        self.env = PongEnvironment()
        print(self.env.action_space, 'action_space',
              self.env.observation_space, 'observation_space')
        self.episode = 0
        self.observation = self.env.reset()
        self.val = False
        self.reward = []
        self.reward_over_time = []
        self.name = self.get_name()
        self.writer = SummaryWriter(self.name)
        self.gradient_steps = 0

    def get_name(self):
        name = 'AllRuns/'
        if CONTINUOUS is True:
            name += 'continuous/'
        else:
            name += 'discrete/'
        name += ENV
        return name

    def build_actor(self):
        state_input = Input(shape=(NUM_STATE,))
        advantage = Input(shape=(1,))
        old_prediction = Input(shape=(NUM_ACTIONS,))

        x = Dense(HIDDEN_SIZE, activation='tanh')(state_input)
        for _ in range(NUM_LAYERS - 1):
            x = Dense(HIDDEN_SIZE, activation='tanh')(x)

        out_actions = Dense(NUM_ACTIONS, activation='softmax',
                            name='output')(x)

        model = Model(inputs=[state_input, advantage, old_prediction],
                      outputs=[out_actions])
        model.compile(optimizer=Adam(lr=LR),
                      loss=[proximal_policy_optimization_loss(
                          advantage=advantage,
                          old_prediction=old_prediction)])
        model.summary()
        return model

    def build_actor_continuous(self):
        state_input = Input(shape=(NUM_STATE,))
        advantage = Input(shape=(1,))
        old_prediction = Input(shape=(NUM_ACTIONS,))

        x = Dense(HIDDEN_SIZE, activation='tanh')(state_input)
        for _ in range(NUM_LAYERS - 1):
            x = Dense(HIDDEN_SIZE, activation='tanh')(x)

        out_actions = Dense(NUM_ACTIONS, name='output', activation='tanh')(x)

        model = Model(inputs=[state_input, advantage, old_prediction],
                      outputs=[out_actions])
        model.compile(optimizer=Adam(lr=LR),
                      loss=[proximal_policy_optimization_loss_continuous(
                          advantage=advantage,
                          old_prediction=old_prediction)])
        model.summary()
        return model

    def build_critic(self):
        state_input = Input(shape=(NUM_STATE,))
        x = Dense(HIDDEN_SIZE, activation='tanh')(state_input)
        for _ in range(NUM_LAYERS - 1):
            x = Dense(HIDDEN_SIZE, activation='tanh')(x)
        out_value = Dense(1)(x)

        model = Model(inputs=[state_input], outputs=[out_value])
        model.compile(optimizer=Adam(lr=LR), loss='mse')
        return model

    def reset_env(self):
        self.episode += 1
        if self.episode % 100 == 0:
            self.val = True
        else:
            self.val = False
        self.observation = self.env.reset()
        self.reward = []

    def get_action(self):
        p = self.actor.predict([
            self.observation.reshape(1, NUM_STATE), DUMMY_VALUE, DUMMY_ACTION
        ])
        if self.val is False:
            action = np.random.choice(NUM_ACTIONS, p=np.nan_to_num(p[0]))
        else:
            action = np.argmax(p[0])
        action_matrix = np.zeros(NUM_ACTIONS)
        action_matrix[action] = 1
        return action, action_matrix, p

    def get_action_continuous(self):
        p = self.actor.predict([
            self.observation.reshape(1, NUM_STATE), DUMMY_VALUE, DUMMY_ACTION
        ])
        if self.val is False:
            action = action_matrix = p[0] + np.random.normal(
                loc=0, scale=NOISE, size=p[0].shape)
        else:
            action = action_matrix = p[0]
        return action, action_matrix, p

    def transform_reward(self):
        if self.val is True:
            self.writer.add_scalar('Val episode reward',
                                   np.array(self.reward).sum(), self.episode)
        else:
            self.writer.add_scalar('Episode reward',
                                   np.array(self.reward).sum(), self.episode)
        for j in range(len(self.reward) - 2, -1, -1):
            self.reward[j] += self.reward[j + 1] * GAMMA

    def get_batch(self):
        batch = [[], [], [], []]
        tmp_batch = [[], [], []]
        untransformed_reward = list()
        while len(batch[0]) < BUFFER_SIZE:
            if CONTINUOUS:
                action, action_matrix, predicted_action = \
                    self.get_action_continuous()
            else:
                action, action_matrix, predicted_action = self.get_action()
            if self.gradient_steps % RENDER_EACH == 0:
                self.env.render()
            observation, reward, done = self.env.step(action)
            # observation, reward, done, info = self.env.step(action)
            self.reward.append(reward)
            untransformed_reward.append(reward)
            tmp_batch[0].append(self.observation)
            tmp_batch[1].append(action_matrix)
            tmp_batch[2].append(predicted_action)
            self.observation = observation
            if done:
                self.transform_reward()
                if self.val is False:
                    for i in range(len(tmp_batch[0])):
                        obs, action, pred = (tmp_batch[0][i], tmp_batch[1][i],
                                             tmp_batch[2][i])
                        r = self.reward[i]
                        batch[0].append(obs)
                        batch[1].append(action)
                        batch[2].append(pred)
                        batch[3].append(r)
                tmp_batch = [[], [], []]
                self.reset_env()

        obs = np.array(batch[0])
        action = np.array(batch[1])
        pred = np.array(batch[2])
        reward = np.reshape(np.array(batch[3]), (len(batch[3]), 1))
        pred = np.reshape(pred, (pred.shape[0], pred.shape[2]))
        return obs, action, pred, reward, untransformed_reward

    def run(self):
        while self.episode < EPISODES:
            obs, action, pred, reward, untransformed_reward = self.get_batch()
            obs = obs[:BUFFER_SIZE]
            action = action[:BUFFER_SIZE]
            pred = pred[:BUFFER_SIZE]
            reward = reward[:BUFFER_SIZE]

            old_prediction = pred
            pred_values = self.critic.predict(obs)

            advantage = reward - pred_values
            # advantage = (advantage - advantage.mean()) / advantage.std()
            actor_loss = self.actor.fit([obs, advantage, old_prediction],
                                        [action],
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                        epochs=EPOCHS,
                                        verbose=False)
            critic_loss = self.critic.fit([obs], [reward],
                                          batch_size=BATCH_SIZE,
                                          shuffle=True,
                                          epochs=EPOCHS,
                                          verbose=False)
            self.writer.add_scalar('Actor loss',
                                   actor_loss.history['loss'][-1],
                                   self.gradient_steps)
            self.writer.add_scalar('Critic loss',
                                   critic_loss.history['loss'][-1],
                                   self.gradient_steps)
            # print("Gradient Update:", self.gradient_steps, " Reward: ", sum(reward))
            print(f"E: {self.episode}\tgrad. update: {self.gradient_steps}"
                  f"\tReward: {sum(untransformed_reward)}")
            self.gradient_steps += 1
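# The commented-out line in run() above standardizes advantages before the
# actor update. A minimal standalone sketch of that normalization; the small
# epsilon guarding against zero variance is an assumption, not present in the
# original line.
import numpy as np


def normalize_advantage(advantage, eps=1e-8):
    """Scale a batch of advantages to zero mean and unit variance."""
    advantage = np.asarray(advantage, dtype=np.float64)
    return (advantage - advantage.mean()) / (advantage.std() + eps)


# Example: normalize_advantage([1.0, 2.0, 3.0]) -> [-1.2247, 0.0, 1.2247]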
import sys
import time
import pickle

sys.path.append("Environment")
from Environment import PongEnvironment
from PPO import *

if __name__ == '__main__':
    env = PongEnvironment(False)
    ppo = PPO(env, num_states=len(env.observe()), actions=np.arange(3))
    ppo.saver.restore(ppo.sess, "model_res/Pong_model.ckpt")
    actions = set()
    all_scores = list()
    for trial in range(10):
        score = 0
        for i in range(100):
            ep_actions = list()
            done = False
            s = env.reset()
            while not done:
                a = ppo.sess.run(
                    ppo.action,
                    {ppo.in_state: s.reshape(-1, ppo.state_space)})[0]
                ep_actions.append(a)
                # env.render()
                # time.sleep(1e-3)
                try:
                    s, r, done = env.step(a)
                except ValueError:
class Agent:
    def __init__(self, load_model=False, testing=False):
        self.critic = self.build_critic()
        if CONTINUOUS is False:
            self.actor = self.build_actor()
        else:
            self.actor = self.build_actor_continuous()
        self.env = PongEnvironment()
        self.episode = 0
        self.testing = testing
        if not self.testing:
            self.NUM_EPISODE = EPISODES
        else:
            self.NUM_EPISODE = 100
        self.observation = self.env.reset()
        self.val = False
        self.reward = []
        self.reward_over_time = []
        self.gradient_steps = 0
        self.action_noise = NOISE
        self.load_model = load_model
        if self.load_model:
            self.actor.load_weights("./weights/actor_weights.h5")
            self.critic.load_weights("./weights/critic_weights.h5")

    def build_actor(self):
        state_input = Input(shape=(NUM_STATE,))
        advantage = Input(shape=(1,))
        old_prediction = Input(shape=(NUM_ACTIONS,))

        x = Dense(HIDDEN_SIZE, activation='tanh')(state_input)
        for _ in range(NUM_LAYERS - 1):
            x = Dense(HIDDEN_SIZE, activation='tanh')(x)

        out_actions = Dense(NUM_ACTIONS, activation='softmax',
                            name='output')(x)

        model = Model(inputs=[state_input, advantage, old_prediction],
                      outputs=[out_actions])
        model.compile(optimizer=Adam(lr=LR),
                      loss=[self.proximal_policy_optimization_loss(
                          advantage=advantage,
                          old_prediction=old_prediction)])
        model.summary()
        return model

    def build_actor_continuous(self):
        state_input = Input(shape=(NUM_STATE,))
        advantage = Input(shape=(1,))
        old_prediction = Input(shape=(NUM_ACTIONS,))

        x = Dense(HIDDEN_SIZE, activation='tanh')(state_input)
        for _ in range(NUM_LAYERS - 1):
            x = Dense(HIDDEN_SIZE, activation='tanh')(x)

        out_actions = Dense(NUM_ACTIONS, name='output', activation='tanh')(x)

        model = Model(inputs=[state_input, advantage, old_prediction],
                      outputs=[out_actions])
        model.compile(optimizer=Adam(lr=LR),
                      loss=[self.proximal_policy_optimization_loss_continuous(
                          advantage=advantage,
                          old_prediction=old_prediction)])
        model.summary()
        return model

    def build_critic(self):
        state_input = Input(shape=(NUM_STATE,))
        x = Dense(HIDDEN_SIZE, activation='tanh')(state_input)
        for _ in range(NUM_LAYERS - 1):
            x = Dense(HIDDEN_SIZE, activation='tanh')(x)
        out_value = Dense(1)(x)

        model = Model(inputs=[state_input], outputs=[out_value])
        model.compile(optimizer=Adam(lr=LR), loss='mse')
        return model

    def reset_env(self):
        self.episode += 1
        if self.episode % VALIDATION_EACH == 0:
            self.val = True
        else:
            self.val = False
        self.observation = self.env.reset()
        self.reward = []

    def get_action(self):
        p = self.actor.predict([
            self.observation.reshape(1, NUM_STATE), DUMMY_VALUE, DUMMY_ACTION
        ])
        if self.val is False:
            action = np.random.choice(NUM_ACTIONS, p=np.nan_to_num(p[0]))
        else:
            action = np.argmax(p[0])
        action_matrix = np.zeros(NUM_ACTIONS)
        action_matrix[action] = 1
        return action, action_matrix, p

    def get_action_continuous(self):
        p = self.actor.predict([
            self.observation.reshape(1, NUM_STATE), DUMMY_VALUE, DUMMY_ACTION
        ])
        if self.val is False:
            action = action_matrix = p[0] + np.random.normal(
                loc=0, scale=NOISE, size=p[0].shape)
        else:
            action = action_matrix = p[0]
        return action, action_matrix, p

    def transform_reward(self):
        for j in range(len(self.reward) - 2, -1, -1):
            self.reward[j] += self.reward[j + 1] * GAMMA

    def get_batch(self):
        """
        Sometimes this rollout exceeds the buffer size, and that is normal.
        For example, the buffer size may be 250, yet we may not observe any
        done until step 250; the rollout then continues until we see a done
        (either a goal being reached or a time-limit termination). This could
        be changed by counting steps and checking that count against the
        buffer size.
        """
        batch = [[], [], [], []]
        tmp_batch = [[], [], []]
        done = False
        untransformed_reward = []
        while len(batch[0]) < BUFFER_SIZE:
            if CONTINUOUS is False:
                action, action_matrix, predicted_action = self.get_action()
            else:
                action, action_matrix, predicted_action = \
                    self.get_action_continuous()
            observation, reward, done = self.env.step(action)
            untransformed_reward.append(reward)
            if self.gradient_steps % RENDER_EACH == 0:
                self.env.render()
            self.reward.append(reward)
            tmp_batch[0].append(self.observation)
            tmp_batch[1].append(action_matrix)
            tmp_batch[2].append(predicted_action)
            self.observation = observation
            if done:
                self.transform_reward()
                if self.val is False:
                    for i in range(len(tmp_batch[0])):
                        obs, action, pred = (tmp_batch[0][i], tmp_batch[1][i],
                                             tmp_batch[2][i])
                        r = self.reward[i]
                        batch[0].append(obs)
                        batch[1].append(action)
                        batch[2].append(pred)
                        batch[3].append(r)
                tmp_batch = [[], [], []]
                self.reset_env()

        obs, action, pred, reward = np.array(batch[0]), np.array(batch[1]), \
            np.array(batch[2]), np.reshape(np.array(batch[3]),
                                           (len(batch[3]), 1))
        pred = np.reshape(pred, (pred.shape[0], pred.shape[2]))
        return obs, action, pred, reward, untransformed_reward

    def run(self):
        # Note that in PPO, episodes are not counted; instead, we do a rollout
        # of K steps and learn from that.
        while self.episode < self.NUM_EPISODE:
            # In the original code, these arrays are clipped to BUFFER_SIZE
            # elements, but I found that it performs better this way, so I
            # updated it. -Emir
            obs, action, pred, reward, untransformed_reward = self.get_batch()
            old_prediction = pred
            pred_values = self.critic.predict(obs)

            advantage = reward - pred_values
            if not self.testing:
                # advantage = (advantage - advantage.mean()) / advantage.std()
                actor_loss = self.actor.fit([obs, advantage, old_prediction],
                                            [action],
                                            batch_size=BATCH_SIZE,
                                            shuffle=True,
                                            epochs=EPOCHS,
                                            verbose=False)
                critic_loss = self.critic.fit([obs], [reward],
                                              batch_size=BATCH_SIZE,
                                              shuffle=True,
                                              epochs=EPOCHS,
                                              verbose=False)
            print("Gradient Update:", self.gradient_steps, " Reward: ",
                  sum(untransformed_reward))
            self.gradient_steps += 1

        if not self.testing:
            self.save_weights("./weights")

    def save_weights(self, fpath):
        self.actor.save_weights(
            filepath=os.path.join(fpath, "actor_weights.h5"))
        self.critic.save_weights(
            filepath=os.path.join(fpath, "critic_weights.h5"))

    @staticmethod
    def proximal_policy_optimization_loss(advantage, old_prediction):
        def loss(y_true, y_pred):
            prob = y_true * y_pred
            old_prob = y_true * old_prediction
            r = prob / (old_prob + 1e-10)
            return -K.mean(
                K.minimum(
                    r * advantage,
                    K.clip(r,
                           min_value=1 - LOSS_CLIPPING,
                           max_value=1 + LOSS_CLIPPING) * advantage) +
                ENTROPY_LOSS * -(prob * K.log(prob + 1e-10)))

        return loss

    @staticmethod
    def proximal_policy_optimization_loss_continuous(advantage,
                                                     old_prediction):
        def loss(y_true, y_pred):
            var = K.square(NOISE)
            pi = 3.1415926
            denom = K.sqrt(2 * pi * var)
            prob_num = K.exp(-K.square(y_true - y_pred) / (2 * var))
            old_prob_num = K.exp(-K.square(y_true - old_prediction) /
                                 (2 * var))

            prob = prob_num / denom
            old_prob = old_prob_num / denom
            r = prob / (old_prob + 1e-10)

            return -K.mean(
                K.minimum(
                    r * advantage,
                    K.clip(r,
                           min_value=1 - LOSS_CLIPPING,
                           max_value=1 + LOSS_CLIPPING) * advantage))

        return loss
os.makedirs("model") except FileExistsError: pass def update_plot(x, y): plt.cla() ax.plot(x, y) plt.pause(1e-4) fig.tight_layout() EPOCHS = int(2500) # maximum number of updates ENVIRONMENT = "Pong" if __name__ == "__main__": env = PongEnvironment(False) num_states = len(env.observe()) ppo = PPO(env, num_states=num_states, actions=np.arange(3)) rewards = list() steps_count = 0 eps = 0 for e in tqdm(range(1, EPOCHS + 1)): actions_set, avg_rews, steps, ep_count = ppo.update() steps_count += steps eps += ep_count rewards.append(avg_rews) if e % 10 == 0: x = range(0, len(rewards), 10) update_plot(x, [rewards[i] for i in x]) print(
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = PongEnvironment()
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()        # wait until PPO is updated
                    # clear history buffer; use the new policy to collect data
                    buffer_s, buffer_a, buffer_r = [], [], []
                a = self.ppo.choose_action(s)
                s_, r, done = self.env.step(a)
                # if done: r = -10
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                s = s_
                ep_r += r

                # count toward the minimum batch size; no need to wait for
                # the other workers
                GLOBAL_UPDATE_COUNTER += 1
                if (t == EP_LEN - 1
                        or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or done):
                    if done:
                        v_s_ = 0  # end of episode
                    else:
                        v_s_ = self.ppo.get_v(s_)
                    discounted_r = []  # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), \
                        np.array(discounted_r)[:, None]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()     # global PPO update

                    if GLOBAL_EP >= EP_MAX:    # stop training
                        COORD.request_stop()
                        break
                if done:
                    break

            # record reward changes; plot later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 +
                                        ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
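# work() accumulates discounted returns by walking the reward buffer
# backwards from the bootstrap value v_s_. The same computation as a small
# standalone helper; the function name is illustrative, not from this file.
def discounted_returns(rewards, v_last, gamma):
    """Backward pass: G_t = r_t + gamma * G_{t+1}, seeded with a bootstrap value."""
    returns = []
    g = v_last
    for r in reversed(rewards):
        g = r + gamma * g
        returns.append(g)
    returns.reverse()
    return returns


# Example: discounted_returns([0, 0, 1], v_last=0.5, gamma=0.99)
# -> [1.4652495, 1.48005, 1.495]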
import matplotlib.pyplot as plt
import gym, threading, queue
import numpy as np
import tensorflow as tf

from Environment import PongEnvironment

EP_MAX = 10000
EP_LEN = 10000
N_WORKER = 8            # parallel workers
GAMMA = 0.99            # reward discount factor
A_LR = 0.0001           # learning rate for the actor
C_LR = 0.0001           # learning rate for the critic
MIN_BATCH_SIZE = 512    # minimum batch size for updating PPO
UPDATE_STEP = 100       # loop the update operation for n steps
EPSILON = 0.2           # for clipping the surrogate objective
GAME = 'MountainCar-v0'

env = PongEnvironment()
S_DIM = env.observation_space
A_DIM = env.action_space


class PPONet(object):
    def __init__(self):
        self.sess = tf.Session()
        self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')

        # critic
        w_init = tf.random_normal_initializer(0., .1)
        lc1 = tf.layers.dense(self.tfs,
                              200,
                              tf.nn.relu,
                              kernel_initializer=w_init,