def __init__(self, env):
    """Set up hyperparameters, networks and replay buffer for ``env``.

    Args:
        env: environment whose ``observation_space`` and ``action_space``
            provide the state/action dimensions and the action bound.
    """
    # Hyperparameters
    self.GAMMA = 0.95
    self.BATCH_SIZE = 64
    self.BUFFER_SIZE = 20000
    self.ACTOR_LEARNING_RATE = 0.0001
    self.CRITIC_LEARNING_RATE = 0.001
    self.TAU = 0.001

    self.env = env

    # Dimensions come from the first axis of each space; the bound is the
    # upper limit of the first action component.
    observation_space = env.observation_space
    action_space = env.action_space
    self.state_dim = observation_space.shape[0]
    self.action_dim = action_space.shape[0]
    self.action_bound = action_space.high[0]

    # Actor and critic networks
    actor_args = (
        self.state_dim,
        self.action_dim,
        self.action_bound,
        self.TAU,
        self.ACTOR_LEARNING_RATE,
    )
    self.actor = Actor(*actor_args)
    critic_args = (
        self.state_dim,
        self.action_dim,
        self.TAU,
        self.CRITIC_LEARNING_RATE,
    )
    self.critic = Critic(*critic_args)

    # Replay buffer
    self.buffer = ReplayBuffer(self.BUFFER_SIZE)

    # Total reward of every finished episode
    self.save_epi_reward = []
def __init__(self, env, track, episodes=650):
    """Create the session, networks, replay memory and exploration settings.

    Args:
        env: environment the agent interacts with.
        track: name of the track being driven.
        episodes: number of episodes to run (stored as ``max_episodes``).
    """
    # Run configuration
    self.env = env
    self.track = track
    self.max_episodes = episodes
    self.max_steps = 3000
    self.save_model = True
    self.load_model = False
    self.restart_memory_leak = 25

    # Sizes of the state and action space
    self.state_size = 70
    self.action_size = 3

    # DDPG hyperparameters
    self.epsilon = 1.0
    self.epsilon_decay = 1 / 96000
    self.epsilon_min = 0.07
    self.batch_size = 64
    self.gamma = 0.99
    self.tau = 0.001
    self.lr_actor = 0.00011
    self.lr_critic = 0.0011

    # OU process used for exploration noise
    self.ou = OU()

    # TensorFlow session with on-demand GPU memory growth, shared with Keras
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=session_config)
    K.set_session(self.sess)

    # Actor, critic and replay memory
    network_dims = (self.sess, self.state_size, self.action_size, self.tau)
    self.actor = Actor(*network_dims, self.lr_actor)
    self.critic = Critic(*network_dims, self.lr_critic)
    self.memory = ExperienceReplayBuffer(50000)

    # Helper that builds the state representation
    self.dataset_builder = DatasetBuilder()
def __init__(self, task):
    """Build local/target networks, noise process and replay memory.

    Args:
        task: object providing ``state_size``, ``action_size``,
            ``action_low`` and ``action_high``.
    """
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (policy) models: one trained locally, one used as target
    actor_args = (self.state_size, self.action_size,
                  self.action_low, self.action_high)
    self.actor_local = Actor(*actor_args)
    self.actor_target = Actor(*actor_args)

    # Critic (value) models
    critic_args = (self.state_size, self.action_size)
    self.critic_local = Critic(*critic_args)
    self.critic_target = Critic(*critic_args)

    # Start each target network from its local network's weights
    critic_weights = self.critic_local.model.get_weights()
    self.critic_target.model.set_weights(critic_weights)
    actor_weights = self.actor_local.model.get_weights()
    self.actor_target.model.set_weights(actor_weights)

    # Exploration noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(
        self.action_size,
        self.exploration_mu,
        self.exploration_theta,
        self.exploration_sigma,
    )

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01  # soft-update rate for the target parameters

    # Score tracking
    self.score = 0
    self.count = 0
    self.best_score = -np.inf
def __init__(self, env):
    """Create the TF session, hyperparameters, networks and replay buffer.

    Args:
        env: environment whose ``observation_space`` and ``action_space``
            provide the state/action dimensions and the action bound.
    """
    # One TensorFlow session shared with Keras
    session = tf.Session()
    self.sess = session
    K.set_session(session)

    # Hyperparameters
    self.GAMMA = 0.95
    self.BATCH_SIZE = 128
    self.BUFFER_SIZE = 20000
    self.MIN_SAMPLES_TO_BEGIN_LEARNING = 1000
    self.ACTOR_LEARNING_RATE = 0.001
    self.CRITIC_LEARNING_RATE = 0.001
    self.TAU = 0.001

    self.env = env

    # Dimensions come from the first axis of each space; the bound is the
    # upper limit of the first action component.
    observation_space = env.observation_space
    action_space = env.action_space
    self.state_dim = observation_space.shape[0]
    self.action_dim = action_space.shape[0]
    self.action_bound = action_space.high[0]

    # Actor and critic networks
    self.actor = Actor(
        session,
        self.state_dim,
        self.action_dim,
        self.action_bound,
        self.TAU,
        self.ACTOR_LEARNING_RATE,
    )
    self.critic = Critic(
        session,
        self.state_dim,
        self.action_dim,
        self.TAU,
        self.CRITIC_LEARNING_RATE,
    )

    # Initialize variables for the later gradient calculation
    # (the original author notes it also works without this call).
    init_op = tf.global_variables_initializer()
    session.run(init_op)

    # Replay buffer
    self.buffer = ReplayBuffer(self.BUFFER_SIZE)

    # Total reward of every finished episode
    self.save_epi_reward = []
class DDPGAgent:
    """DDPG agent that learns to drive in the TORCS racing simulator.

    The agent owns the actor and critic networks, the experience replay
    memory and the epsilon-scaled exploration noise, and runs the whole
    episode loop through ``trainAgent`` / ``testAgent``.
    """

    # Track length per supported track name, used to express the raced
    # distance as a percentage of the track (presumably metres, matching
    # ``distRaced`` -- confirm against the simulator).
    TRACK_LENGTHS = {"eroad": 3260, "cgspeedway": 2057, "forza": 5784}

    def __init__(self, env, track, episodes=650):
        """Create the session, networks, replay memory and exploration settings.

        Args:
            env: environment exposing ``reset``, ``step`` and ``end``.
            track: track name; must be a key of ``TRACK_LENGTHS`` to train.
            episodes: number of episodes run by ``trainAgent``.
        """
        self.env = env
        self.track = track
        self.max_episodes = episodes
        self.max_steps = 3000
        self.save_model = True
        self.load_model = False
        # The simulator is relaunched every this many episodes (memory leak).
        self.restart_memory_leak = 25

        ### size of action- and state space
        self.state_size = 70
        self.action_size = 3

        ### DDPG Hyperparameters
        self.epsilon = 1.0
        self.epsilon_decay = 1 / 96000
        self.epsilon_min = 0.07
        self.batch_size = 64
        self.gamma = 0.99
        self.tau = 0.001
        self.lr_actor = 0.00011
        self.lr_critic = 0.0011

        ### set OU Process
        self.ou = OU()

        ### tf gpu and session set
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        K.set_session(self.sess)

        ### actor, critic and replay memory
        self.actor = Actor(self.sess, self.state_size, self.action_size,
                           self.tau, self.lr_actor)
        self.critic = Critic(self.sess, self.state_size, self.action_size,
                             self.tau, self.lr_critic)
        self.memory = ExperienceReplayBuffer(50000)

        ### helper class to build state representation
        self.dataset_builder = DatasetBuilder()

    def saveModel(self):
        """Save the actor and critic models below ``./ddpg_weights``."""
        import os

        # Create the target directory first so the very first save of a
        # fresh checkout does not fail on a missing folder.
        os.makedirs("./ddpg_weights", exist_ok=True)
        self.actor.model.save("./ddpg_weights/ddpg_actor_model.h5")
        self.critic.model.save("./ddpg_weights/ddpg_critic_model.h5")

    def lowerExploration(self):
        """Decay epsilon by one step without dropping below ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            # Clamp: the last decay step must not undershoot the floor.
            self.epsilon = max(self.epsilon_min,
                               self.epsilon - self.epsilon_decay)

    def _track_length(self):
        """Return the length of ``self.track``.

        Raises:
            ValueError: if the track has no entry in ``TRACK_LENGTHS``.
        """
        try:
            return self.TRACK_LENGTHS[self.track]
        except KeyError:
            raise ValueError(
                "Unknown track {!r}; expected one of {}".format(
                    self.track, sorted(self.TRACK_LENGTHS)))

    @staticmethod
    def _completion_percentage(dist_raced, track_length):
        """Return the raced share of the track in percent, capped at 100."""
        percentage = round((dist_raced / track_length) * 100, 0)
        # A well trained agent may complete several laps.
        return min(percentage, 100)

    @staticmethod
    def _flatten_observation(observation):
        """Stack the sensor fields of an observation into one flat vector."""
        return np.hstack(
            (observation.angle, observation.track, observation.focus,
             observation.opponents, observation.trackPos, observation.speedX,
             observation.speedY, observation.speedZ,
             observation.wheelSpinVel / 100.0, observation.rpm))

    def _explore(self, action):
        """Return ``action`` with epsilon-scaled OU noise added.

        OU parameters are derived from
        https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html and the
        author's own experiments. The three trailing ``calc_noise``
        arguments are presumably (mu, theta, sigma) -- confirm against
        ``OU.calc_noise``.

        Args:
            action: actor output; only ``action[0][0:3]`` is used.

        Returns:
            Array of shape ``(1, action_size)`` holding the noisy action.
        """
        noise = np.zeros([1, self.action_size])
        noise[0][0] = self.epsilon * self.ou.calc_noise(
            action[0][0], 0.0, 0.55, 0.15)
        noise[0][1] = self.epsilon * self.ou.calc_noise(
            action[0][1], 0.55, 1.00, 0.10)
        noise[0][2] = self.epsilon * self.ou.calc_noise(
            action[0][2], -0.1, 1.00, 0.05)

        # "Stochastic brake", adapted from the blog post above: braking is
        # not adequately represented in the reward function, so the brake
        # noise (index 2) is re-drawn with different parameters with a
        # chance of min(0.18, epsilon).
        if random.random() <= min(0.18, self.epsilon):
            noise[0][2] = self.epsilon * self.ou.calc_noise(
                action[0][2], 0.25, 1.00, 0.10)

        action_with_noise = np.zeros([1, self.action_size])
        action_with_noise[0, :3] = action[0][:3] + noise[0, :3]
        return action_with_noise

    def _run_episode(self, episode, track_length):
        """Run one episode, training after every environment step.

        Args:
            episode: zero-based episode index.
            track_length: length of the current track.

        Returns:
            Dict of episode statistics if the environment reported ``done``
            within ``max_steps`` steps, otherwise ``None``.
        """
        # Relaunch the simulator periodically because its leaking memory
        # would otherwise slow the thread down.
        if (episode % self.restart_memory_leak) == 0:
            state = self.env.reset(relaunch=True)
        else:
            state = self.env.reset()
        state, _ = self.dataset_builder.buildStateDataSet(s=state)

        total_reward = 0
        speed_sum = 0
        race_pos_sum = 0
        damage = 0
        damage_hit_counter = 0

        for j in range(self.max_steps):
            action = self.actor.model.predict(
                state.reshape(1, state.shape[0]))
            action_with_noise = self._explore(action)
            observation, reward, done, _ = self.env.step(
                action_with_noise[0])

            pre_damage = damage
            damage = observation.damage
            next_state = self._flatten_observation(observation)

            ### save to experience replay memory for batch selection
            self.memory.memorize(state, action_with_noise[0], reward,
                                 next_state, done)
            self.lowerExploration()
            self.trainModel()

            total_reward += reward
            speed_sum += observation.speedX
            race_pos_sum += observation.racePos
            state = next_state

            # A rise in accumulated damage counts as one hit.
            if damage - pre_damage > 0:
                damage_hit_counter += 1

            print("Episode: {} Step: {} Action: {} Reward: {} Epsilon: {}"
                  .format(episode, j, action_with_noise, reward,
                          self.epsilon))

            if done:
                # j is zero-based, so j + 1 steps were taken; dividing by
                # j would fail on a one-step episode and skew the mean.
                steps = j + 1
                return {
                    "total_reward": total_reward,
                    "dist_raced": observation.distRaced,
                    "dist_percentage": self._completion_percentage(
                        observation.distRaced, track_length),
                    "avg_speed": speed_sum / steps,
                    "car_hits": damage_hit_counter,
                    "race_pos": int(race_pos_sum / steps),
                }
        return None

    @staticmethod
    def _show_plot(message, series, ylabel, axis=None):
        """Print ``message`` and show ``series`` plotted per episode."""
        print(message)
        plt.plot(series)
        plt.xlabel("Episode")
        plt.ylabel(ylabel)
        if axis is not None:
            plt.axis(axis)
        plt.show()

    def _plot_results(self, total_rewards, dist_raced, dist_percentage,
                      avg_speed, car_hits, race_pos):
        """Plot the per-episode statistics collected by ``trainAgent``."""
        if not total_rewards:
            # Nothing finished: avoid empty plots and a division by zero.
            print("No finished episodes to plot!")
            return

        self._show_plot("Plotting rewards!", total_rewards, "Ertrag")
        self._show_plot("Plotting distances!", dist_raced,
                        "Distanz von Startlinie [m]")
        # The x-axis follows the configured number of episodes.
        self._show_plot("Plotting completeness!", dist_percentage,
                        "Vollstaendigkeit Strecke [%]",
                        axis=[0, self.max_episodes, 0, 100])
        self._show_plot("Plotting avg speed!", avg_speed,
                        "Durschn. Geschwindigkeit [km/h]",
                        axis=[0, self.max_episodes, 0, 1])
        self._show_plot("Plotting car hits!", car_hits,
                        "Unfaelle des Fahrzeuges")

        print("Mean car hits:")
        print(sum(car_hits) / len(car_hits))
        print("Std dev car hits:")
        print(np.std(car_hits))

        self._show_plot("Plotting car hits per distance!",
                        np.divide(car_hits, dist_raced),
                        "Unfaelle des Fahrzeuges pro Distanzeinheit")
        self._show_plot("Plotting avg race pos!", race_pos,
                        "Durschn. Position")

    def trainAgent(self):
        """Run ``max_episodes`` episodes, then plot the collected statistics.

        Statistics are only recorded for episodes in which the environment
        reported ``done``. The environment is always shut down via
        ``env.end()``, even if training raises.

        Raises:
            ValueError: if ``self.track`` is not in ``TRACK_LENGTHS``.
        """
        # Fail fast instead of after the first complete episode.
        track_length = self._track_length()

        all_total_rewards = []
        all_dist_raced = []
        all_dist_percentage = []
        all_avg_speed = []
        all_car_hits = []
        all_race_pos = []

        try:
            for e in range(self.max_episodes):
                ### save weights every 10th episode
                if self.save_model and (e % 10) == 0:
                    self.saveModel()

                stats = self._run_episode(e, track_length)
                if stats is None:
                    continue
                all_total_rewards.append(stats["total_reward"])
                all_dist_raced.append(stats["dist_raced"])
                all_dist_percentage.append(stats["dist_percentage"])
                all_avg_speed.append(stats["avg_speed"])
                all_car_hits.append(stats["car_hits"])
                all_race_pos.append(stats["race_pos"])
        finally:
            self.env.end()

        self._plot_results(all_total_rewards, all_dist_raced,
                           all_dist_percentage, all_avg_speed, all_car_hits,
                           all_race_pos)

    def trainModel(self):
        """Perform one DDPG update on a random mini batch from the memory."""
        ### get random mini batch from experience replay memory
        mini_batch = self.memory.sampleRandomBatch(self.batch_size)

        ### build arrays for models from mini batch
        states = np.asarray([b[0] for b in mini_batch])
        actions = np.asarray([b[1] for b in mini_batch])
        rewards = np.asarray([b[2] for b in mini_batch])
        new_states = np.asarray([b[3] for b in mini_batch])
        dones = np.asarray([b[4] for b in mini_batch])
        # The target buffer takes the shape of the action batch.
        # NOTE(review): this implies the critic emits one value per action
        # component -- confirm against Critic.
        target = actions.copy()

        ### q values of the target critic for the target actor's actions
        target_q_values = self.critic.target_model.predict(
            [new_states, self.actor.target_model.predict(new_states)])

        ### Bellman targets: terminal transitions do not bootstrap
        for k, (reward, done) in enumerate(zip(rewards, dones)):
            if done:
                target[k] = reward
            else:
                target[k] = reward + self.gamma * target_q_values[k]

        ### train critic, then the actor along the critic's action gradients
        self.critic.model.train_on_batch([states, actions], target)
        policy_actions = self.actor.model.predict(states)
        gradients = self.critic.gradients(states, policy_actions)
        self.actor.train(states, gradients)

        ### soft update
        self.actor.target_train()
        self.critic.target_train()

    def testAgent(self):
        """Load the saved models and run the episode loop with low exploration.

        Exits the process with status 1 if the models cannot be loaded.
        """
        ### set epsilon (exploration) low
        self.epsilon = self.epsilon_min
        ### Do not save weights when testing
        ### CHANGE if you want to continuously train agent
        self.save_model = False

        try:
            self.actor.model = load_model("./ddpg_weights/ddpg_actor_model.h5")
            self.critic.model = load_model(
                "./ddpg_weights/ddpg_critic_model.h5")
        except Exception as err:
            # Exception (not a bare except) lets Ctrl-C through, and the
            # real cause is reported before the non-zero exit.
            print("Model could not be loaded! Check path or train first")
            print(err)
            sys.exit(1)
        print("Model loaded!")

        self.trainAgent()
class DDPGagent(object):
    """DDPG agent with actor/critic networks, a replay buffer and OU noise."""

    def __init__(self, env):
        """Create the TF session, hyperparameters, networks and replay buffer.

        Args:
            env: environment whose ``observation_space`` and
                ``action_space`` provide the state/action dimensions and
                the action bound.
        """
        self.sess = tf.Session()
        K.set_session(self.sess)

        ## hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 64
        self.BUFFER_SIZE = 20000
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001
        self.TAU = 0.001

        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound (upper limit of the first action component)
        self.action_bound = env.action_space.high[0]

        ## create actor and critic networks
        self.actor = Actor(self.sess, self.state_dim, self.action_dim,
                           self.action_bound, self.TAU,
                           self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.sess, self.state_dim, self.action_dim,
                             self.TAU, self.CRITIC_LEARNING_RATE)

        ## initialize for later gradient calculation
        ## (the original author notes it also works without this call)
        self.sess.run(tf.global_variables_initializer())

        ## initialize replay buffer
        self.buffer = ReplayBuffer(self.BUFFER_SIZE)

        # save the results
        self.save_epi_reward = []

    ## Ornstein Uhlenbeck Noise
    def ou_noise(self, x, rho=0.15, mu=0, dt=1e-1, sigma=0.2, dim=1):
        """Advance the Ornstein-Uhlenbeck noise ``x`` by one step.

        Args:
            x: previous noise value.
            rho: rate at which the noise is pulled towards ``mu``.
            mu: value the noise reverts to.
            dt: step size.
            sigma: scale of the Gaussian term.
            dim: number of Gaussian samples drawn.

        Returns:
            The next noise value.
        """
        drift = rho * (mu - x) * dt
        diffusion = sigma * np.sqrt(dt) * np.random.normal(size=dim)
        return x + drift + diffusion

    ## computing TD target: y_k = r_k + gamma*Q(s_k+1, a_k+1)
    def td_target(self, rewards, q_values, dones):
        """Compute the TD targets for a batch of transitions.

        Terminal transitions use the reward alone; all others use
        ``reward + GAMMA * q_value``.

        Args:
            rewards: per-transition rewards.
            q_values: target Q-values of the next states, one row per
                transition.
            dones: per-transition terminal flags.

        Returns:
            A new array shaped like ``q_values``; the input is not modified.
        """
        q_values = np.asarray(q_values)
        # Work on a copy: np.asarray aliases ndarray inputs, so writing
        # into it would silently overwrite the caller's Q-values.
        y_k = q_values.copy()
        for i, (reward, q_value, done) in enumerate(
                zip(rewards, q_values, dones)):
            if done:
                y_k[i] = reward
            else:
                y_k[i] = reward + self.GAMMA * q_value
        return y_k

    def _update_networks(self):
        """Run one critic/actor update on a batch from the replay buffer."""
        # sample transitions from replay buffer
        states, actions, rewards, next_states, dones = \
            self.buffer.sample_batch(self.BATCH_SIZE)

        # predict target Q-values and compute TD targets
        target_qs = self.critic.target_predict(
            [next_states, self.actor.target_predict(next_states)])
        y_i = self.td_target(rewards, target_qs, dones)

        # train critic using sampled batch
        self.critic.train_on_batch(states, actions, y_i)

        # Q gradient wrt current policy.
        # Caution (from the original author): use self.actor.model.predict,
        # NOT self.actor.predict -- the former returns shape (batch, 1),
        # the latter shape (1,), i.e. the gym action type.
        s_actions = self.actor.model.predict(states)
        s_grads = self.critic.dq_da(states, s_actions)
        dq_das = np.array(s_grads).reshape((-1, self.action_dim))

        # train actor
        self.actor.train(states, dq_das)

        # update both target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

    ## train the agent
    def train(self, max_episode_num):
        """Train for ``max_episode_num`` episodes.

        Weights are saved after every episode; the episode rewards are
        written to a text file and printed after the last episode.

        Args:
            max_episode_num: number of episodes (converted with ``int``).
        """
        # initial transfer model weights to target model network
        self.actor.update_target_network()
        self.critic.update_target_network()

        for ep in range(int(max_episode_num)):
            # reset OU noise
            pre_noise = np.zeros(self.action_dim)
            # reset episode
            step_count, episode_reward, done = 0, 0, False
            # reset the environment and observe the first state
            state = self.env.reset()

            while not done:
                # visualize the environment
                #self.env.render()
                # pick an action: shape = (1,)
                action = self.actor.predict(state)
                noise = self.ou_noise(pre_noise, dim=self.action_dim)
                # clip continuous action to be within action_bound
                action = np.clip(action + noise, -self.action_bound,
                                 self.action_bound)
                # observe reward, new_state
                next_state, reward, done, _ = self.env.step(action)

                # Rescale the reward for training; the constants are
                # presumably tuned for Pendulum (see file names below).
                train_reward = (reward + 8) / 8
                self.buffer.add_buffer(state, action, train_reward,
                                       next_state, done)

                # start training after buffer has some amounts
                # NOTE(review): confirm ``buffer_size`` is the number of
                # stored transitions and not the buffer capacity.
                if self.buffer.buffer_size > 1000:
                    self._update_networks()

                # update current state
                pre_noise = noise
                state = next_state
                episode_reward += reward
                step_count += 1

            ## display rewards every episode
            print('Episode: ', ep + 1, 'Time: ', step_count, 'Reward: ',
                  episode_reward)
            self.save_epi_reward.append(episode_reward)

            ## save weights every episode
            self.actor.save_weights("./save_weights/pendulum_actor.h5")
            self.critic.save_weights("./save_weights/pendulum_critic.h5")

        ## save the episode rewards to file once training is done
        np.savetxt('./save_weights/pendulum_epi_reward.txt',
                   self.save_epi_reward)
        print(self.save_epi_reward)

    def plot_result(self):
        """Plot the total reward of every recorded episode."""
        plt.plot(self.save_epi_reward)
        plt.show()
class DDPG_Agent():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        """Build local/target networks, noise process and replay memory.

        Args:
            task: object providing ``state_size``, ``action_size``,
                ``action_low``, ``action_high`` and ``reset()``.
        """
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score
        self.score = 0
        self.count = 0
        self.best_score = -np.inf

    def reset_episode(self):
        """Reset noise, task and per-episode counters; return the first state."""
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.total_reward = 0
        self.count = 0
        return state

    def step(self, action, reward, next_state, done):
        """Store one transition and learn once enough samples are stored.

        ``reset_episode`` must have been called first, because the stored
        transition starts from ``self.last_state``.
        """
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and update the episode totals
        self.last_state = next_state
        self.total_reward += reward
        self.count += 1

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.

        Args:
            experiences: iterable of records with ``state``, ``action``,
                ``reward``, ``next_state`` and ``done``; ``None`` entries
                are ignored.
        """
        # Drop empty slots once instead of in every conversion below.
        batch = [e for e in experiences if e is not None]

        # Convert experience tuples to separate arrays for each element
        states = np.vstack([e.state for e in batch])
        actions = np.array([e.action for e in batch]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in batch]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in batch]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in batch])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model
        # (local); ``1 - dones`` removes the bootstrap term on terminal steps.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local). The trailing 0 / 1 is presumably the
        # Keras learning-phase flag (test / train) -- confirm against the
        # Actor and Critic definitions.
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        # Track the mean reward per step of the current episode.
        if self.count > 0:
            self.score = self.total_reward / float(self.count)
            if self.score > self.best_score:
                self.best_score = self.score
        else:
            self.score = 0

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Sets every target weight tensor to
        ``tau * local + (1 - tau) * target``.

        Args:
            local_model: model providing ``get_weights()``.
            target_model: model providing ``get_weights()`` and
                ``set_weights()``; updated in place.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        # Blend tensor by tensor. Wrapping the weight list in np.array
        # builds a ragged array, which recent NumPy versions reject
        # because layers have differently shaped weights.
        new_weights = [
            self.tau * local + (1 - self.tau) * target
            for local, target in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)