def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    get_session(config=config)

    env = CarEnvironment()
    return env
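# Hypothetical usage sketch (not part of the original source). build_env only reads the
# num_env, alg and seed attributes of its argument, so any argparse-style namespace works;
# the attribute values below are illustrative placeholders, and CarEnvironment / get_session
# are assumed to be importable exactly as used above.
import argparse

example_args = argparse.Namespace(num_env=None, alg='deepq', seed=0)
example_env = build_env(example_args)   # configures the TF session and returns a CarEnvironment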
class PGN():
    def __init__(self):
        # Set to False to let the agent play
        self.training = False
        self.state_size = 5
        self.action_size = 3
        self.max_episodes = 500
        self.learning_rate = 0.01
        self.gamma = 0.95
        self.init_networks()
        self.init_tensorboard()
        self.env = CarEnvironment()
        self.saver = tf.train.Saver()
        if self.training:
            self.train()
        else:
            self.play()

    """ Initialize all the networks """
    def init_networks(self):
        with tf.name_scope("inputs"):
            self.input_ = tf.placeholder(tf.float32, [None, self.state_size], name="input_")
            self.actions = tf.placeholder(tf.int32, [None, self.action_size], name="actions")
            self.discounted_episode_rewards_ = tf.placeholder(
                tf.float32, [None, ], name="discounted_episode_rewards")
            self.mean_reward_ = tf.placeholder(tf.float32, name="mean_reward")

        with tf.name_scope("fc1"):
            self.fc1 = tf.contrib.layers.fully_connected(
                inputs=self.input_,
                num_outputs=10,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.contrib.layers.xavier_initializer())

        with tf.name_scope("fc2"):
            self.fc2 = tf.contrib.layers.fully_connected(
                inputs=self.fc1,
                num_outputs=self.action_size,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.contrib.layers.xavier_initializer())

        with tf.name_scope("fc3"):
            self.fc3 = tf.contrib.layers.fully_connected(
                inputs=self.fc2,
                num_outputs=self.action_size,
                activation_fn=None,
                weights_initializer=tf.contrib.layers.xavier_initializer())

        with tf.name_scope("softmax"):
            self.action_distribution = tf.nn.softmax(self.fc3)

        with tf.name_scope("loss"):
            self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=self.fc3, labels=self.actions)
            self.loss = tf.reduce_mean(self.neg_log_prob * self.discounted_episode_rewards_)

        with tf.name_scope("train"):
            self.train_opt = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    """ Set up TensorBoard """
    def init_tensorboard(self):
        # Set up the TensorBoard writer
        self.writer = tf.summary.FileWriter("tensorboard/pg/1")
        # Losses
        tf.summary.scalar("Loss", self.loss)
        # Mean reward
        tf.summary.scalar("Reward_mean", self.mean_reward_)
        self.write_op = tf.summary.merge_all()

    def discount_and_normalize_rewards(self, episode_rewards):
        discounted_episode_rewards = np.zeros_like(episode_rewards)
        cumulative = 0.0
        for i in reversed(range(len(episode_rewards))):
            cumulative = cumulative * self.gamma + episode_rewards[i]
            discounted_episode_rewards[i] = cumulative

        mean = np.mean(discounted_episode_rewards)
        std = np.std(discounted_episode_rewards)
        discounted_episode_rewards = (discounted_episode_rewards - mean) / std
        return discounted_episode_rewards

    def train(self):
        allRewards = []
        total_rewards = 0
        total_dist = 0
        all_dist = []
        maximumRewardRecorded = 0
        episode = 0
        episode_states, episode_actions, episode_rewards = [], [], []

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            for episode in range(self.max_episodes):
                episode_rewards_sum = 0

                # Launch the game
                state = self.env.reset()

                while True:
                    # Choose action a. The policy is stochastic: the network outputs
                    # action probabilities, so we sample from them rather than taking
                    # a deterministic action.
                    action_probability_distribution = sess.run(
                        self.action_distribution,
                        feed_dict={self.input_: state.reshape([1, 5])})
                    action = np.random.choice(
                        range(action_probability_distribution.shape[1]),
                        p=action_probability_distribution.ravel())  # sample w.r.t. the action probabilities

                    # Perform the action
                    new_state, reward, dist, done = self.env.step(action)
                    total_dist += dist

                    # Store s, a, r
                    episode_states.append(state)

                    # The environment gives us only the action index, but the loss expects
                    # a one-hot action vector (e.g. [0., 1., 0.] if we steer right), so
                    # convert the index before storing it.
                    action_ = np.zeros(self.action_size)
                    action_[action] = 1
                    episode_actions.append(action_)
                    episode_rewards.append(reward)

                    if done:
                        # Calculate the episode's total reward
                        episode_rewards_sum = np.sum(episode_rewards)
                        allRewards.append(episode_rewards_sum)
                        total_rewards = np.sum(allRewards)

                        # Mean reward
                        mean_reward = np.divide(total_rewards, episode + 1)
                        maximumRewardRecorded = np.amax(allRewards)

                        print("==========================================")
                        print("Episode: ", episode)
                        print("Reward: ", episode_rewards_sum)
                        print("Mean Reward", mean_reward)
                        print("Max reward so far: ", maximumRewardRecorded)

                        # Calculate discounted, normalized rewards
                        discounted_episode_rewards = self.discount_and_normalize_rewards(episode_rewards)

                        # Feedforward, gradient and backpropagation
                        loss_, _ = sess.run(
                            [self.loss, self.train_opt],
                            feed_dict={
                                self.input_: np.vstack(np.array(episode_states)),
                                self.actions: np.vstack(np.array(episode_actions)),
                                self.discounted_episode_rewards_: discounted_episode_rewards
                            })

                        # Write TensorBoard summaries
                        summary = sess.run(
                            self.write_op,
                            feed_dict={
                                self.input_: np.vstack(np.array(episode_states)),
                                self.actions: np.vstack(np.array(episode_actions)),
                                self.discounted_episode_rewards_: discounted_episode_rewards,
                                self.mean_reward_: mean_reward
                            })
                        self.writer.add_summary(summary, episode)
                        self.writer.flush()

                        # Reset the transition stores
                        episode_states, episode_actions, episode_rewards = [], [], []
                        all_dist.append([episode, total_dist])
                        total_dist = 0
                        break

                    state = new_state

            # Save the model
            self.saver.save(sess, "./models/model.ckpt")
            print("Model saved")
            a = np.asarray(all_dist)
            np.savetxt("test.csv", a, delimiter=',')

    def play(self):
        with tf.Session() as sess:
            self.env.reset()
            rewards = []

            # Load the model
            self.saver.restore(sess, "./models/model.ckpt")

            for episode in range(10):
                state = self.env.reset()
                step = 0
                done = False
                total_rewards = 0
                print("****************************************************")
                print("EPISODE ", episode)

                while True:
                    # The policy is stochastic: sample an action from the network's
                    # output probabilities.
                    action_probability_distribution = sess.run(
                        self.action_distribution,
                        feed_dict={self.input_: state.reshape([1, 5])})
                    # print(action_probability_distribution)
                    action = np.random.choice(
                        range(action_probability_distribution.shape[1]),
                        p=action_probability_distribution.ravel())  # sample w.r.t. the action probabilities

                    new_state, reward, dist, done = self.env.step(action)
                    total_rewards += reward

                    if done:
                        rewards.append(total_rewards)
                        print("Score", total_rewards)
                        break
                    state = new_state

            print("Score over time: " + str(sum(rewards) / 10))
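# Hypothetical usage sketch (not part of the original source). Instantiating PGN builds the
# graph and immediately runs play(), because self.training is hard-coded to False in __init__;
# flipping that flag to True makes the same constructor call train() instead. CarEnvironment
# and the ./models/model.ckpt checkpoint are assumed to exist as referenced above.
if __name__ == "__main__":
    agent = PGN()   # restores ./models/model.ckpt and plays 10 evaluation episodes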
    def process_state_batch(self, batch):
        processed_batch = batch.astype('float32') / 255.
        return processed_batch

    def process_reward(self, reward):
        return np.clip(reward, -1., 1.)


parser = argparse.ArgumentParser()
parser.add_argument('--mode', choices=['train', 'test'], default='train')
parser.add_argument('--env-name', type=str, default='BreakoutDeterministic-v4')
parser.add_argument('--weights', type=str, default=None)
args = parser.parse_args()

# Get the environment and extract the number of actions.
env = CarEnvironment()
np.random.seed(123)
nb_actions = env.action_space.n

# Next, we build our model. We use the same model that was described by Mnih et al. (2015).
input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
model = Sequential()
if K.image_dim_ordering() == 'tf':
    # (width, height, channels)
    model.add(Permute((2, 3, 1), input_shape=input_shape))
elif K.image_dim_ordering() == 'th':
    # (channels, width, height)
    model.add(Permute((1, 2, 3), input_shape=input_shape))
else:
    raise RuntimeError('Unknown image_dim_ordering.')
model.add(Convolution2D(32, (8, 8), strides=(4, 4)))
class CarAgent:
    def __init__(self, batch_size, memory_capacity, num_episodes,
                 learning_rate_drop_frame_limit, target_update_frequency,
                 seeds=[104, 106, 108], discount=0.99, delta=1,
                 model_name=None, visualize=False):
        self.env = CarEnvironment(seed=seeds)
        self.architecture = NeuralNet()
        self.explore_rate = Basic_Explore_Rate()
        self.learning_rate = Basic_Learning_Rate()
        self.model_path = os.path.dirname(os.path.realpath(__file__)) + '/models/' + model_name
        self.log_path = self.model_path + '/log'
        self.visualize = visualize
        self.damping_mult = 1
        self.initialize_tf_variables()
        self.target_update_frequency = target_update_frequency
        self.discount = discount
        self.replay_memory = Replay_Memory(memory_capacity, batch_size)
        self.training_metadata = Training_Metadata(
            frame=0, frame_limit=learning_rate_drop_frame_limit,
            episode=0, num_episodes=num_episodes)
        self.delta = delta
        document_parameters(self)

    # Sets up the TensorFlow graph - called in setup
    def initialize_tf_variables(self):
        # Setting up game-specific variables
        self.state_size = self.env.state_space_size
        self.action_size = self.env.action_space_size
        self.state_shape = self.env.state_shape
        self.q_grid = None

        # TF placeholders - feed data into the neural net from outside
        self.state_tf = tf.placeholder(shape=self.state_shape, dtype=tf.float32, name='state_tf')
        self.action_tf = tf.placeholder(shape=[None, self.action_size], dtype=tf.float32, name='action_tf')
        self.y_tf = tf.placeholder(dtype=tf.float32, name='y_tf')
        self.alpha = tf.placeholder(dtype=tf.float32, name='alpha')
        self.test_score = tf.placeholder(dtype=tf.float32, name='test_score')
        self.avg_q = tf.placeholder(dtype=tf.float32, name='avg_q')

        # Keep track of episodes and frames
        # Variables are used to store information about the neural net
        self.episode = tf.Variable(initial_value=0, trainable=False, name='episode')
        self.frames = tf.Variable(initial_value=0, trainable=False, name='frames')
        self.increment_frames_op = tf.assign(self.frames, self.frames + 1, name='increment_frames_op')
        self.increment_episode_op = tf.assign(self.episode, self.episode + 1, name='increment_episode_op')

        # Operations
        # NAME                   DESCRIPTION                                         FEED DEPENDENCIES
        # Q_value                Value of Q at given state(s)                        state_tf
        # Q_argmax               Action(s) maximizing Q at given state(s)            state_tf
        # Q_amax                 Maximal action value(s) at given state(s)           state_tf
        # Q_value_at_action      Q value at specific (action, state) pair(s)         state_tf, action_tf
        # onehot_greedy_action   One-hot encodes greedy action(s) at given state(s)  state_tf
        self.Q_value = self.architecture.evaluate(self.state_tf, self.action_size)
        self.Q_argmax = tf.argmax(self.Q_value, axis=1, name='Q_argmax')
        self.Q_amax = tf.reduce_max(self.Q_value, axis=1, name='Q_max')
        self.Q_value_at_action = tf.reduce_sum(tf.multiply(self.Q_value, self.action_tf),
                                               axis=1, name='Q_value_at_action')
        self.onehot_greedy_action = tf.one_hot(self.Q_argmax, depth=self.action_size)

        # Training related
        # NAME       FEED DEPENDENCIES
        # loss       y_tf, state_tf, action_tf
        # train_op   y_tf, state_tf, action_tf, alpha
        self.loss = tf.losses.huber_loss(self.y_tf, self.Q_value_at_action)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.alpha)
        self.train_op = self.optimizer.minimize(self.loss, name='train_minimize')

        # TensorFlow session setup
        self.saver = tf.train.Saver(max_to_keep=None)
        config = tf.ConfigProto()
        config.allow_soft_placement = True
        config.gpu_options.allow_growth = False
        config.log_device_placement = False
        self.sess = tf.Session(config=config)
        self.trainable_variables = tf.trainable_variables()
        print(self.trainable_variables)
        # TensorBoard setup
        self.writer = tf.summary.FileWriter(self.log_path)
        self.writer.add_graph(self.sess.graph)
        test_score = tf.summary.scalar("Training score", self.test_score, collections=None, family=None)
        avg_q = tf.summary.scalar("Average Q-value", self.avg_q, collections=None, family=None)
        self.training_summary = tf.summary.merge([avg_q])
        self.test_summary = tf.summary.merge([test_score])
        # subprocess.Popen(['tensorboard', '--logdir', self.log_path])

        # Initialising variables and finalising graph
        self.sess.run(tf.global_variables_initializer())
        self.fixed_target_weights = self.sess.run(self.trainable_variables)
        self.sess.graph.finalize()

    # Performs one step of batch gradient descent on the DDQN loss function.
    # alpha = learning rate
    def experience_replay(self, alpha):
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = \
            self.replay_memory.get_mini_batch(self.training_metadata)

        # Get the greedy actions of the online Q-network on the next states (as one-hot vectors)
        greedy_actions = self.sess.run(self.onehot_greedy_action,
                                       feed_dict={self.state_tf: next_state_batch})

        y_batch = [None] * self.replay_memory.batch_size

        # Evaluate those actions with the fixed target weights substituted into the graph
        fixed_feed_dict = {
            self.state_tf: next_state_batch,
            self.action_tf: greedy_actions
        }
        fixed_feed_dict.update(zip(self.trainable_variables, self.fixed_target_weights))
        Q_batch = self.sess.run(self.Q_value_at_action, feed_dict=fixed_feed_dict)
        y_batch = reward_batch + self.discount * np.multiply(np.invert(done_batch), Q_batch)

        feed = {
            self.state_tf: state_batch,
            self.action_tf: action_batch,
            self.y_tf: y_batch,
            self.alpha: alpha
        }
        self.sess.run(self.train_op, feed_dict=feed)

    # Updates the weights of the target network
    def update_fixed_target_weights(self):
        self.fixed_target_weights = self.sess.run(self.trainable_variables)

    # Trains the model
    def train(self, imitation=False):
        while self.sess.run(self.episode) < self.training_metadata.num_episodes:
            # Grab the episode number from the graph
            episode = self.sess.run(self.episode)
            self.training_metadata.increment_episode()
            # Increment the episode counter stored in the graph
            self.sess.run(self.increment_episode_op)

            # Set up the car environment
            state_lazy = self.env.reset()
            self.env.render()
            done = False
            epsilon = self.explore_rate.get(self.training_metadata)
            alpha = self.learning_rate.get(self.training_metadata)
            print("Episode {0}/{1} \t Epsilon: {2} \t Alpha: {3}".format(
                episode, self.training_metadata.num_episodes, epsilon, alpha))
            print("Replay Memory: %d" % self.replay_memory.length())
            episode_frame = 0
            max_reward = float('-inf')

            while True:
                # Update the target weights every target_update_frequency frames
                if self.training_metadata.frame % self.target_update_frequency == 0 and (
                        self.training_metadata.frame != 0):
                    self.update_fixed_target_weights()

                # Choose and perform an action and update the replay memory
                if random.random() < epsilon:
                    if imitation:
                        action = self.get_oracle_action(self.env)
                    else:
                        action = self.env.sample_action_space()
                else:
                    action = self.get_action(np.array(state_lazy), 0)

                next_state_lazy, reward, done, info = self.env.step(action)
                if self.visualize:
                    self.env.render()
                episode_frame += 1
                self.replay_memory.add(self, state_lazy, action, reward, next_state_lazy, done)

                # Train with the replay memory once it is sufficiently populated
                if self.replay_memory.length() > 10 * self.replay_memory.batch_size:
                    self.sess.run(self.increment_frames_op)
                    self.training_metadata.increment_frame()
                    self.experience_replay(alpha)

                avg_q = self.estimate_avg_q()
                state_lazy = next_state_lazy
                done = info['true_done']
                abs_reward = self.env.get_total_reward()
                max_reward = max(max_reward, abs_reward)
                if max_reward - abs_reward > 5 or done:
                    print("Episode reward:", abs_reward)
                    break

            # Saving TensorBoard data and model weights
            if (episode % 30 == 0) and (episode != 0):
                score, std, rewards = self.test(num_test_episodes=5, visualize=self.visualize)
                print('{0} +- {1}'.format(score, std))
                self.writer.add_summary(
                    self.sess.run(self.test_summary, feed_dict={self.test_score: score}),
                    episode / 30)
                self.saver.save(self.sess, self.model_path + '/data.chkp',
                                global_step=self.training_metadata.episode)
                file = open(self.model_path + '/trainlog.txt', "a+")
                printstr = '%f %f %f %f %f \n' % (score, std, episode, alpha, epsilon)
                file.write(printstr)
                file.close()

            self.writer.add_summary(
                self.sess.run(self.training_summary, feed_dict={self.avg_q: avg_q}),
                episode)

    # Chooses an action w.r.t. an epsilon-greedy policy.
    # - state    Tensor representing a single state
    # - epsilon  Number in (0, 1)
    # Output: integer in the range 0...self.action_size-1 representing an action
    def get_action(self, state, epsilon):
        # Performing epsilon-greedy action selection
        if random.random() < epsilon:
            return self.env.sample_action_space()
        else:
            return self.sess.run(self.Q_argmax, feed_dict={self.state_tf: [state]})[0]

    def get_oracle_action(self, env):
        env = env.env
        a = 4
        car_x = env.car.hull.position[0]
        car_y = env.car.hull.position[1]
        car_angle = -env.car.hull.angle
        car_vel = np.linalg.norm(env.car.hull.linearVelocity)
        target_seg = 0
        for i in range(len(env.road)):
            if not env.road[i].road_visited:
                target_seg = min(i + 3, len(env.road) - 1)
                break
        target_loc = env.nav_tiles[target_seg]
        # env.highlight_loc = target_loc
        angle_to = np.arctan2(target_loc[0] - car_x, target_loc[1] - car_y) - car_angle
        angle_to = (angle_to + 2 * np.pi) % (2 * np.pi)
        if angle_to > np.pi:
            angle_to -= 2 * np.pi
        vel_err = 35 - car_vel
        if vel_err > 2:
            a = 2
        if angle_to < -0.15 * self.damping_mult:
            a = 0
        if angle_to > 0.15 * self.damping_mult:
            a = 1
        if a == 4:
            self.damping_mult /= 1.5
            self.damping_mult = max(self.damping_mult, 1)
        else:
            self.damping_mult *= 1.2
        return a

    # Tests the model
    def test(self, num_test_episodes, visualize):
        rewards = []
        for episode in range(num_test_episodes):
            done = False
            state_lazy = self.env.reset(test=True)
            # input()
            self.env.render()
            state = np.array(state_lazy)
            episode_reward = 0
            max_reward = float('-inf')
            while not done:
                if visualize:
                    self.env.render()
                action = self.get_action(state, epsilon=0)
                next_state_lazy, reward, done, info = self.env.step(action, test=True)
                state = np.array(next_state_lazy)
                episode_reward += reward
                done = info['true_done']
                if self.env.env.t > 30:
                    print("Ended due to time limit")
                    done = True
            rewards.append(episode_reward)
            print(episode_reward)
        return np.mean(rewards), np.std(rewards), rewards

    # Average Q-value over some number of fixed tracks
    def estimate_avg_q(self):
        if not self.q_grid:
            return 0
        return np.average(
            np.amax(self.sess.run(self.Q_value, feed_dict={self.state_tf: self.q_grid}), axis=1))

    # Loads a model trained in a previous session
    # - path: string giving the path to the checkpoint file to be loaded
    def load(self, path):
        self.saver.restore(self.sess, path)
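# Hypothetical usage sketch (not part of the original source). The constructor argument
# values below are illustrative placeholders only, not the settings used by the authors;
# CarEnvironment, NeuralNet, Replay_Memory and the other helpers are assumed to be
# importable exactly as referenced in the class above.
if __name__ == "__main__":
    agent = CarAgent(batch_size=32,
                     memory_capacity=100000,
                     num_episodes=1000,
                     learning_rate_drop_frame_limit=250000,
                     target_update_frequency=1000,
                     model_name='example_run',
                     visualize=False)
    agent.train(imitation=False)
    # Alternatively, restore a saved checkpoint with agent.load(path) and
    # evaluate it with agent.test(num_test_episodes=5, visualize=True).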
class QN():
    def __init__(self):
        self.action_size = 3
        self.state_size = 2000000000
        self.qtable = np.zeros((self.state_size, self.action_size))

        self.total_episodes = 10000   # Total episodes
        self.learning_rate = 0.8      # Learning rate
        self.max_steps = 10000        # Max steps per episode
        self.gamma = 0.95             # Discounting rate

        # Exploration parameters
        self.epsilon = 1.0            # Exploration rate
        self.max_epsilon = 1.0        # Exploration probability at start
        self.min_epsilon = 0.01       # Minimum exploration probability
        self.decay_rate = 0.005       # Exponential decay rate for exploration prob

        self.env = CarEnvironment()
        self.train()

    def train(self):
        rewards = []
        all_dist = []
        total_dist = 0

        # 2. For life or until learning is stopped
        for episode in range(self.total_episodes):
            # Reset the environment; the initial state is indexed as 0 in the Q-table
            state = self.env.reset()
            state = 0
            step = 0
            done = False
            total_rewards = 0

            for step in range(self.max_steps):
                # 3. Choose an action a in the current world state (s)
                # First we randomize a number
                exp_exp_tradeoff = random.uniform(0, 1)

                # If this number > epsilon --> exploitation (take the biggest Q-value for this state)
                if exp_exp_tradeoff > self.epsilon:
                    action = np.argmax(self.qtable[state, :])
                # Else make a random choice --> exploration
                else:
                    action = random.randint(0, 2)

                # Take the action (a) and observe the outcome state (s') and reward (r)
                s, reward, dist, done = self.env.step(action)

                # Encode the observation as an integer index by concatenating
                # the rounded state components into a digit string
                x = ""
                for i in np.array(s).flatten():
                    x += str(int(round(i)))
                print("State: " + x)
                new_state = int(x)

                # Update Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
                # qtable[new_state, :] : all the actions we can take from the new state
                self.qtable[state, action] = self.qtable[state, action] + self.learning_rate * (
                    reward + self.gamma * np.max(self.qtable[new_state, :]) - self.qtable[state, action])

                total_rewards += reward
                total_dist += dist

                # Our new state is state
                state = new_state

                # If done (if we're dead): finish the episode
                if done == True:
                    print(f"Ep ended: {episode}")
                    break

            # Reduce epsilon (because we need less and less exploration)
            self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(
                -self.decay_rate * episode)
            rewards.append(total_rewards)
            all_dist.append(total_dist)

        a = np.asarray(all_dist)
        np.savetxt("ql.csv", a, delimiter=',')
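# Illustration (not part of the original source) of the state encoding used in QN.train():
# each component of the observation is rounded to the nearest integer, the digits are
# concatenated into one string, and that string is reinterpreted as the Q-table row index.
import numpy as np

def encode_state(observation):
    # Mirrors the encoding loop inside QN.train()
    x = ""
    for i in np.array(observation).flatten():
        x += str(int(round(i)))
    return int(x)

print(encode_state([3.2, 0.7, 12.4]))   # -> 3112, i.e. row 3112 of the Q-table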