import logging
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import progressbar


def train():
    # Training and evaluation data arrays
    train_data = []
    evaluate_data = []

    # Set up the confusion-matrix plot and label its axes
    fig, ax = plt.subplots()
    plt.imshow(np.zeros((N_RESULTS, N_RESULTS)))
    ax.set_xticklabels(["0"] + list(TYPES.keys()))
    ax.set_yticklabels(["0"] + list(TYPES.keys()))
    plt.setp(ax.get_xticklabels(), rotation=10)

    # Run through the file and build the evaluation array of [[data], result] pairs
    try:
        with open("segmentation.data", "r") as arq:
            for line in arq:
                line_vector = line.split(",")
                evaluate_data.append([[
                    float(line_vector[number]) / MAX_DATA_VALUE
                    for number in range(1, len(line_vector))
                ], TYPES[line_vector[0]]])
    except OSError:
        logging.critical('segmentation.data not found.')
        sys.exit()

    # Run through the file and build the training array of [[data], result] pairs
    try:
        with open("segmentation.test", "r") as arq:
            for line in arq:
                line_vector = line.split(",")
                train_data.append([[
                    float(line_vector[number]) / MAX_DATA_VALUE
                    for number in range(1, len(line_vector))
                ], TYPES[line_vector[0]]])
    except OSError:
        logging.critical('segmentation.test not found.')
        sys.exit()

    # Create the neural net
    neural_net = NeuralNet(N_ENTRIES, N_INTERMIDIATE_LAYERS,
                           INTERMIDIATE_LAYER_SIZE, N_EXITS)

    # Create the confusion matrix
    confusion_matrix = np.zeros((N_RESULTS, N_RESULTS))

    # Create the progress bar
    print('Epochs:')
    bar_epochs = progressbar.ProgressBar(widgets=PROGRESS_BAR_WIDGETS,
                                         max_value=N_EPOCHS)

    # Epoch loop
    for epoch in range(N_EPOCHS):
        confusion_matrix = np.zeros((N_RESULTS, N_RESULTS))

        # Run through the training data and feed the neural net
        for item in train_data:
            train_in = item[0]
            expected = item[1]
            train_exit = neural_net.train(MOMENTUM, LEARNING_RATIO,
                                          train_in, expected)
            confusion_matrix[expected[0].tolist().index(1)] += \
                train_exit[0].tolist()
        bar_epochs.update(epoch)

        # Every 20 epochs, update the visualization and save both the
        # neural net and the confusion matrix
        if epoch % 20 == 0:
            confusion_matrix = np.zeros((N_RESULTS, N_RESULTS))
            for item in evaluate_data:
                evaluate_in = item[0]
                expected = item[1]
                evaluate_exit = neural_net.evaluate(evaluate_in)
                confusion_matrix[expected[0].tolist().index(1)] += \
                    evaluate_exit[0].tolist()
            with open('results/neural_net_model.pkl', 'wb') as neural_net_file:
                pickle.dump(neural_net, neural_net_file)
            with open('results/confusion_matrix.pkl', 'wb') as matrix_file:
                pickle.dump(confusion_matrix, matrix_file)
            plt.imshow(confusion_matrix)
            plt.pause(1)

    plt.savefig('final_confusion_matrix.png')
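
# The constants consumed by train() are defined elsewhere in the project. As a
# minimal sketch only, the block below shows one plausible set of definitions,
# assuming the UCI Image Segmentation dataset (7 classes, 19 attributes per
# sample); every value and class name here is an illustrative assumption, not
# taken from the original source.
if __name__ == '__main__':
    N_RESULTS = 7                  # confusion matrix: one row/column per class
    N_ENTRIES = 19                 # input neurons: one per attribute
    N_EXITS = 7                    # output neurons: one per class
    N_INTERMIDIATE_LAYERS = 1      # hypothetical hidden-layer count
    INTERMIDIATE_LAYER_SIZE = 12   # hypothetical hidden-layer width
    N_EPOCHS = 200                 # hypothetical training length
    MOMENTUM = 0.9                 # hypothetical optimizer hyperparameters
    LEARNING_RATIO = 0.1
    MAX_DATA_VALUE = 1000.0        # hypothetical feature-normalization constant
    # One-hot row vector per class label, so that
    # TYPES[label][0].tolist().index(1) recovers the class index as train() expects
    TYPES = {
        name: np.eye(N_RESULTS)[i:i + 1]
        for i, name in enumerate(['BRICKFACE', 'SKY', 'FOLIAGE', 'CEMENT',
                                  'WINDOW', 'PATH', 'GRASS'])
    }
    PROGRESS_BAR_WIDGETS = [progressbar.Percentage(), ' ', progressbar.Bar()]
    train()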
import os
import random

import numpy as np
import tensorflow as tf


class CarAgent:
    def __init__(self, batch_size, memory_capacity, num_episodes,
                 learning_rate_drop_frame_limit, target_update_frequency,
                 seeds=[104, 106, 108], discount=0.99, delta=1,
                 model_name=None, visualize=False):
        self.env = CarEnvironment(seed=seeds)
        self.architecture = NeuralNet()
        self.explore_rate = Basic_Explore_Rate()
        self.learning_rate = Basic_Learning_Rate()
        self.model_path = os.path.dirname(
            os.path.realpath(__file__)) + '/models/' + model_name
        self.log_path = self.model_path + '/log'
        self.visualize = visualize
        self.damping_mult = 1
        self.initialize_tf_variables()
        self.target_update_frequency = target_update_frequency
        self.discount = discount
        self.replay_memory = Replay_Memory(memory_capacity, batch_size)
        self.training_metadata = Training_Metadata(
            frame=0, frame_limit=learning_rate_drop_frame_limit,
            episode=0, num_episodes=num_episodes)
        self.delta = delta
        document_parameters(self)

    # Sets up the TensorFlow graph; called from __init__
    def initialize_tf_variables(self):
        # Game-specific variables
        self.state_size = self.env.state_space_size
        self.action_size = self.env.action_space_size
        self.state_shape = self.env.state_shape
        self.q_grid = None

        # TF placeholders: feed data into the neural net from outside
        self.state_tf = tf.placeholder(shape=self.state_shape,
                                       dtype=tf.float32, name='state_tf')
        self.action_tf = tf.placeholder(shape=[None, self.action_size],
                                        dtype=tf.float32, name='action_tf')
        self.y_tf = tf.placeholder(dtype=tf.float32, name='y_tf')
        self.alpha = tf.placeholder(dtype=tf.float32, name='alpha')
        self.test_score = tf.placeholder(dtype=tf.float32, name='test_score')
        self.avg_q = tf.placeholder(dtype=tf.float32, name='avg_q')

        # Keep track of episodes and frames; these variables persist
        # alongside the network weights in saved checkpoints
        self.episode = tf.Variable(initial_value=0, trainable=False,
                                   name='episode')
        self.frames = tf.Variable(initial_value=0, trainable=False,
                                  name='frames')
        self.increment_frames_op = tf.assign(self.frames, self.frames + 1,
                                             name='increment_frames_op')
        self.increment_episode_op = tf.assign(self.episode, self.episode + 1,
                                              name='increment_episode_op')

        # Operations
        # NAME                  DESCRIPTION                                         FEED DEPENDENCIES
        # Q_value               Value of Q at given state(s)                        state_tf
        # Q_argmax              Action(s) maximizing Q at given state(s)            state_tf
        # Q_amax                Maximal action value(s) at given state(s)           state_tf
        # Q_value_at_action     Q value at specific (action, state) pair(s)         state_tf, action_tf
        # onehot_greedy_action  One-hot encodes greedy action(s) at given state(s)  state_tf
        self.Q_value = self.architecture.evaluate(self.state_tf,
                                                  self.action_size)
        self.Q_argmax = tf.argmax(self.Q_value, axis=1, name='Q_argmax')
        self.Q_amax = tf.reduce_max(self.Q_value, axis=1, name='Q_max')
        self.Q_value_at_action = tf.reduce_sum(
            tf.multiply(self.Q_value, self.action_tf),
            axis=1, name='Q_value_at_action')
        self.onehot_greedy_action = tf.one_hot(self.Q_argmax,
                                               depth=self.action_size)

        # Training related
        # NAME      FEED DEPENDENCIES
        # loss      y_tf, state_tf, action_tf
        # train_op  y_tf, state_tf, action_tf, alpha
        self.loss = tf.losses.huber_loss(self.y_tf, self.Q_value_at_action)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.alpha)
        self.train_op = self.optimizer.minimize(self.loss,
                                                name='train_minimize')

        # TensorFlow session setup
        self.saver = tf.train.Saver(max_to_keep=None)
        config = tf.ConfigProto()
        config.allow_soft_placement = True
        config.gpu_options.allow_growth = False
        config.log_device_placement = False
        self.sess = tf.Session(config=config)
        self.trainable_variables = tf.trainable_variables()
        print(self.trainable_variables)
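        # Note: the trainable-variable list is captured so that
        # experience_replay() can temporarily substitute the frozen
        # target-network weights via feed_dict when computing the DDQN
        # target, instead of maintaining a second copy of the graph.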
        # Tensorboard setup
        self.writer = tf.summary.FileWriter(self.log_path)
        self.writer.add_graph(self.sess.graph)
        test_score = tf.summary.scalar("Training score", self.test_score,
                                       collections=None, family=None)
        avg_q = tf.summary.scalar("Average Q-value", self.avg_q,
                                  collections=None, family=None)
        self.training_summary = tf.summary.merge([avg_q])
        self.test_summary = tf.summary.merge([test_score])
        # subprocess.Popen(['tensorboard', '--logdir', self.log_path])

        # Initialising variables and finalising graph
        self.sess.run(tf.global_variables_initializer())
        self.fixed_target_weights = self.sess.run(self.trainable_variables)
        self.sess.graph.finalize()

    # Performs one step of batch gradient descent on the DDQN loss function.
    # alpha = learning rate
    def experience_replay(self, alpha):
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = \
            self.replay_memory.get_mini_batch(self.training_metadata)
        # Greedy next-state actions come from the online network (Double DQN)
        greedy_actions = self.sess.run(
            self.onehot_greedy_action,
            feed_dict={self.state_tf: next_state_batch})
        # Evaluate those actions under the frozen target weights by feeding
        # the stored weight values over the trainable variables
        fixed_feed_dict = {
            self.state_tf: next_state_batch,
            self.action_tf: greedy_actions
        }
        fixed_feed_dict.update(
            zip(self.trainable_variables, self.fixed_target_weights))
        Q_batch = self.sess.run(self.Q_value_at_action,
                                feed_dict=fixed_feed_dict)
        y_batch = reward_batch + self.discount * np.multiply(
            np.invert(done_batch), Q_batch)
        feed = {
            self.state_tf: state_batch,
            self.action_tf: action_batch,
            self.y_tf: y_batch,
            self.alpha: alpha
        }
        self.sess.run(self.train_op, feed_dict=feed)

    # Updates the weights of the target network
    def update_fixed_target_weights(self):
        self.fixed_target_weights = self.sess.run(self.trainable_variables)

    # Trains the model
    def train(self, imitation=False):
        while self.sess.run(self.episode) < self.training_metadata.num_episodes:
            # Grab the current episode number from the graph
            episode = self.sess.run(self.episode)
            self.training_metadata.increment_episode()
            # Increment the episode counter stored in the graph
            self.sess.run(self.increment_episode_op)

            # Set up the car environment
            state_lazy = self.env.reset()
            self.env.render()
            done = False
            epsilon = self.explore_rate.get(self.training_metadata)
            alpha = self.learning_rate.get(self.training_metadata)
            print("Episode {0}/{1} \t Epsilon: {2} \t Alpha: {3}".format(
                episode, self.training_metadata.num_episodes, epsilon, alpha))
            print("Replay Memory: %d" % self.replay_memory.length())
            episode_frame = 0
            max_reward = float('-inf')

            while True:
                # Update the target weights every target_update_frequency frames
                if (self.training_metadata.frame % self.target_update_frequency == 0
                        and self.training_metadata.frame != 0):
                    self.update_fixed_target_weights()

                # Choose and perform an action and update the replay memory
                if random.random() < epsilon:
                    if imitation:
                        action = self.get_oracle_action(self.env)
                    else:
                        action = self.env.sample_action_space()
                else:
                    action = self.get_action(np.array(state_lazy), 0)
                next_state_lazy, reward, done, info = self.env.step(action)
                if self.visualize:
                    self.env.render()
                episode_frame += 1
                self.replay_memory.add(self, state_lazy, action, reward,
                                       next_state_lazy, done)

                # Train from replay memory once it is sufficiently populated
                if self.replay_memory.length() > 10 * self.replay_memory.batch_size:
                    self.sess.run(self.increment_frames_op)
                    self.training_metadata.increment_frame()
                    self.experience_replay(alpha)
                avg_q = self.estimate_avg_q()
                state_lazy = next_state_lazy
                done = info['true_done']
                abs_reward = self.env.get_total_reward()
                max_reward = max(max_reward, abs_reward)
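                # Early-stopping heuristic: end the episode once the running
                # reward has fallen more than 5 below its peak (the car is no
                # longer making progress) or the environment reports done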
                if max_reward - abs_reward > 5 or done:
                    print("Episode reward:", abs_reward)
                    break

            # Save tensorboard data and model weights every 30 episodes
            if (episode % 30 == 0) and (episode != 0):
                score, std, rewards = self.test(num_test_episodes=5,
                                                visualize=self.visualize)
                print('{0} +- {1}'.format(score, std))
                self.writer.add_summary(
                    self.sess.run(self.test_summary,
                                  feed_dict={self.test_score: score}),
                    episode / 30)
                self.saver.save(self.sess, self.model_path + '/data.chkp',
                                global_step=self.training_metadata.episode)
                file = open(self.model_path + '/trainlog.txt', "a+")
                printstr = '%f %f %f %f %f \n' % (score, std, episode,
                                                  alpha, epsilon)
                file.write(printstr)
                file.close()
            self.writer.add_summary(
                self.sess.run(self.training_summary,
                              feed_dict={self.avg_q: avg_q}), episode)

    # Chooses an action wrt an epsilon-greedy policy.
    # - state    Tensor representing a single state
    # - epsilon  Number in (0, 1)
    # Output: integer in the range 0..self.action_size - 1 representing an action
    def get_action(self, state, epsilon):
        # Epsilon-greedy action selection
        if random.random() < epsilon:
            return self.env.sample_action_space()
        else:
            return self.sess.run(self.Q_argmax,
                                 feed_dict={self.state_tf: [state]})[0]

    # Hand-coded driving oracle used for imitation: steers towards the next
    # unvisited road tile and accelerates towards a target speed
    def get_oracle_action(self, env):
        env = env.env
        a = 4  # default action when no correction is needed
        car_x = env.car.hull.position[0]
        car_y = env.car.hull.position[1]
        car_angle = -env.car.hull.angle
        car_vel = np.linalg.norm(env.car.hull.linearVelocity)
        target_seg = 0
        for i in range(len(env.road)):
            if not env.road[i].road_visited:
                target_seg = min(i + 3, len(env.road) - 1)
                break
        target_loc = env.nav_tiles[target_seg]
        # env.highlight_loc = target_loc
        angle_to = np.arctan2(target_loc[0] - car_x,
                              target_loc[1] - car_y) - car_angle
        # Wrap the heading error into (-pi, pi]
        angle_to = (angle_to + 2 * np.pi) % (2 * np.pi)
        if angle_to > np.pi:
            angle_to -= 2 * np.pi
        vel_err = 35 - car_vel
        if vel_err > 2:
            a = 2
        if angle_to < -0.15 * self.damping_mult:
            a = 0
        if angle_to > 0.15 * self.damping_mult:
            a = 1
        # Adapt the steering dead zone: shrink it when no correction was
        # needed, widen it after a correction to damp oscillation
        if a == 4:
            self.damping_mult /= 1.5
            self.damping_mult = max(self.damping_mult, 1)
        else:
            self.damping_mult *= 1.2
        return a

    # Tests the model
    def test(self, num_test_episodes, visualize):
        rewards = []
        for episode in range(num_test_episodes):
            done = False
            state_lazy = self.env.reset(test=True)
            self.env.render()
            state = np.array(state_lazy)
            episode_reward = 0
            while not done:
                if visualize:
                    self.env.render()
                action = self.get_action(state, epsilon=0)
                next_state_lazy, reward, done, info = self.env.step(action,
                                                                    test=True)
                state = np.array(next_state_lazy)
                episode_reward += reward
                done = info['true_done']
                if self.env.env.t > 30:
                    print("Ended due to time limit")
                    done = True
            rewards.append(episode_reward)
            print(episode_reward)
        return np.mean(rewards), np.std(rewards), rewards

    # Average maximal Q-value over a fixed grid of states, if one is set
    def estimate_avg_q(self):
        if not self.q_grid:
            return 0
        return np.average(
            np.amax(self.sess.run(self.Q_value,
                                  feed_dict={self.state_tf: self.q_grid}),
                    axis=1))

    # Loads a model trained in a previous session
    # - path: string giving the path to the checkpoint file to be loaded
    def load(self, path):
        self.saver.restore(self.sess, path)
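
# A minimal usage sketch, not taken from the original source: every
# hyperparameter value and the model name below are illustrative assumptions.
if __name__ == '__main__':
    agent = CarAgent(
        batch_size=64,                          # assumed minibatch size
        memory_capacity=100000,                 # assumed replay-memory capacity
        num_episodes=1000,                      # assumed training length
        learning_rate_drop_frame_limit=250000,  # assumed frame limit for alpha decay
        target_update_frequency=1000,           # frames between target-weight copies
        model_name='car_dqn',                   # hypothetical model directory name
        visualize=False)
    agent.train()  # or agent.train(imitation=True) to mix in oracle actions
    score, std, rewards = agent.test(num_test_episodes=5, visualize=True)
    print('Test score: {0} +- {1}'.format(score, std))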