Example #1
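The snippet below relies on module-level imports and constants that are not shown on this page. The following sketch of that context is an assumption for illustration: the import path for NeuralNet, every constant value, and the class names (taken from the UCI image segmentation dataset this file layout suggests) are placeholders, not settings from the original project.

# Hypothetical module-level context for Example #1; all values are illustrative.
import logging
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import progressbar

from neural_net import NeuralNet  # assumed import path for the project's NeuralNet

# Each class name maps to a one-hot row vector used as the expected network output.
CLASS_NAMES = ["BRICKFACE", "SKY", "FOLIAGE", "CEMENT", "WINDOW", "PATH", "GRASS"]
TYPES = {name: np.eye(len(CLASS_NAMES))[i].reshape(1, -1)
         for i, name in enumerate(CLASS_NAMES)}

N_RESULTS = len(CLASS_NAMES)   # side length of the confusion matrix
MAX_DATA_VALUE = 1000.0        # feature normalization constant (placeholder)
N_ENTRIES = 19                 # features per sample (placeholder)
N_INTERMIDIATE_LAYERS = 1      # spelling kept to match the snippet
INTERMIDIATE_LAYER_SIZE = 10
N_EXITS = len(CLASS_NAMES)
N_EPOCHS = 1000
MOMENTUM = 0.9
LEARNING_RATIO = 0.1
PROGRESS_BAR_WIDGETS = [progressbar.Percentage(), ' ', progressbar.Bar()]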
def train():
    # sets up the train and evaluate data arrays
    train_data = []
    evaluate_data = []

    # sets the labels on the matrix
    fig, ax = plt.subplots()
    plt.imshow(np.zeros((N_RESULTS, N_RESULTS)))
    ax.set_xticklabels(["0"] + list(TYPES.keys()))
    ax.set_yticklabels(["0"] + list(TYPES.keys()))
    plt.setp(ax.get_xticklabels(), rotation=10)

    # reads segmentation.data and builds the evaluation set as [[features], expected_output] pairs
    try:
        with open("segmentation.data", "r") as arq:
            for line in arq:
                line_vector = line.split(",")
                evaluate_data.append([[
                    float(line_vector[number]) / MAX_DATA_VALUE
                    for number in range(1, len(line_vector))
                ], TYPES[line_vector[0]]])
    except FileNotFoundError:
        logging.critical('segmentation.data not found.')
        sys.exit()

    # reads segmentation.test and builds the training set as [[features], expected_output] pairs
    try:
        with open("segmentation.test", "r") as arq:
            for line in arq:
                line_vector = line.split(",")
                train_data.append([[
                    float(line_vector[number]) / MAX_DATA_VALUE
                    for number in range(1, len(line_vector))
                ], TYPES[line_vector[0]]])
    except FileNotFoundError:
        logging.critical('segmentation.test not found.')
        sys.exit()

    # creates the neural net
    neural_net = NeuralNet(N_ENTRIES, N_INTERMIDIATE_LAYERS,
                           INTERMIDIATE_LAYER_SIZE, N_EXITS)

    # creates the confusion matrix
    confusion_matrix = np.zeros((N_RESULTS, N_RESULTS))

    # creates the progress bar
    print('Epochs:')
    bar_epochs = progressbar.ProgressBar(widgets=PROGRESS_BAR_WIDGETS,
                                         max_value=N_EPOCHS)

    # starts the epochs loop
    for epoch in range(N_EPOCHS):
        confusion_matrix = np.zeros((N_RESULTS, N_RESULTS))
        # runs through the train data and feeds the neural net
        for item in train_data:
            train_in = item[0]
            expected = item[1]
            train_exit = neural_net.train(MOMENTUM, LEARNING_RATIO, train_in,
                                          expected)
            expected_index = expected[0].tolist().index(1)
            confusion_matrix[expected_index] += train_exit[0].tolist()

        bar_epochs.update(epoch)

        # every 20 epochs, evaluates the net, updates the visualization and
        # saves both the neural net and the confusion matrix
        if epoch % 20 == 0:
            confusion_matrix = np.zeros((N_RESULTS, N_RESULTS))
            for item in evaluate_data:
                evaluate_in = item[0]
                expected = item[1]
                evaluate_exit = neural_net.evaluate(evaluate_in)
                expected_index = expected[0].tolist().index(1)
                confusion_matrix[expected_index] += evaluate_exit[0].tolist()

            with open('results/neural_net_model.pkl', 'wb') as neural_net_file:
                pickle.dump(neural_net, neural_net_file)

            with open('results/confusion_matrix.pkl', 'wb') as matrix_file:
                pickle.dump(confusion_matrix, matrix_file)

            plt.imshow(confusion_matrix)
            plt.pause(1)
    # saves the final confusion matrix once training is done
    plt.savefig('final_confusion_matrix.png')
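Running the example then reduces to calling train(); a standard entry-point guard like this one (not part of the original snippet) is assumed:

if __name__ == '__main__':
    train()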
Example #2
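The class below uses the TensorFlow 1.x graph API (tf.placeholder, tf.Session) and several project-specific helpers that are not shown on this page. The module header it assumes might look like the sketch below; the import paths are guesses for illustration, not the original project's layout.

# Hypothetical module header for Example #2; import paths are assumptions.
import os
import random

import numpy as np
import tensorflow as tf  # TensorFlow 1.x style API (tf.placeholder, tf.Session, ...)

# Project-specific helpers used by CarAgent; the real module names may differ.
from car_environment import CarEnvironment
from architecture import NeuralNet
from rates import Basic_Explore_Rate, Basic_Learning_Rate
from replay_memory import Replay_Memory
from training_metadata import Training_Metadata, document_parameters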
class CarAgent:
    def __init__(self,
                 batch_size,
                 memory_capacity,
                 num_episodes,
                 learning_rate_drop_frame_limit,
                 target_update_frequency,
                 seeds=[104, 106, 108],
                 discount=0.99,
                 delta=1,
                 model_name=None,
                 visualize=False):

        self.env = CarEnvironment(seed=seeds)
        self.architecture = NeuralNet()
        self.explore_rate = Basic_Explore_Rate()
        self.learning_rate = Basic_Learning_Rate()
        self.model_path = os.path.dirname(
            os.path.realpath(__file__)) + '/models/' + model_name
        self.log_path = self.model_path + '/log'
        self.visualize = visualize
        self.damping_mult = 1

        self.initialize_tf_variables()

        self.target_update_frequency = target_update_frequency
        self.discount = discount
        self.replay_memory = Replay_Memory(memory_capacity, batch_size)
        self.training_metadata = Training_Metadata(
            frame=0,
            frame_limit=learning_rate_drop_frame_limit,
            episode=0,
            num_episodes=num_episodes)

        self.delta = delta
        document_parameters(self)

    # sets up the TensorFlow graph - called from __init__
    def initialize_tf_variables(self):
        # Setting up game specific variables
        self.state_size = self.env.state_space_size
        self.action_size = self.env.action_space_size
        self.state_shape = self.env.state_shape
        self.q_grid = None

        # TF placeholders - feed data into the graph from outside
        self.state_tf = tf.placeholder(shape=self.state_shape,
                                       dtype=tf.float32,
                                       name='state_tf')
        self.action_tf = tf.placeholder(shape=[None, self.action_size],
                                        dtype=tf.float32,
                                        name='action_tf')
        self.y_tf = tf.placeholder(dtype=tf.float32, name='y_tf')
        self.alpha = tf.placeholder(dtype=tf.float32, name='alpha')
        self.test_score = tf.placeholder(dtype=tf.float32, name='test_score')
        self.avg_q = tf.placeholder(dtype=tf.float32, name='avg_q')

        # Variables tracking the episode and frame counts; storing them in the
        # graph lets them persist across checkpoints
        self.episode = tf.Variable(initial_value=0,
                                   trainable=False,
                                   name='episode')
        self.frames = tf.Variable(initial_value=0,
                                  trainable=False,
                                  name='frames')
        self.increment_frames_op = tf.assign(self.frames,
                                             self.frames + 1,
                                             name='increment_frames_op')
        self.increment_episode_op = tf.assign(self.episode,
                                              self.episode + 1,
                                              name='increment_episode_op')

        # Operations
        # NAME                      DESCRIPTION                                         FEED DEPENDENCIES
        # Q_value                   Value of Q at given state(s)                        state_tf
        # Q_argmax                  Action(s) maximizing Q at given state(s)            state_tf
        # Q_amax                    Maximal action value(s) at given state(s)           state_tf
        # Q_value_at_action         Q value at specific (action, state) pair(s)         state_tf, action_tf
        # onehot_greedy_action      One-hot encodes greedy action(s) at given state(s)  state_tf
        self.Q_value = self.architecture.evaluate(self.state_tf,
                                                  self.action_size)
        self.Q_argmax = tf.argmax(self.Q_value, axis=1, name='Q_argmax')
        self.Q_amax = tf.reduce_max(self.Q_value, axis=1, name='Q_max')
        self.Q_value_at_action = tf.reduce_sum(tf.multiply(
            self.Q_value, self.action_tf),
                                               axis=1,
                                               name='Q_value_at_action')
        self.onehot_greedy_action = tf.one_hot(self.Q_argmax,
                                               depth=self.action_size)

        # Training related
        # NAME                          FEED DEPENDENCIES
        # loss                          y_tf, state_tf, action_tf
        # train_op                      y_tf, state_tf, action_tf, alpha
        self.loss = tf.losses.huber_loss(self.y_tf, self.Q_value_at_action)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.alpha)
        self.train_op = self.optimizer.minimize(self.loss,
                                                name='train_minimize')

        # Tensorflow session setup
        self.saver = tf.train.Saver(max_to_keep=None)
        config = tf.ConfigProto()
        config.allow_soft_placement = True
        config.gpu_options.allow_growth = False
        config.log_device_placement = False
        self.sess = tf.Session(config=config)
        self.trainable_variables = tf.trainable_variables()
        print(self.trainable_variables)

        # Tensorboard setup
        self.writer = tf.summary.FileWriter(self.log_path)
        self.writer.add_graph(self.sess.graph)
        test_score = tf.summary.scalar("Training score", self.test_score)
        avg_q = tf.summary.scalar("Average Q-value", self.avg_q)
        self.training_summary = tf.summary.merge([avg_q])
        self.test_summary = tf.summary.merge([test_score])
        # subprocess.Popen(['tensorboard', '--logdir', self.log_path])

        # Initialising variables and finalising graph
        self.sess.run(tf.global_variables_initializer())
        self.fixed_target_weights = self.sess.run(self.trainable_variables)

        self.sess.graph.finalize()

    # Performs one step of batch gradient descent on the DDQN loss function.
    # alpha = learning rate
    def experience_replay(self, alpha):

        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.replay_memory.get_mini_batch(
            self.training_metadata)

        # get argmax of q-network
        greedy_actions = self.sess.run(
            self.onehot_greedy_action,
            feed_dict={self.state_tf: next_state_batch})

        # evaluates the target network by feeding the stored (frozen) weight
        # values in place of the current variable values
        fixed_feed_dict = {
            self.state_tf: next_state_batch,
            self.action_tf: greedy_actions
        }
        fixed_feed_dict.update(
            zip(self.trainable_variables, self.fixed_target_weights))

        Q_batch = self.sess.run(self.Q_value_at_action,
                                feed_dict=fixed_feed_dict)

        # DDQN target: reward + discount * Q_target(s', greedy action), with the
        # bootstrap term masked out for terminal transitions
        y_batch = reward_batch + self.discount * np.multiply(
            np.invert(done_batch), Q_batch)

        feed = {
            self.state_tf: state_batch,
            self.action_tf: action_batch,
            self.y_tf: y_batch,
            self.alpha: alpha
        }
        self.sess.run(self.train_op, feed_dict=feed)

    # Copies the current online-network weights into the fixed target weights
    def update_fixed_target_weights(self):
        self.fixed_target_weights = self.sess.run(self.trainable_variables)

    # Trains the model
    def train(self, imitation=False):
        while self.sess.run(
                self.episode) < self.training_metadata.num_episodes:

            # reads the current episode counter from the graph
            episode = self.sess.run(self.episode)
            self.training_metadata.increment_episode()
            # increments the episode counter stored in the graph
            self.sess.run(self.increment_episode_op)

            # resets the car environment for a new episode
            state_lazy = self.env.reset()
            self.env.render()

            done = False
            epsilon = self.explore_rate.get(self.training_metadata)
            alpha = self.learning_rate.get(self.training_metadata)

            print("Episode {0}/{1} \t Epsilon: {2} \t Alpha: {3}".format(
                episode, self.training_metadata.num_episodes, epsilon, alpha))
            print("Replay Memory: %d" % self.replay_memory.length())
            episode_frame = 0

            max_reward = float('-inf')

            while True:

                # updates the target-network weights every target_update_frequency frames
                if self.training_metadata.frame % self.target_update_frequency == 0 and (
                        self.training_metadata.frame != 0):
                    self.update_fixed_target_weights()

                # Choose and perform action and update replay memory

                if random.random() < epsilon:
                    if imitation:
                        action = self.get_oracle_action(self.env)
                    else:
                        action = self.env.sample_action_space()
                else:
                    action = self.get_action(np.array(state_lazy), 0)

                next_state_lazy, reward, done, info = self.env.step(action)

                if self.visualize:
                    self.env.render()

                episode_frame += 1

                self.replay_memory.add(self, state_lazy, action, reward,
                                       next_state_lazy, done)

                # trains from replay memory once it is sufficiently populated
                if (self.replay_memory.length() >
                        10 * self.replay_memory.batch_size):
                    self.sess.run(self.increment_frames_op)
                    self.training_metadata.increment_frame()
                    self.experience_replay(alpha)

                avg_q = self.estimate_avg_q()

                state_lazy = next_state_lazy
                done = info['true_done']

                abs_reward = self.env.get_total_reward()
                max_reward = max(max_reward, abs_reward)

                if max_reward - abs_reward > 5 or done:
                    print("Episode reward:", abs_reward)
                    break

            # Saving tensorboard data and model weights
            if (episode % 30 == 0) and (episode != 0):
                score, std, rewards = self.test(num_test_episodes=5,
                                                visualize=self.visualize)
                print('{0} +- {1}'.format(score, std))
                self.writer.add_summary(
                    self.sess.run(self.test_summary,
                                  feed_dict={self.test_score: score}),
                    episode / 30)
                self.saver.save(self.sess,
                                self.model_path + '/data.chkp',
                                global_step=self.training_metadata.episode)

                with open(self.model_path + '/trainlog.txt', "a+") as log_file:
                    log_file.write('%f %f %f %f %f \n' %
                                   (score, std, episode, alpha, epsilon))

            self.writer.add_summary(
                self.sess.run(self.training_summary,
                              feed_dict={self.avg_q: avg_q}), episode)

    # Chooses an action with respect to an epsilon-greedy policy.
    # - state      Tensor representing a single state
    # - epsilon    Number in [0, 1]
    # Output       Integer in the range 0...self.action_size-1 representing an action
    def get_action(self, state, epsilon):
        # Performing epsilon-greedy action selection
        if random.random() < epsilon:
            return self.env.sample_action_space()
        else:
            return self.sess.run(self.Q_argmax,
                                 feed_dict={self.state_tf: [state]})[0]

    def get_oracle_action(self, env):
        env = env.env
        a = 4  # default action when no correction is needed

        # current pose and speed of the car
        car_x = env.car.hull.position[0]
        car_y = env.car.hull.position[1]
        car_angle = -env.car.hull.angle
        car_vel = np.linalg.norm(env.car.hull.linearVelocity)

        # aims a few tiles ahead of the first road segment not yet visited
        target_seg = 0
        for i in range(len(env.road)):
            if not env.road[i].road_visited:
                target_seg = min(i + 3, len(env.road) - 1)
                break

        target_loc = env.nav_tiles[target_seg]
        #env.highlight_loc = target_loc

        # signed heading error towards the target tile, wrapped into (-pi, pi]
        angle_to = np.arctan2(target_loc[0] - car_x,
                              target_loc[1] - car_y) - car_angle
        angle_to = (angle_to + 2 * np.pi) % (2 * np.pi)
        if angle_to > np.pi:
            angle_to -= 2 * np.pi

        # more than 2 below the 35-unit target speed: switch to action 2
        vel_err = 35 - car_vel
        if vel_err > 2:
            a = 2

        # heading error beyond the damped threshold: switch to a steering action
        if angle_to < -0.15 * self.damping_mult:
            a = 0

        if angle_to > 0.15 * self.damping_mult:
            a = 1

        # no correction was needed: lower the steering threshold (floored at its
        # base value); otherwise raise it to damp steering oscillations
        if a == 4:
            self.damping_mult /= 1.5
            self.damping_mult = max(self.damping_mult, 1)
        else:
            self.damping_mult *= 1.2

        return a

    # Tests the model
    def test(self, num_test_episodes, visualize):
        rewards = []
        for episode in range(num_test_episodes):
            done = False
            state_lazy = self.env.reset(test=True)
            #input()
            self.env.render()

            state = np.array(state_lazy)
            episode_reward = 0
            max_reward = float('-inf')
            while not done:
                if visualize:
                    self.env.render()
                action = self.get_action(state, epsilon=0)
                next_state_lazy, reward, done, info = self.env.step(action,
                                                                    test=True)
                state = np.array(next_state_lazy)
                episode_reward += reward
                done = info['true_done']

                if (self.env.env.t > 30):
                    print("Ended due to time limit")
                    done = True

            rewards.append(episode_reward)
            print(episode_reward)
        return np.mean(rewards), np.std(rewards), rewards

    # Estimates the average maximal Q-value over a fixed set of reference states (self.q_grid)
    def estimate_avg_q(self):
        if not self.q_grid:
            return 0
        return np.average(
            np.amax(self.sess.run(self.Q_value,
                                  feed_dict={self.state_tf: self.q_grid}),
                    axis=1))

    # loads a model trained in a previous session
    # - path:   String, giving the path to the checkpoint file to be loaded
    def load(self, path):
        self.saver.restore(self.sess, path)
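For completeness, constructing, training, and testing the agent could look like the sketch below; every hyperparameter value and the model name are illustrative assumptions rather than settings from the original project.

# Hypothetical usage of CarAgent; all values are placeholders.
agent = CarAgent(batch_size=64,
                 memory_capacity=100000,
                 num_episodes=1000,
                 learning_rate_drop_frame_limit=250000,
                 target_update_frequency=1000,
                 model_name='car_dqn_run1',
                 visualize=False)
agent.train(imitation=False)
score, std, rewards = agent.test(num_test_episodes=5, visualize=False)

The terminal-state masking in experience_replay can also be checked on its own with toy numbers:

# Standalone check of the DDQN target formula (toy values, no environment needed).
import numpy as np

reward_batch = np.array([1.0, -1.0])
done_batch = np.array([False, True])   # the second transition is terminal
Q_batch = np.array([5.0, 7.0])         # Q_target(s', greedy action), made up
discount = 0.99
y_batch = reward_batch + discount * np.multiply(np.invert(done_batch), Q_batch)
print(y_batch)  # -> approximately [5.95, -1.0]; terminal transitions keep only the reward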