Example #1
    def __init__(self,
                 batch_size,
                 memory_capacity,
                 num_episodes,
                 learning_rate_drop_frame_limit,
                 target_update_frequency,
                 seeds=[104, 106, 108],
                 discount=0.99,
                 delta=1,
                 model_name=None,
                 visualize=False):

        self.env = CarEnvironment(seed=seeds)
        self.architecture = NeuralNet()
        self.explore_rate = Basic_Explore_Rate()
        self.learning_rate = Basic_Learning_Rate()
        self.model_path = os.path.dirname(
            os.path.realpath(__file__)) + '/models/' + model_name
        self.log_path = self.model_path + '/log'
        self.visualize = visualize
        self.damping_mult = 1

        self.initialize_tf_variables()

        self.target_update_frequency = target_update_frequency
        self.discount = discount
        self.replay_memory = Replay_Memory(memory_capacity, batch_size)
        self.training_metadata = Training_Metadata(
            frame=0,
            frame_limit=learning_rate_drop_frame_limit,
            episode=0,
            num_episodes=num_episodes)

        self.delta = delta
        document_parameters(self)
Example #2
    def __init__(self):
        self.action_size = 3
        self.state_size = 2000000000
        self.qtable = np.zeros((self.state_size, self.action_size))

        self.total_episodes = 10000  # Total episodes
        self.learning_rate = 0.8  # Learning rate
        self.max_steps = 10000  # Max steps per episode
        self.gamma = 0.95  # Discounting rate

        # Exploration parameters
        self.epsilon = 1.0  # Exploration rate
        self.max_epsilon = 1.0  # Exploration probability at start
        self.min_epsilon = 0.01  # Minimum exploration probability
        self.decay_rate = 0.005  # Exponential decay rate for exploration prob

        self.env = CarEnvironment()

        self.train()
Example #3
    def __init__(self):
        # Set to False to let the agent play
        self.training = False

        self.state_size = 5
        self.action_size = 3

        self.max_episodes = 500
        self.learning_rate = 0.01
        self.gamma = 0.95

        self.init_networks()
        self.init_tensorboard()

        self.env = CarEnvironment()

        self.saver = tf.train.Saver()

        if self.training:
            self.train()
        else:
            self.play()
Example #4
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    get_session(config=config)

    env = CarEnvironment()

    return env
Example #5
class PGN():
    def __init__(self):
        # Set to False to let the agent play
        self.training = False

        self.state_size = 5
        self.action_size = 3

        self.max_episodes = 500
        self.learning_rate = 0.01
        self.gamma = 0.95

        self.init_networks()
        self.init_tensorboard()

        self.env = CarEnvironment()

        self.saver = tf.train.Saver()

        if self.training:
            self.train()
        else:
            self.play()

    """
  Initialize all the networks
  """

    def init_networks(self):
        with tf.name_scope("inputs"):
            self.input_ = tf.placeholder(tf.float32, [None, self.state_size],
                                         name="input_")
            self.actions = tf.placeholder(tf.int32, [None, self.action_size],
                                          name="actions")
            self.discounted_episode_rewards_ = tf.placeholder(
                tf.float32, [
                    None,
                ], name="discounted_episode_rewards")

            self.mean_reward_ = tf.placeholder(tf.float32, name="mean_reward")

            with tf.name_scope("fc1"):
                self.fc1 = tf.contrib.layers.fully_connected(
                    inputs=self.input_,
                    num_outputs=10,
                    activation_fn=tf.nn.relu,
                    weights_initializer=tf.contrib.layers.xavier_initializer())

            with tf.name_scope("fc2"):
                self.fc2 = tf.contrib.layers.fully_connected(
                    inputs=self.fc1,
                    num_outputs=self.action_size,
                    activation_fn=tf.nn.relu,
                    weights_initializer=tf.contrib.layers.xavier_initializer())

            with tf.name_scope("fc3"):
                self.fc3 = tf.contrib.layers.fully_connected(
                    inputs=self.fc2,
                    num_outputs=self.action_size,
                    activation_fn=None,
                    weights_initializer=tf.contrib.layers.xavier_initializer())

            with tf.name_scope("softmax"):
                self.action_distribution = tf.nn.softmax(self.fc3)

            with tf.name_scope("loss"):
                self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(
                    logits=self.fc3, labels=self.actions)
                self.loss = tf.reduce_mean(self.neg_log_prob *
                                           self.discounted_episode_rewards_)

            with tf.name_scope("train"):
                self.train_opt = tf.train.AdamOptimizer(
                    self.learning_rate).minimize(self.loss)

    """
  Set up tensorboard
  """

    def init_tensorboard(self):
        # Setup TensorBoard Writer
        self.writer = tf.summary.FileWriter("tensorboard/pg/1")

        ## Losses
        tf.summary.scalar("Loss", self.loss)

        ## Reward mean
        tf.summary.scalar("Reward_mean", self.mean_reward_)

        self.write_op = tf.summary.merge_all()

    def discount_and_normalize_rewards(self, episode_rewards):
        discounted_episode_rewards = np.zeros_like(episode_rewards)
        cumulative = 0.0
        for i in reversed(range(len(episode_rewards))):
            cumulative = cumulative * self.gamma + episode_rewards[i]
            discounted_episode_rewards[i] = cumulative

        mean = np.mean(discounted_episode_rewards)
        std = np.std(discounted_episode_rewards)
        discounted_episode_rewards = (discounted_episode_rewards -
                                      mean) / (std)

        return discounted_episode_rewards

    def train(self):
        allRewards = []
        total_rewards = 0
        total_dist = 0
        all_dist = []
        maximumRewardRecorded = 0
        episode = 0
        episode_states, episode_actions, episode_rewards = [], [], []

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            for episode in range(self.max_episodes):

                episode_rewards_sum = 0

                # Launch the game
                state = self.env.reset()

                while True:

                    # Choose action a; remember we're not in a deterministic environment, we output probabilities
                    action_probability_distribution = sess.run(
                        self.action_distribution,
                        feed_dict={self.input_: state.reshape([1, 5])})

                    # Sample an action according to the policy's probabilities
                    action = np.random.choice(
                        range(action_probability_distribution.shape[1]),
                        p=action_probability_distribution.ravel())

                    # Perform a
                    new_state, reward, dist, done = self.env.step(action)

                    total_dist += dist

                    # Store s, a, r
                    episode_states.append(state)

                    # The loss expects a one-hot action vector, e.g. [0., 1., 0.]
                    # for the second action, rather than just the action index
                    action_ = np.zeros(self.action_size)
                    action_[action] = 1

                    episode_actions.append(action_)

                    episode_rewards.append(reward)
                    if done:
                        # Calculate sum reward
                        episode_rewards_sum = np.sum(episode_rewards)

                        allRewards.append(episode_rewards_sum)

                        total_rewards = np.sum(allRewards)

                        # Mean reward
                        mean_reward = np.divide(total_rewards, episode + 1)

                        maximumRewardRecorded = np.amax(allRewards)

                        print("==========================================")
                        print("Episode: ", episode)
                        print("Reward: ", episode_rewards_sum)
                        print("Mean Reward", mean_reward)
                        print("Max reward so far: ", maximumRewardRecorded)

                        # Calculate discounted reward
                        discounted_episode_rewards = self.discount_and_normalize_rewards(
                            episode_rewards)

                        # Feedforward, gradient and backpropagation
                        loss_, _ = sess.run(
                            [self.loss, self.train_opt],
                            feed_dict={
                                self.input_:
                                np.vstack(np.array(episode_states)),
                                self.actions:
                                np.vstack(np.array(episode_actions)),
                                self.discounted_episode_rewards_:
                                discounted_episode_rewards
                            })

                        # Write TF Summaries
                        summary = sess.run(
                            self.write_op,
                            feed_dict={
                                self.input_:
                                np.vstack(np.array(episode_states)),
                                self.actions:
                                np.vstack(np.array(episode_actions)),
                                self.discounted_episode_rewards_:
                                discounted_episode_rewards,
                                self.mean_reward_: mean_reward
                            })

                        self.writer.add_summary(summary, episode)
                        self.writer.flush()

                        # Reset the transition stores
                        episode_states, episode_actions, episode_rewards = [],[],[]

                        all_dist.append([episode, total_dist])

                        total_dist = 0

                        break

                    state = new_state

                # Save Model
                self.saver.save(sess, "./models/model.ckpt")
                print("Model saved")

                a = np.asarray(all_dist)
                np.savetxt("test.csv", a, delimiter=',')

    def play(self):
        with tf.Session() as sess:
            self.env.reset()
            rewards = []

            # Load the model
            self.saver.restore(sess, "./models/model.ckpt")

            for episode in range(10):
                state = self.env.reset()
                step = 0
                done = False
                total_rewards = 0
                print("****************************************************")
                print("EPISODE ", episode)

                while True:

                    # Choose action a; remember we're not in a deterministic environment, we output probabilities
                    action_probability_distribution = sess.run(
                        self.action_distribution,
                        feed_dict={self.input_: state.reshape([1, 5])})
                    #print(action_probability_distribution)
                    # Sample an action according to the policy's probabilities
                    action = np.random.choice(
                        range(action_probability_distribution.shape[1]),
                        p=action_probability_distribution.ravel())

                    new_state, reward, dist, done = self.env.step(action)

                    total_rewards += reward

                    if done:
                        rewards.append(total_rewards)
                        print("Score", total_rewards)
                        break
                    state = new_state
            print("Score over time: " + str(sum(rewards) / 10))
Example #6
        processed_batch = batch.astype('float32') / 255.
        return processed_batch

    def process_reward(self, reward):
        return np.clip(reward, -1., 1.)

parser = argparse.ArgumentParser()
parser.add_argument('--mode', choices=['train', 'test'], default='train')
parser.add_argument('--env-name', type=str, default='BreakoutDeterministic-v4')
parser.add_argument('--weights', type=str, default=None)
args = parser.parse_args()



# Get the environment and extract the number of actions.
env = CarEnvironment() 
np.random.seed(123)
nb_actions = env.action_space.n

# Next, we build our model. We use the same model that was described by Mnih et al. (2015).
input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
model = Sequential()
if K.image_dim_ordering() == 'tf':
    # (width, height, channels)
    model.add(Permute((2, 3, 1), input_shape=input_shape))
elif K.image_dim_ordering() == 'th':
    # (channels, width, height)
    model.add(Permute((1, 2, 3), input_shape=input_shape))
else:
    raise RuntimeError('Unknown image_dim_ordering.')
model.add(Convolution2D(32, (8, 8), strides=(4, 4)))
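The snippet above is truncated after the first convolution. For context, the Mnih et al. (2015) network it refers to continues in the standard keras-rl Atari example roughly as follows; this is a sketch that assumes Activation, Flatten, and Dense are imported from keras.layers, not the author's actual continuation:

model.add(Activation('relu'))
model.add(Convolution2D(64, (4, 4), strides=(2, 2)))
model.add(Activation('relu'))
model.add(Convolution2D(64, (3, 3), strides=(1, 1)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))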
Example #7
class CarAgent:
    def __init__(self,
                 batch_size,
                 memory_capacity,
                 num_episodes,
                 learning_rate_drop_frame_limit,
                 target_update_frequency,
                 seeds=[104, 106, 108],
                 discount=0.99,
                 delta=1,
                 model_name=None,
                 visualize=False):

        self.env = CarEnvironment(seed=seeds)
        self.architecture = NeuralNet()
        self.explore_rate = Basic_Explore_Rate()
        self.learning_rate = Basic_Learning_Rate()
        self.model_path = os.path.dirname(
            os.path.realpath(__file__)) + '/models/' + model_name
        self.log_path = self.model_path + '/log'
        self.visualize = visualize
        self.damping_mult = 1

        self.initialize_tf_variables()

        self.target_update_frequency = target_update_frequency
        self.discount = discount
        self.replay_memory = Replay_Memory(memory_capacity, batch_size)
        self.training_metadata = Training_Metadata(
            frame=0,
            frame_limit=learning_rate_drop_frame_limit,
            episode=0,
            num_episodes=num_episodes)

        self.delta = delta
        document_parameters(self)

    # Sets up the TensorFlow graph - called from __init__
    def initialize_tf_variables(self):
        # Setting up game specific variables
        self.state_size = self.env.state_space_size
        self.action_size = self.env.action_space_size
        self.state_shape = self.env.state_shape
        self.q_grid = None

        # Tf placeholders - feeds data into neural net from outside
        self.state_tf = tf.placeholder(shape=self.state_shape,
                                       dtype=tf.float32,
                                       name='state_tf')
        self.action_tf = tf.placeholder(shape=[None, self.action_size],
                                        dtype=tf.float32,
                                        name='action_tf')
        self.y_tf = tf.placeholder(dtype=tf.float32, name='y_tf')
        self.alpha = tf.placeholder(dtype=tf.float32, name='alpha')
        self.test_score = tf.placeholder(dtype=tf.float32, name='test_score')
        self.avg_q = tf.placeholder(dtype=tf.float32, name='avg_q')

        # Keep track of episode and frames
        # Variables are used to store information about neural net
        self.episode = tf.Variable(initial_value=0,
                                   trainable=False,
                                   name='episode')
        self.frames = tf.Variable(initial_value=0,
                                  trainable=False,
                                  name='frames')
        self.increment_frames_op = tf.assign(self.frames,
                                             self.frames + 1,
                                             name='increment_frames_op')
        self.increment_episode_op = tf.assign(self.episode,
                                              self.episode + 1,
                                              name='increment_episode_op')

        # Operations
        # NAME                      DESCRIPTION                                         FEED DEPENDENCIES
        # Q_value                   Value of Q at given state(s)                        state_tf
        # Q_argmax                  Action(s) maximizing Q at given state(s)            state_tf
        # Q_amax                    Maximal action value(s) at given state(s)           state_tf
        # Q_value_at_action         Q value at specific (action, state) pair(s)         state_tf, action_tf
        # onehot_greedy_action      One-hot encodes greedy action(s) at given state(s)  state_tf
        self.Q_value = self.architecture.evaluate(self.state_tf,
                                                  self.action_size)
        self.Q_argmax = tf.argmax(self.Q_value, axis=1, name='Q_argmax')
        self.Q_amax = tf.reduce_max(self.Q_value, axis=1, name='Q_max')
        self.Q_value_at_action = tf.reduce_sum(tf.multiply(
            self.Q_value, self.action_tf),
                                               axis=1,
                                               name='Q_value_at_action')
        self.onehot_greedy_action = tf.one_hot(self.Q_argmax,
                                               depth=self.action_size)

        # Training related
        # NAME                          FEED DEPENDENCIES
        # loss                          y_tf, state_tf, action_tf
        # train_op                      y_tf, state_tf, action_tf, alpha
        self.loss = tf.losses.huber_loss(self.y_tf, self.Q_value_at_action)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.alpha)
        self.train_op = self.optimizer.minimize(self.loss,
                                                name='train_minimize')

        # Tensorflow session setup
        self.saver = tf.train.Saver(max_to_keep=None)
        config = tf.ConfigProto()
        config.allow_soft_placement = True
        config.gpu_options.allow_growth = False
        config.log_device_placement = False
        self.sess = tf.Session(config=config)
        self.trainable_variables = tf.trainable_variables()
        print(self.trainable_variables)

        # Tensorboard setup
        self.writer = tf.summary.FileWriter(self.log_path)
        self.writer.add_graph(self.sess.graph)
        test_score = tf.summary.scalar("Training score",
                                       self.test_score,
                                       collections=None,
                                       family=None)
        avg_q = tf.summary.scalar("Average Q-value",
                                  self.avg_q,
                                  collections=None,
                                  family=None)
        self.training_summary = tf.summary.merge([avg_q])
        self.test_summary = tf.summary.merge([test_score])
        # subprocess.Popen(['tensorboard', '--logdir', self.log_path])

        # Initialising variables and finalising graph
        self.sess.run(tf.global_variables_initializer())
        self.fixed_target_weights = self.sess.run(self.trainable_variables)

        self.sess.graph.finalize()

    # Performs one step of batch gradient descent on the DDQN loss function.
    # alpha = learning rate
    def experience_replay(self, alpha):

        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.replay_memory.get_mini_batch(
            self.training_metadata)

        # get argmax of q-network
        greedy_actions = self.sess.run(
            self.onehot_greedy_action,
            feed_dict={self.state_tf: next_state_batch})

        y_batch = [None] * self.replay_memory.batch_size
        fixed_feed_dict = {
            self.state_tf: next_state_batch,
            self.action_tf: greedy_actions
        }
        fixed_feed_dict.update(
            zip(self.trainable_variables, self.fixed_target_weights))
        # fixed_feed_dict.update()

        Q_batch = self.sess.run(self.Q_value_at_action,
                                feed_dict=fixed_feed_dict)

        y_batch = reward_batch + self.discount * np.multiply(
            np.invert(done_batch), Q_batch)

        feed = {
            self.state_tf: state_batch,
            self.action_tf: action_batch,
            self.y_tf: y_batch,
            self.alpha: alpha
        }
        self.sess.run(self.train_op, feed_dict=feed)

    # Updates weights of target network
    def update_fixed_target_weights(self):
        self.fixed_target_weights = self.sess.run(self.trainable_variables)

    # Trains the model
    def train(self, imitation=False):
        while self.sess.run(
                self.episode) < self.training_metadata.num_episodes:

            # grab the current episode number from the TF graph
            episode = self.sess.run(self.episode)
            self.training_metadata.increment_episode()
            # increment the episode counter in the TF graph
            self.sess.run(self.increment_episode_op)

            # set up car environment
            state_lazy = self.env.reset()
            self.env.render()

            done = False
            epsilon = self.explore_rate.get(self.training_metadata)
            alpha = self.learning_rate.get(self.training_metadata)

            print("Episode {0}/{1} \t Epsilon: {2} \t Alpha: {3}".format(
                episode, self.training_metadata.num_episodes, epsilon, alpha))
            print("Replay Memory: %d" % self.replay_memory.length())
            episode_frame = 0

            max_reward = float('-inf')

            while True:

                # Update target weights every update frequency
                if self.training_metadata.frame % self.target_update_frequency == 0 and (
                        self.training_metadata.frame != 0):
                    self.update_fixed_target_weights()

                # Choose and perform action and update replay memory

                if random.random() < epsilon:
                    if imitation:
                        action = self.get_oracle_action(self.env)
                    else:
                        action = self.env.sample_action_space()
                else:
                    action = self.get_action(np.array(state_lazy), 0)

                next_state_lazy, reward, done, info = self.env.step(action)

                if self.visualize:
                    self.env.render()

                episode_frame += 1

                self.replay_memory.add(self, state_lazy, action, reward,
                                       next_state_lazy, done)

                # Train with replay memory if populated
                if self.replay_memory.length(
                ) > 10 * self.replay_memory.batch_size:
                    self.sess.run(self.increment_frames_op)
                    self.training_metadata.increment_frame()
                    self.experience_replay(alpha)

                avg_q = self.estimate_avg_q()

                state_lazy = next_state_lazy
                done = info['true_done']

                abs_reward = self.env.get_total_reward()
                max_reward = max(max_reward, abs_reward)

                if max_reward - abs_reward > 5 or done:
                    print("Episode reward:", abs_reward)
                    break

            # Saving tensorboard data and model weights
            if (episode % 30 == 0) and (episode != 0):
                score, std, rewards = self.test(num_test_episodes=5,
                                                visualize=self.visualize)
                print('{0} +- {1}'.format(score, std))
                self.writer.add_summary(
                    self.sess.run(self.test_summary,
                                  feed_dict={self.test_score: score}),
                    episode / 30)
                self.saver.save(self.sess,
                                self.model_path + '/data.chkp',
                                global_step=self.training_metadata.episode)

                file = open(self.model_path + '/trainlog.txt', "a+")
                printstr = '%f %f %f %f %f \n' % (score, std, episode, alpha,
                                                  epsilon)
                file.write(printstr)
                file.close()

            self.writer.add_summary(
                self.sess.run(self.training_summary,
                              feed_dict={self.avg_q: avg_q}), episode)

    # Chooses action wrt an e-greedy policy.
    # - state      Tensor representing a single state
    # - epsilon    Number in (0,1)
    # Output       Integer in the range 0...self.action_size-1 representing an action
    def get_action(self, state, epsilon):
        # Performing epsilon-greedy action selection
        if random.random() < epsilon:
            return self.env.sample_action_space()
        else:
            return self.sess.run(self.Q_argmax,
                                 feed_dict={self.state_tf: [state]})[0]

    def get_oracle_action(self, env):
        env = env.env
        a = 4

        car_x = env.car.hull.position[0]
        car_y = env.car.hull.position[1]
        car_angle = -env.car.hull.angle
        car_vel = np.linalg.norm(env.car.hull.linearVelocity)

        target_seg = 0
        for i in range(len(env.road)):
            if not env.road[i].road_visited:
                target_seg = min(i + 3, len(env.road) - 1)
                break

        target_loc = env.nav_tiles[target_seg]
        #env.highlight_loc = target_loc
        angle_to = np.arctan2(target_loc[0] - car_x,
                              target_loc[1] - car_y) - car_angle
        angle_to = (angle_to + 2 * np.pi) % (2 * np.pi)

        if angle_to > np.pi:
            angle_to -= 2 * np.pi

        vel_err = 35 - car_vel
        if vel_err > 2:
            a = 2

        if angle_to < -0.15 * self.damping_mult:
            a = 0

        if angle_to > 0.15 * self.damping_mult:
            a = 1

        if a == 4:
            self.damping_mult /= 1.5
            self.damping_mult = max(self.damping_mult, 1)
        else:
            self.damping_mult *= 1.2

        return a

    # Tests the model
    def test(self, num_test_episodes, visualize):
        rewards = []
        for episode in range(num_test_episodes):
            done = False
            state_lazy = self.env.reset(test=True)
            #input()
            self.env.render()

            state = np.array(state_lazy)
            episode_reward = 0
            max_reward = float('-inf')
            while not done:
                if visualize:
                    self.env.render()
                action = self.get_action(state, epsilon=0)
                next_state_lazy, reward, done, info = self.env.step(action,
                                                                    test=True)
                state = np.array(next_state_lazy)
                episode_reward += reward
                done = info['true_done']

                if (self.env.env.t > 30):
                    print("Ended due to time limit")
                    done = True

            rewards.append(episode_reward)
            print(episode_reward)
        return np.mean(rewards), np.std(rewards), rewards

    # average Q-value over some number of fixed tracks
    def estimate_avg_q(self):
        if not self.q_grid:
            return 0
        return np.average(
            np.amax(self.sess.run(self.Q_value,
                                  feed_dict={self.state_tf: self.q_grid}),
                    axis=1))

    # loads a model trained in a previous session
    # - path:   String, giving the path to the checkpoint file to be loaded
    def load(self, path):
        self.saver.restore(self.sess, path)
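The CarAgent class above is self-contained apart from its helpers (NeuralNet, Replay_Memory, and so on). Based only on the constructor and method signatures shown, a plausible way to drive it looks like the sketch below; the hyperparameter values are illustrative assumptions, not the author's settings:

# Hypothetical values, chosen only to illustrate the call signature
agent = CarAgent(batch_size=64,
                 memory_capacity=100000,
                 num_episodes=1000,
                 learning_rate_drop_frame_limit=250000,
                 target_update_frequency=1000,
                 model_name='car_dqn',
                 visualize=False)

agent.train(imitation=False)  # fills replay memory, then trains the DDQN
score, std, rewards = agent.test(num_test_episodes=5, visualize=False)
print('{0} +- {1}'.format(score, std))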
Example #8
class QN():
    def __init__(self):
        self.action_size = 3
        self.state_size = 2000000000
        self.qtable = np.zeros((self.state_size, self.action_size))

        self.total_episodes = 10000  # Total episodes
        self.learning_rate = 0.8  # Learning rate
        self.max_steps = 10000  # Max steps per episode
        self.gamma = 0.95  # Discounting rate

        # Exploration parameters
        self.epsilon = 1.0  # Exploration rate
        self.max_epsilon = 1.0  # Exploration probability at start
        self.min_epsilon = 0.01  # Minimum exploration probability
        self.decay_rate = 0.005  # Exponential decay rate for exploration prob

        self.env = CarEnvironment()

        self.train()

    def train(self):
        rewards = []
        all_dist = []
        total_dist = 0

        # 2. Run episodes until learning is stopped
        for episode in range(self.total_episodes):
            # Reset the environment
            state = self.env.reset()
            state = 0
            step = 0
            done = False
            total_rewards = 0

            for step in range(self.max_steps):
                # 3. Choose an action a in the current world state (s)
                ## First we randomize a number
                exp_exp_tradeoff = random.uniform(0, 1)

                ## If this number is greater than epsilon --> exploitation (taking the biggest Q value for this state)
                if exp_exp_tradeoff > self.epsilon:
                    action = np.argmax(self.qtable[state, :])

                # Else doing a random choice --> exploration
                else:
                    action = random.randint(0, 2)

                # Take the action (a) and observe the outcome state(s') and reward (r)
                s, reward, dist, done = self.env.step(action)

                x = ""
                for i in np.array(s).flatten():
                    x += str(int(round(i)))

                print("State: " + x)
                new_state = int(x)

                # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
                # qtable[new_state,:] : all the actions we can take from new state
                self.qtable[state, action] = self.qtable[
                    state, action] + self.learning_rate * (
                        reward + self.gamma * np.max(self.qtable[new_state, :])
                        - self.qtable[state, action])

                total_rewards += reward
                total_dist += dist

                # Our new state is state
                state = new_state

                # If the episode is done, finish it
                if done:
                    print(f"Ep ended: {episode}")
                    break

            # Reduce epsilon (because we need less and less exploration)
            self.epsilon = self.min_epsilon + (self.max_epsilon -
                                               self.min_epsilon) * np.exp(
                                                   -self.decay_rate * episode)
            rewards.append(total_rewards)
            all_dist.append(total_dist)

            a = np.asarray(all_dist)
            np.savetxt("ql.csv", a, delimiter=',')