Example no. 1
def main():
    """main method
    
    log runtime and print it at the end
    """
    s_time = timeit.default_timer()     
    global iteration
    env = TorcsEnv(vision=False, throttle=True, gear_change=False)
    memory = ReplayBuffer()
    epsilon = 1
    train_indicator = True
    modelPATH = os.path.join('.',"models",'E0011.pt')

    q,q_target = QNet(state_dim,action_dim),QNet(state_dim,action_dim)
    q_target.load_state_dict(q.state_dict())
    mu, mu_target = MuNet(state_dim), MuNet(state_dim)
    mu_target.load_state_dict(mu.state_dict())
    steer_noise = OUN(np.zeros(1),theta = 0.6)
    accel_noise = OUN(np.zeros(1),theta = 0.6)
    mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu)
    q_optimizer  = optim.Adam(q.parameters(), lr=lr_q)

    #tensorboard writer
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.join("logs", "ddpg_torch", current_time+'E0011t')
    writer = SummaryWriter(log_dir)
    samplestate = torch.rand(1,29)
    sampleaction = torch.rand(1,2)

    #writer.add_graph(mu,samplestate)
    writer.add_graph(q,(samplestate,sampleaction))
    writer.flush()  # the writer is reused below, so flush rather than close it here

    if not train_indicator:
        mu = torch.load(modelPATH)
        mu.eval()
        ob = env.reset()
        score = 0
        for n_step in range(100000):
            s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
            a_t = mu(torch.from_numpy(s_t.reshape(1,-1)).float()).detach().numpy()
            ob,r_t,done,_ = env.step(a_t[0])
            score += r_t
            if done:
                print("score:",score)
                break
        env.end()
        return 0

    for n_epi in range(max_episode):
        print("Episode : " + str(n_epi) + " Replay Buffer " + str(memory.size()))
        if np.mod(n_epi, 3) == 0:
            ob = env.reset(relaunch=True)   # relaunch TORCS every 3 episodes to work around a memory leak
        else:
            ob = env.reset()
        a_t = np.zeros([1,action_dim])
        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        score = 0
        q_value_writer(q, mu, s_t, writer, 'Episode Start Q value')
        q_value_writer(q_target, mu_target, s_t, writer, 'Episode Start target Q value')
        #t_start = timeit.default_timer()
        for n_step in range(max_step):
            #epsilon -= 1.0/EXPLORE
            a_origin = mu(torch.from_numpy(s_t.reshape(1,-1)).float())
            if train_indicator:  # add exploration noise during training
                # sn = max(epsilon, 0) * steer_noise()
                sn = steer_noise()
                # an = max(epsilon, 0) * accel_noise()
                an = accel_noise()
                a_s = a_origin.detach().numpy()[0][0] + sn
                a_t[0][0] = np.clip(a_s, -1, 1)  # clip to the valid steering range
                a_a = a_origin.detach().numpy()[0][1] + an
                a_t[0][1] = np.clip(a_a, 0, 1)  # clip to the valid acceleration range
                # log the exploration noise
                if iteration % 10 == 0:
                    writer.add_scalar('Steer noise', sn, iteration)
                    writer.add_scalar('Accel_noise', an, iteration)
            else:
                a_t = a_origin.detach().numpy()
            ob,r_t,done,_ = env.step(a_t[0])
            score += r_t

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
            memory.put((s_t,a_t[0],r_t,s_t1,done))
            s_temp = copy.deepcopy(s_t) # for end q value log
            s_t = s_t1

            if train_indicator and memory.size()>train_start_size:
                train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer,writer)
                soft_update(mu, mu_target)
                soft_update(q,  q_target)
            
            iteration+=1

            if done:
                q_value_writer(q,mu,s_temp,writer,'Episode End Q value')
                q_value_writer(q_target,mu_target,s_temp,writer,'Episode End target Q value')
                break
        #t_end = timeit.default_timer()
        
        print("TOTAL REWARD @ " + str(n_epi) +"-th Episode  : Reward " + str(score))
        print("Total Step: " + str(n_step))
        print("")
        #print('{}steps, {} time spent'.format(i,t_end-t_start))
    
    torch.save(mu,modelPATH)
    
    env.end()
    
    e_time = timeit.default_timer()
    print("Total step {} and time spent {}".format(iteration, e_time - s_time))
    writer.close()
Example no. 2
class DDPG:
    def __init__(self,
                 env=gym.make('Pendulum-v0'),
                 s_dim=2,
                 a_dim=1,
                 gamma=0.99,
                 episodes=100,
                 tau=0.001,
                 buffer_size=1e06,
                 minibatch_size=64,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 save_name='final_weights',
                 render=False):
        self.save_name = save_name
        self.render = render
        self.env = env
        self.upper_bound = env.action_space.high[0]
        self.lower_bound = env.action_space.low[0]
        self.EPISODES = episodes
        self.MAX_TIME_STEPS = 200
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.GAMMA = gamma
        self.TAU = tau
        self.buffer_size = buffer_size
        self.minibatch_size = minibatch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        self.ou_noise = OUNoise(mean=np.zeros(1))

        self.actor = Actor(self.s_dim, self.a_dim).model()
        self.target_actor = Actor(self.s_dim, self.a_dim).model()
        self.actor_opt = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
        self.target_actor.set_weights(self.actor.get_weights())

        self.critic = Critic(self.s_dim, self.a_dim).model()
        self.critic_opt = tf.keras.optimizers.Adam(
            learning_rate=self.critic_lr)
        self.target_critic = Critic(self.s_dim, self.a_dim).model()
        self.target_critic.set_weights(self.critic.get_weights())

        self.replay_buffer = ReplayBuffer(self.buffer_size)

    def update_target(self):
        # Two methods to update the target actor
        # Method 1:
        self.target_actor.set_weights(
            np.array(self.actor.get_weights()) * self.TAU +
            np.array(self.target_actor.get_weights()) * (1 - self.TAU))
        self.target_critic.set_weights(
            np.array(self.critic.get_weights()) * self.TAU +
            np.array(self.target_critic.get_weights()) * (1 - self.TAU))
        """
        # Method 2:
        new_weights = []
        target_variables = self.target_critic.weights
        for i, variable in enumerate(self.critic.weights):
            new_weights.append(variable * self.TAU + target_variables[i] * (1 - self.TAU))

        self.target_critic.set_weights(new_weights)
        new_weights = []
        target_variables = self.target_actor.weights
        for i, variable in enumerate(self.actor.weights):
            new_weights.append(variable * self.TAU + target_variables[i] * (1 - self.TAU))
        self.target_actor.set_weights(new_weights)
        """

    def train_step(self):
        s_batch, a_batch, r_batch, d_batch, s2_batch = self.replay_buffer.sample_batch(
            self.minibatch_size)
        """
        mu_prime = self.target_actor(s2_batch)  # predictions by target actor
        Q_prime = self.target_critic([s2_batch, mu_prime])  # predictions by target critic
        y = np.zeros_like(Q_prime)
        for k in range(self.minibatch_size):
            if d_batch[k]:
                y[k] = r_batch[k]
            else:
                y[k] = r_batch[k] + self.GAMMA * Q_prime[k]
        # y = r_batch + gamma * Q_prime

        checkpoint_path = "training/cp_critic.ckpt"
        checkpoint_dir = os.path.dirname(checkpoint_path)
        # Create a callback that saves the model's weights
        cp_callback1 = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_dir,
                                                          save_weights_only=True,
                                                          verbose=1)
        self.critic.train_on_batch([s_batch, a_batch], y)
        # self.critic.fit([s_batch, a_batch], y, verbose=0, steps_per_epoch=8, callbacks=[cp_callback1])

        with tf.GradientTape(persistent=True) as tape:
            a = self.actor(s_batch)
            tape.watch(a)
            theta = self.actor.trainable_variables
            q = self.critic([s_batch, a])
        dq_da = tape.gradient(q, a)
        da_dtheta = tape.gradient(a, theta, output_gradients=-dq_da)
        self.actor_opt.apply_gradients(zip(da_dtheta, self.actor.trainable_variables))
        """

        with tf.GradientTape() as tape:
            target_actions = self.target_actor(s2_batch)
            y = r_batch + self.GAMMA * self.target_critic(
                [s2_batch, target_actions])
            critic_value = self.critic([s_batch, a_batch])
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
        critic_grad = tape.gradient(critic_loss,
                                    self.critic.trainable_variables)
        self.critic_opt.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            actions = self.actor(s_batch)
            q = self.critic([s_batch, actions])  # critic_value
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(q)
        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_opt.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))
        self.update_target()
        return np.mean(q)

    def policy(self, s):
        # the actor output is normalized, so it is scaled by upper_bound before exploration noise is added
        if s.ndim == 1:
            s = s[None, :]
        action = self.actor(s) * self.upper_bound + self.ou_noise()
        action = np.clip(action, self.lower_bound, self.upper_bound)
        return action

    def train(self):
        # To store reward history of each episode
        ep_reward_list = []
        # To store average reward history of last few episodes
        avg_reward_list = []
        monitor = Monitor([1, 1], titles=['Reward', 'Loss'], log=2)
        with Loop_handler(
        ) as interruption:  # to properly save even if ctrl+C is pressed
            for eps in range(self.EPISODES):
                episode_reward = 0
                s = self.env.reset()
                """
                if an env is created using the "gym.make" method, it will terminate after 200 steps
                """
                for t in range(self.MAX_TIME_STEPS):
                    # done = False
                    # while not done:
                    if self.render:
                        self.env.render()
                    a = self.policy(s)
                    s_, r, done, _ = self.env.step(a)
                    self.replay_buffer.add(np.reshape(s, (self.s_dim, )),
                                           np.reshape(a, (self.a_dim, )),
                                           r, done,
                                           np.reshape(s_, (self.s_dim, )))
                    episode_reward += r
                    if self.replay_buffer.size() > self.minibatch_size:
                        q = self.train_step()
                    s = s_.reshape(1, -1)
                    if interruption():
                        break
                ep_reward_list.append(episode_reward)
                # Mean of last 40 episodes
                avg_reward = np.mean(ep_reward_list[-40:])
                print("Episode * {} * Avg Reward is ==> {}".format(
                    eps, avg_reward))
                avg_reward_list.append(avg_reward)
                monitor.add_data(avg_reward, q)

            self.save_weights(
                save_name=self.save_name)  # if you want to save weights
            self.plot_results(avg_reward=avg_reward_list, train=True)

    def save_weights(self, save_name='final_weights'):
        self.actor.save_weights("training/%s_actor.h5" % save_name)
        self.critic.save_weights("training/%s_critic.h5" % save_name)
        self.target_actor.save_weights("training/%s_target_actor.h5" %
                                       save_name)
        self.target_critic.save_weights("training/%s_target_critic.h5" %
                                        save_name)

        # also save the target network weights in TensorFlow checkpoint format
        self.target_actor.save_weights('training/%s_actor_weights' % save_name,
                                       save_format='tf')
        self.target_critic.save_weights('training/%s_critic_weights' %
                                        save_name,
                                        save_format='tf')
        print('Training completed and network weights saved')

    # For evaluation of the policy learned
    def collect_data(self, act_net, iterations=1000):
        a_all, states_all = [], []
        obs = self.env.reset()
        for t in range(iterations):
            obs = np.squeeze(obs)
            if obs.ndim == 1:
                a = act_net(obs[None, :])
            else:
                a = act_net(obs)
            obs, _, done, _ = self.env.step(a)
            states_all.append(obs)
            a_all.append(a)
            # self.env.render()  # Uncomment this to see the actor in action (But not in python notebook)
            # if done:
            #     break
        states = np.squeeze(
            np.array(states_all))  # cos(theta), sin(theta), theta_dot
        a_all = np.squeeze(np.array(a_all))
        return states, a_all

    def plot_results(self,
                     avg_reward=None,
                     actions=None,
                     states=None,
                     train=False,
                     title=None):
        # An additional way to visualize the avg episode rewards
        if train:
            plt.figure()
            plt.plot(avg_reward)
            plt.xlabel("Episode")
            plt.ylabel("Avg. Episodic Reward")
            plt.show()
        else:  # work only for Pendulum-v0 environment
            fig, ax = plt.subplots(3, sharex=True)
            theta = np.arctan2(states[:, 1], states[:, 0])
            ax[0].set_ylabel('u')
            ax[0].plot(np.squeeze(actions))
            ax[1].set_ylabel(u'$\\theta$')
            ax[1].plot(theta)
            # ax[1].plot(states[:, 0])
            ax[2].set_ylabel(u'$\\omega$')
            ax[2].plot(states[:, 2])  # ang velocity
            fig.canvas.set_window_title(title)
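# A usage sketch for the class above, assuming the same helper modules it
# imports (Actor, Critic, ReplayBuffer, OUNoise, Monitor, Loop_handler) are
# available; Pendulum-v0 observations are (cos(theta), sin(theta), theta_dot):
if __name__ == '__main__':
    agent = DDPG(env=gym.make('Pendulum-v0'), s_dim=3, a_dim=1,
                 episodes=100, render=False)
    agent.train()
    states, actions = agent.collect_data(agent.target_actor)
    agent.plot_results(actions=actions, states=states, title='Evaluation run')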
Example no. 3
def train(sess, env, actor, critic):
    # Set up summary ops
    summary_ops, summary_vars = build_summaries()

    # Initialize Tensorflow variables
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in xrange(MAX_EPISODES):

        s = env.reset()

        episode_reward = 0
        episode_ave_max_q = 0

        noise = ExplorationNoise.ou_noise(OU_THETA, OU_MU, OU_SIGMA,
                                          MAX_STEPS_EPISODE)
        noise = ExplorationNoise.exp_decay(noise, EXPLORATION_TIME)

        for j in xrange(MAX_STEPS_EPISODE):

            if RENDER_ENV:
                env.render()

            # Add exploratory noise according to Ornstein-Uhlenbeck process to action
            # Decay exploration exponentially from 1 to 0 in EXPLORATION_TIME steps
            if i < EXPLORATION_TIME:
                a = actor.predict(
                    np.reshape(s,
                               (1, env.observation_space.shape[0]))) + noise[j]
            else:
                a = actor.predict(
                    np.reshape(s, (1, env.observation_space.shape[0])))

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, actor.state_dim),
                              np.reshape(a, actor.action_dim), r, terminal,
                              np.reshape(s2, actor.state_dim))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    # If state is terminal assign reward only
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    # Else assign reward + discounted target Q
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = \
                    critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                episode_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                a_grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, a_grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            episode_reward += r

            if terminal or j == MAX_STEPS_EPISODE - 1:
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: episode_reward,
                                           summary_vars[1]: episode_ave_max_q
                                       })

                writer.add_summary(summary_str, i)
                writer.flush()

                print 'Reward: %.2i' % int(episode_reward), ' | Episode', i, \
                      '| Qmax: %.4f' % (episode_ave_max_q / float(j))

                break
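# build_summaries() is referenced above but not shown. A minimal TF1 sketch
# consistent with the two summary_vars fed at the end of each episode (an
# assumption about the original helper):
def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar('Reward', episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar('Qmax Value', episode_ave_max_q)

    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars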
Example no. 4
class DDPG:
    def __init__(self, env, sess, low_action_bound_list,
                 high_action_bound_list):
        self.env = env
        self.sess = sess
        self.low_action_bound_list = low_action_bound_list  # depends on the env
        self.high_action_bound_list = high_action_bound_list
        self.action_range_bound = [
            hi - lo for hi, lo in zip(self.high_action_bound_list,
                                      self.low_action_bound_list)
        ]
        self.learning_rate = 0.0001  #TODO move these to configs
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        self.epsilon_decay = 1e-6
        self.gamma = 0.99
        self.tau = 0.001
        self.buffer_size = 1000000
        self.batch_size = 128
        self.theta = 0.15
        self.ou = 0
        self.sigma = 0.3

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = len(
            self.low_action_bound_list)  # TODO: derive this from self.env.action_space or make it an input
        self.continuous_action_space = True

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(self.buffer_size)

        # Creating ACTOR model
        actor_ = Actor(self.state_dim, self.action_dim, self.learning_rate)
        self.actor_state_input, self.actor_model = actor_.create_actor_model()
        _, self.target_actor_model = actor_.create_actor_model()

        self.actor_critic_grad = tf.placeholder(tf.float32,
                                                [None, self.action_dim])

        actor_model_weights = self.actor_model.trainable_weights
        self.actor_grads = tf.gradients(self.actor_model.output,
                                        actor_model_weights,
                                        -self.actor_critic_grad)

        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(grads)

        # Creating CRITIC model
        critic_ = Critic(self.state_dim, self.action_dim, self.learning_rate)
        self.critic_state_input, self.critic_action_input, self.critic_model = critic_.create_critic_model(
        )
        _, _, self.target_critic_model = critic_.create_critic_model()

        self.critic_grads = tf.gradients(self.critic_model.output,
                                         self.critic_action_input)

        self.noise = OrnsteinUhlenbeckProcess(size=self.action_dim)
        self.noise.reset()

        self.sess.run(tf.global_variables_initializer())

    def __repr__(self):
        return 'DDPG_gamma{}_tau{}'.format(self.gamma, self.tau)

    # TRAINING FUNCTIONS
    def train_actor(self, samples):
        current_states, actions, rewards, next_states, dones = samples

        predicted_actions = self.actor_model.predict(current_states)

        grads = self.sess.run(self.critic_grads,
                              feed_dict={
                                  self.critic_state_input: current_states,
                                  self.critic_action_input: predicted_actions
                              })[0]

        self.sess.run(self.optimize,
                      feed_dict={
                          self.actor_state_input: current_states,
                          self.actor_critic_grad: grads
                      })

        if self.epsilon - self.epsilon_decay > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        self.noise.reset()

    def train_critic(self, samples):
        current_states, actions, rewards, next_states, dones = samples

        target_actions = self.target_actor_model.predict(next_states)
        target_q_values = self.target_critic_model.predict(
            [next_states, target_actions])

        q_targets = rewards + self.gamma * target_q_values * (1 - dones)

        self.critic_model.fit([current_states, actions],
                              q_targets,
                              verbose=0)

    def train(self):
        if self.replay_buffer.size() > self.batch_size:
            samples = self.replay_buffer.sample_batch(self.batch_size)
            self.train_actor(samples)
            self.train_critic(samples)

    # TARGET MODEL UPDATES
    def update_actor_target(self):
        actor_model_weights = self.actor_model.get_weights()
        target_actor_model_weights = self.target_actor_model.get_weights()

        for i in range(len(target_actor_model_weights)):
            target_actor_model_weights[i] = actor_model_weights[
                i] * self.tau + target_actor_model_weights[i] * (1.0 -
                                                                 self.tau)
        self.target_actor_model.set_weights(target_actor_model_weights)

    def update_critic_target(self):
        critic_model_weights = self.critic_model.get_weights()
        target_critic_model_weights = self.target_critic_model.get_weights()

        for i in range(len(target_critic_model_weights)):
            target_critic_model_weights[i] = critic_model_weights[
                i] * self.tau + target_critic_model_weights[i] * (1.0 -
                                                                  self.tau)
        self.target_critic_model.set_weights(target_critic_model_weights)

    def update_target_models(self):
        self.update_actor_target()
        self.update_critic_target()

    # ACTING FUNCTION
    def act(self, current_episode, current_state):
        noise = self.epsilon * self.noise.generate()
        action = self.actor_model.predict(
            current_state
        ) * self.high_action_bound_list + noise  # TODO: add linear mapping for affine action spaces
        return np.clip(action, self.low_action_bound_list,
                       self.high_action_bound_list)
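# The OrnsteinUhlenbeckProcess used for exploration is not part of this
# listing. A minimal sketch matching the generate()/reset() interface used
# above (the parameter defaults are assumptions):
class OrnsteinUhlenbeckProcess:
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.3, dt=1.0):
        self.size = size
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.reset()

    def reset(self):
        # restart the process at its long-run mean
        self.x = np.ones(self.size) * self.mu

    def generate(self):
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
        self.x = self.x + self.theta * (self.mu - self.x) * self.dt \
                 + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size)
        return self.x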
Example no. 5
class DDPG:
    def __init__(self, env, batch_size, mem_size, discount, actor_params,
                 critic_params):
        self._batch_size = batch_size
        self._mem_size = mem_size
        self._discount = discount
        self._sess = tensorflow.Session()
        k_backend.set_session(self._sess)
        self._env = env
        self._state_dim = env.observation_space.shape[0]
        self._action_dim = env.action_space.shape[0]
        self._action_min = env.action_space.low
        self._action_max = env.action_space.high
        self._state_min = env.observation_space.low
        self._state_max = env.observation_space.high
        self._actor = Actor(self._sess, self._state_dim, self._action_dim,
                            self._action_min, self._action_max, actor_params)
        self._critic = Critic(self._sess, 0.5, self._state_dim,
                              self._action_dim, critic_params)
        self._memory = ReplayBuffer(mem_size)

    def get_action(self, state):
        return self._actor._model.predict(state)

    def train(self):
        '''
        No training takes place until the replay buffer contains
        at least batch_size experiences.
        '''
        if self._memory.size() > self._batch_size:
            self._train()

    def _train(self):
        states, actions, rewards, done, next_states = self._memory.sample(
            self._batch_size)
        self._train_critic(states, actions, rewards, done, next_states)
        # the deterministic policy gradient uses the actions proposed by the
        # current actor, not the actions stored in the replay buffer
        predicted_actions = self.get_action(states)
        action_gradients = self._critic.action_gradients(states, predicted_actions)
        self._actor.train(states, action_gradients)

    def q_estimate(self, state, action):
        return self._critic._model.predict([state, action])  # the critic takes [state, action] as inputs

    def _get_q_targets(self, next_states, done, rewards):
        '''
        q = r if done else r + gamma * q_next
        '''
        # use the actor network to choose the next action under the current policy
        # and estimate its Q value with the critic network
        actions = self.get_action(next_states)
        qnext = self.q_estimate(next_states, actions)

        q_targets = [
            reward if end else reward + self._discount * next_q
            for (reward, next_q, end) in zip(rewards, qnext, done)
        ]
        return q_targets

    def _train_critic(self, states, actions, rewards, done, next_states):
        q_targets = self._get_q_targets(next_states, done, rewards)
        self._critic.train(states, actions, q_targets)

    def experience(self, state, action, reward, done, next_state):
        # store in replay buffer
        self._memory.add(state, action, reward, done, next_state)

        self.train()
Example no. 6
class TD3:
    def __init__(self, env, sess, low_action_bound_list,
                 high_action_bound_list):
        self.env = env
        self.sess = sess
        self.low_action_bound_list = low_action_bound_list  # depends on the env
        self.high_action_bound_list = high_action_bound_list
        self.action_range_bound = [
            hi - lo for hi, lo in zip(self.high_action_bound_list,
                                      self.low_action_bound_list)
        ]
        self.learning_rate = 0.0001
        self.exploration_noise = 0.1
        self.gamma = 0.90
        self.tau = 0.01
        self.buffer_size = 10000
        self.batch_size = 128
        self.policy_noise = 0.1
        self.noise_clip = 0.05
        self.exploration_episodes = 10
        # self.policy_freq = 2

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = len(
            self.low_action_bound_list)  # TODO: derive this from self.env.action_space or make it an input
        self.continuous_action_space = True

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(self.buffer_size)

        # Creating ACTOR model
        actor_ = Actor(self.state_dim, self.action_dim, self.learning_rate)
        self.actor_state_input, self.actor_model = actor_.create_actor_model()
        _, self.target_actor_model = actor_.create_actor_model()

        self.actor_critic_grad = tf.placeholder(tf.float32,
                                                [None, self.action_dim])

        actor_model_weights = self.actor_model.trainable_weights
        self.actor_grads = tf.gradients(self.actor_model.output,
                                        actor_model_weights,
                                        -self.actor_critic_grad)

        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(grads)

        # Creating FIRST CRITIC model, this is the one we train/optimize against
        critic_ = Critic(self.state_dim, self.action_dim, self.learning_rate)
        self.critic_state_input, self.critic_action_input, self.critic_model = critic_.create_critic_model(
        )
        self.critic_model.compile(optimizer=Adam(lr=critic_.learning_rate),
                                  loss=None)  # assuming the critic adds its own loss via add_loss

        _, _, self.target_critic_model = critic_.create_critic_model()
        self.target_critic_model.compile(
            optimizer=Adam(lr=critic_.learning_rate), loss=None)

        self.critic_grads = tf.gradients(self.critic_model.output[0],
                                         self.critic_action_input)

        self.sess.run(tf.global_variables_initializer())

    def __repr__(self):
        return 'TD3_gamma{}_tau{}'.format(self.gamma, self.tau)

    # TRAINING FUNCTIONS
    def train_actor(self):
        if self.replay_buffer.size() > self.batch_size:
            samples = self.replay_buffer.sample_batch(self.batch_size)

            current_states, actions, rewards, next_states, dones = samples

            predicted_actions = self.actor_model.predict(
                current_states
            ) * self.high_action_bound_list  #TODO create linear mapping for affine space

            grads = self.sess.run(self.critic_grads,
                                  feed_dict={
                                      self.critic_state_input: current_states,
                                      self.critic_action_input:
                                      predicted_actions
                                  })[0]

            self.sess.run(self.optimize,
                          feed_dict={
                              self.actor_state_input: current_states,
                              self.actor_critic_grad: grads
                          })

    def train_critic(self):
        if self.replay_buffer.size() > self.batch_size:
            samples = self.replay_buffer.sample_batch(self.batch_size)

            current_states, actions, rewards, next_states, dones = samples

            target_actions = self.target_actor_model.predict(
                next_states) * self.high_action_bound_list

            # COMPUTING THE CLIPPED DOUBLE-Q TARGET
            # add clipped noise to the target action for target policy smoothing
            noise = np.random.normal(
                size=len(self.action_range_bound)) * self.policy_noise
            clipped_noise = np.clip(noise, -self.noise_clip, self.noise_clip)

            # add the noise to target_actions and clip back to the valid action range
            target_actions = np.clip((target_actions + clipped_noise),
                                     self.low_action_bound_list,
                                     self.high_action_bound_list)
            target_q1_values, target_q2_values = self.target_critic_model.predict(
                [
                    next_states, target_actions,
                    np.random.rand(self.batch_size, 1)
                ])

            target_q_values = np.minimum(target_q1_values, target_q2_values)

            target_q = rewards + self.gamma * target_q_values * (1 - dones)

            # current_q1, current_q2 = self.critic_model.predict([current_states, actions, np.random.rand(self.batch_size, 1)])

            history = self.critic_model.fit(
                [current_states, actions, target_q], verbose=0)
            # print('Loss: ',history.history['loss'])

    def train(self):
        if self.replay_buffer.size() > self.batch_size:
            # each update step samples its own minibatch
            self.train_actor()
            self.train_critic()

    # TARGET MODEL UPDATES
    def update_actor_target(self):
        actor_model_weights = self.actor_model.get_weights()
        target_actor_model_weights = self.target_actor_model.get_weights()

        for i in range(len(target_actor_model_weights)):
            target_actor_model_weights[i] = actor_model_weights[
                i] * self.tau + target_actor_model_weights[i] * (1.0 -
                                                                 self.tau)
        self.target_actor_model.set_weights(target_actor_model_weights)

    def update_critic_target(self):
        critic_model_weights = self.critic_model.get_weights()
        target_critic_model_weights = self.target_critic_model.get_weights()

        for i in range(len(target_critic_model_weights)):
            target_critic_model_weights[i] = critic_model_weights[
                i] * self.tau + target_critic_model_weights[i] * (1.0 -
                                                                  self.tau)
        self.target_critic_model.set_weights(target_critic_model_weights)

    def update_target_models(self):
        self.update_actor_target()
        self.update_critic_target()

    # ACTING FUNCTION: random actions during the warm-up episodes,
    # then noisy policy actions
    def act(self, current_episode, current_state):
        if current_episode < self.exploration_episodes:
            # uniform sampling already covers the full valid action range
            return np.random.uniform(self.low_action_bound_list,
                                     self.high_action_bound_list)
        else:
            action = self.actor_model.predict(
                current_state) * self.high_action_bound_list + np.random.normal(
                    0, [
                        self.exploration_noise * hi
                        for hi in self.high_action_bound_list
                    ])
            return np.clip(action, self.low_action_bound_list,
                           self.high_action_bound_list)
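# A usage sketch for the TD3 agent above, assuming a Gym continuous-control
# environment and that ReplayBuffer.add stores transitions in the same order
# returned by sample_batch (state, action, reward, next_state, done); the
# loop itself is not part of the original code:
if __name__ == '__main__':
    env = gym.make('Pendulum-v0')
    with tf.Session() as sess:
        agent = TD3(env, sess,
                    list(env.action_space.low), list(env.action_space.high))
        for episode in range(200):
            state = env.reset().reshape(1, -1)
            done = False
            while not done:
                action = agent.act(episode, state)
                next_state, reward, done, _ = env.step(np.ravel(action))
                agent.replay_buffer.add(state.flatten(), np.ravel(action),
                                        reward, next_state, done)
                agent.train()
                agent.update_target_models()
                state = next_state.reshape(1, -1)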