Example #1
    def __init__(self, critic=None, actor=None):
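        # Actor-critic setup for CartPole-v0: the actor network takes an
        # observation and outputs one value per action, while the critic scores
        # an observation concatenated with a one-hot encoded action.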
        self.env = gym.make('CartPole-v0')

        self.actor = Actor(model=actor) if actor else \
            Actor.init_model(2, self.env.observation_space.shape[0],
                             64, self.env.action_space.n)

        self.critic = Critic(model=critic) if critic else \
            Critic.init_model(2, self.env.observation_space.shape[0]
                              + self.env.action_space.n, 64)

        self.actor_variables = self.actor.model.trainable_variables
        self.critic_variables = self.critic.model.trainable_variables

        self.actor_opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
        self.critic_opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

        self.actions_dim = self.env.action_space.n
        self.discount_factor = 0.99
        self.e = 0.01
        self.episode_length = 0

        self.states = []
        self.rewards = []
        self.gradients = []
        self.actions = []
Example #2
def test_critic_B():
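    # Sanity check: train the critic on noisy targets and verify it recovers
    # the conditional means (roughly val_1 for input [1, 0], val_2 for [0, 1]).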
    length = 100000
    val_1 = 0.3
    val_2 = 0.5

    critic = Critic.init_model(2, 2, 100)
    critic.model.compile(optimizer='adam',
                         loss=tf.keras.losses.MeanSquaredError())

    x_train = []
    y_train = []
    for i in range(length):
        if random() > 0.5:
            x_train.append([1, 0])
            y_train.append(np.random.normal(val_1, 0.5, 1)[0])
        else:
            x_train.append([0, 1])
            y_train.append(np.random.normal(val_2, 0.5, 1)[0])

    pairs = list(zip(x_train, y_train))
    shuffle(pairs)

    x_train = np.array([a for a, _ in pairs])
    y_train = np.array([b for _, b in pairs])

    critic.model.fit(x_train, y_train, epochs=5)

    r = critic.get_q(np.array([1, 0]))
    print('estimate:', r.numpy()[0][0],
          'abs error:', abs(r - val_1).numpy()[0][0])
    r = critic.get_q(np.array([0, 1]))
    print('estimate:', r.numpy()[0][0],
          'abs error:', abs(r - val_2).numpy()[0][0])
Example #3
def test_critic_A():
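    # Sanity check: with a single repeated input, the critic should converge
    # to the mean of the noisy targets (val).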
    length = 200000
    val = 0.3

    critic = Critic.init_model(2, 4, 100)
    critic.model.compile(optimizer='adam',
                         loss=tf.keras.losses.MeanSquaredError())

    x_train = np.array([[0.7, 0.1, 0, 0.5]]*length)
    y_train = np.random.normal(val, 1, length)

    critic.model.fit(x_train, y_train, epochs=5)

    r = critic.get_q(np.array([0.7, 0.1, 0, 0.5]))
    print('estimate:', r.numpy()[0][0])
    assert abs(r - val) < 0.01
Example #4
    def __init__(self, tau=0.05, burn_in_eps=30, critic=None, actor=None):
        self.env = gym.make('LunarLanderContinuous-v2')
        self.memory = Memory(batch_size=120)
        self.tau = tau
        self.burn_in_eps = burn_in_eps
        self.eps = 0
        self.actions_dim = self.env.action_space.shape[0]

        self.high_action = 1
        self.low_action = -1

        self.discount_factor = 0.99
        self.episode_length = 0
        self.actor_learning_rate = 0.00001
        self.critic_learning_rate = 0.00001
        self.exploration_value = 0.2

        self.actor = ContinuousActor(model=actor) if actor else \
            ContinuousActor.init_model(2, self.env.observation_space.shape[0],
                                       400, self.env.action_space.shape[0])

        self.critic = Critic(model=critic) if critic else \
            Critic.init_model(2, self.env.observation_space.shape[0]
                              + self.env.action_space.shape[0], 400)

        if actor:
            self.target_actor = ContinuousActor(model=actor)
        else:
            self.target_actor = ContinuousActor.init_model(
                2, self.env.observation_space.shape[0], 400,
                self.env.action_space.shape[0])
            self.target_actor.model.set_weights(self.actor.model.get_weights())

        if critic:
            self.target_critic = Critic(model=critic)
        else:
            self.target_critic = Critic.init_model(
                2, self.env.observation_space.shape[0] +
                self.env.action_space.shape[0], 400)
            self.target_critic.model\
                .set_weights(self.critic.model.get_weights())

        self.actor_variables = self.actor.model.trainable_variables
        self.critic_variables = self.critic.model.trainable_variables

        self.actor_opt = tf.keras.optimizers\
            .Adam(learning_rate=self.actor_learning_rate)
        self.critic_opt = tf.keras.optimizers\
            .Adam(learning_rate=self.critic_learning_rate)
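
The replay buffer above is only used through Memory(batch_size=...), remember(), sample(), full(), and check(); its implementation is not part of these examples. A minimal NumPy-backed sketch matching that interface is given below; the capacity, the storage layout, and the no-op check() are assumptions, not the original code.

import numpy as np


class Memory:
    # Hypothetical fixed-size replay buffer matching the calls made above.
    def __init__(self, batch_size, capacity=100000):  # capacity is an assumption
        self.batch_size = batch_size
        self.capacity = capacity
        self.buffer = []

    def remember(self, state, action, reward, done, next_state):
        # Drop the oldest transition once the buffer is at capacity.
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append((np.asarray(state, dtype=np.float64),
                            np.asarray(action, dtype=np.float64),
                            float(reward), float(done),
                            np.asarray(next_state, dtype=np.float64)))

    def full(self):
        # "Full" here means: enough transitions stored for one training batch.
        return len(self.buffer) >= self.batch_size

    def sample(self):
        # Uniform random minibatch, returned as stacked NumPy arrays.
        idx = np.random.choice(len(self.buffer), self.batch_size, replace=False)
        batch = [self.buffer[i] for i in idx]
        states, actions, rewards, done, next_states = \
            [np.array(column) for column in zip(*batch)]
        return states, actions, rewards, done, next_states

    def check(self):
        # Called once per episode in the trainers above; its purpose is not
        # shown, so this sketch leaves it as a no-op hook.
        pass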
Example #5
class Trainer:
    def __init__(self, critic=None):
        self.env = gym.make('CartPole-v0')
        self.Q = Critic(model=critic) if critic else \
            Critic.init_model(2, self.env.observation_space.shape[0]
                              + self.env.action_space.n, 64)

        self.opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
        self.actions_dim = self.env.action_space.n
        self.variables = self.Q.model.trainable_variables
        self.discount_factor = 0.99
        self.e = 0.01
        self.episode_length = 0

        self.states = []
        self.rewards = []
        self.gradients = []
        self.actions = []

    def record_episode(self, iterations):
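        # One training episode: act greedily on the current Q estimates and
        # minimize the squared TD error against reward + gamma * max_a' Q(s', a').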
        done = False
        self.episode_length = 0
        state = self.env.reset()
        iterations = 200 if iterations is None else iterations

        while not done and self.episode_length < iterations:
            self.episode_length += 1

            with tf.GradientTape() as tape:
                action_Qs = self.get_action_vals(state)
                action = np.argmax(action_Qs)
                Q_estimate = action_Qs[action]

                state, reward, done, _ = self.env.step(action)

                reward = 1 if not done else -1
                future_Q = max(tf.stop_gradient(self.get_action_vals(state)))
                final_state = 0 if done else 1
                target = reward + self.discount_factor * future_Q * final_state
                q_loss = tf.math.pow(Q_estimate - target, 2)

            grads = tape.gradient(q_loss, self.variables)

            # print('--------------------------------------')
            # print('done: ', done)
            # print('reward: ', reward)
            # print('delta:', delta.numpy()[0][0])
            # print('Q_estimate:', Q_estimate.numpy()[0][0])
            # print('future_Q:', future_Q.numpy()[0][0])

            # grads = [grad*delta[0] for grad in grads]
            self.opt.apply_gradients(zip(grads, self.variables))

        self.env.close()
        return self.episode_length

    def get_action_vals(self, state):
        Q = []
        all_actions_one_hot = tf.one_hot(list(range(self.actions_dim)),
                                         self.actions_dim)
        for one_hot_action in all_actions_one_hot:
            Q_a = self.Q.get_Q(np.array(state), one_hot_action)
            Q.append(Q_a)
        return Q
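
A minimal driver for this Trainer is sketched below; the loop, the episode count, and the printout are assumptions rather than part of the original example.

if __name__ == '__main__':
    # Assumes the same imports the example relies on (gym, numpy, tensorflow).
    trainer = Trainer()
    for episode in range(500):  # episode count is an assumption
        length = trainer.record_episode(200)
        print('episode', episode, 'survived', length, 'steps')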
Example #6
def test_critic():
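    # Smoke test: query a freshly initialized critic for the value of state
    # [0, 0] paired with the one-hot encoding of action 1.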
    critic = Critic.init_model(2, 4, 100)
    one_hot_action = tf.one_hot([1], 2)
    Q = critic.get_V(np.array([0, 0]), one_hot_action)
    print(Q)
Example #7
class Trainer:
    def __init__(self, critic=None, actor=None):
        self.env = gym.make('CartPole-v0')

        self.actor = Actor(model=actor) if actor else \
            Actor.init_model(2, self.env.observation_space.shape[0],
                             64, self.env.action_space.n)

        self.critic = Critic(model=critic) if critic else \
            Critic.init_model(2, self.env.observation_space.shape[0]
                              + self.env.action_space.n, 64)

        self.actor_variables = self.actor.model.trainable_variables
        self.critic_variables = self.critic.model.trainable_variables

        self.actor_opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
        self.critic_opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

        self.actions_dim = self.env.action_space.n
        self.discount_factor = 0.99
        self.e = 0.01
        self.episode_length = 0

        self.states = []
        self.rewards = []
        self.gradients = []
        self.actions = []

    def record_episode(self, iterations):
        done = False
        self.episode_length = 0
        state = self.env.reset()
        iterations = 200 if iterations is None else iterations

        while not done and self.episode_length < iterations:
            self.episode_length += 1

            with tf.GradientTape() as actor_tape, \
                    tf.GradientTape() as critic_tape:
                policy = self.actor.get_policy(state)
                action = self.sample_action(policy)
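                # Cross-entropy of the sampled action against the policy logits;
                # its gradient, scaled by the critic's Q-value below, gives the
                # policy-gradient update for the actor.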
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=policy, labels=np.array([action]))

                # quality of this action
                q_val = self.get_action_q_value(state, action)
                # q_diff = tf.stop_gradient(self.get_future_Q(state))

                state, reward, done, _ = self.env.step(action)

                reward = 1 if not done else -1
                future_Q = tf.stop_gradient(self.get_future_Q(state))
                finished = 0 if done else 1
                target = reward + self.discount_factor * future_Q * finished
                temp_diff = q_val - target
                # q_loss = tf.math.pow(temp_diff, 2)

                # if done:
                print('target:', target.numpy()[0])
                print('temp_diff:', temp_diff.numpy()[0])
                print('q_val:', q_val.numpy()[0])

            actor_grads = actor_tape.gradient(loss, self.actor_variables)
            critic_grads = critic_tape.gradient(q_val, self.critic_variables)

            # first update the actor model with the q_val as weight.
            actor_grads = [grad * q_val[0] for grad in actor_grads]
            self.actor_opt \
                .apply_gradients(zip(actor_grads, self.actor_variables))

            # second update the critic model with the target val:
            # reward = 1 if not done else -1
            # future_Q = self.get_future_Q(state)
            # final_state = 1 if done else 0
            # target = reward + self.discount_factor * future_Q * final_state
            # target_delta = q_val - target
            critic_grads = [grad * temp_diff[0] for grad in critic_grads]
            self.critic_opt \
                .apply_gradients(zip(critic_grads,
                                     self.critic_variables))

            # print('--------------------------------------')
            # print('done: ', done)
            # print('reward: ', reward)
            # print('delta:', target_delta.numpy()[0][0])
            # print('Q_estimate:', q_val.numpy()[0][0])
            # print('future_Q:', future_Q.numpy()[0][0])

        self.env.close()
        return self.episode_length

    def sample_action(self, policy):
        if np.random.rand(1) < self.e:
            action = self.env.action_space.sample()
        else:
            soft_max_prob = tf.nn.softmax(policy)
            action = np.random \
                .choice([0, 1], p=soft_max_prob.numpy()[0])
        return action

    def get_action_q_value(self, state, action):
        one_hot_action = tf.one_hot(action, self.actions_dim)
        return self.critic.get_Q(np.array(state), one_hot_action)

    def get_q_values(self, state):
        action_Qs = []
        all_actions_one_hot = tf.one_hot(list(range(self.actions_dim)),
                                         self.actions_dim)
        for one_hot_action in all_actions_one_hot:
            Q_a = self.critic.get_Q(np.array(state), one_hot_action)
            action_Qs.append(Q_a)
        return action_Qs

    def get_future_Q(self, state):
        action_Qs = []
        all_actions_one_hot = tf.one_hot(list(range(self.actions_dim)),
                                         self.actions_dim)
        for one_hot_action in all_actions_one_hot:
            Q_a = self.critic.get_Q(np.array(state), one_hot_action)
            action_Qs.append(Q_a)
        return max(action_Qs)
Example #8
class Trainer:
    def __init__(self, tau=0.05, burn_in_eps=30, critic=None, actor=None):
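        # DDPG-style trainer for LunarLanderContinuous-v2: a deterministic actor,
        # a Q-critic, slowly tracking target copies of both (Polyak factor tau),
        # and a replay Memory sampled in minibatches.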
        self.env = gym.make('LunarLanderContinuous-v2')
        self.memory = Memory(batch_size=120)
        self.tau = tau
        self.burn_in_eps = burn_in_eps
        self.eps = 0
        self.actions_dim = self.env.action_space.shape[0]

        self.high_action = 1
        self.low_action = -1

        self.discount_factor = 0.99
        self.episode_length = 0
        self.actor_learning_rate = 0.00001
        self.critic_learning_rate = 0.00001
        self.exploration_value = 0.2

        self.actor = ContinuousActor(model=actor) if actor else \
            ContinuousActor.init_model(2, self.env.observation_space.shape[0],
                                       400, self.env.action_space.shape[0])

        self.critic = Critic(model=critic) if critic else \
            Critic.init_model(2, self.env.observation_space.shape[0]
                              + self.env.action_space.shape[0], 400)

        if actor:
            self.target_actor = ContinuousActor(model=actor)
        else:
            self.target_actor = ContinuousActor.init_model(
                2, self.env.observation_space.shape[0], 400,
                self.env.action_space.shape[0])
            self.target_actor.model.set_weights(self.actor.model.get_weights())

        if critic:
            self.target_critic = Critic(model=critic)
        else:
            self.target_critic = Critic.init_model(
                2, self.env.observation_space.shape[0] +
                self.env.action_space.shape[0], 400)
            self.target_critic.model\
                .set_weights(self.critic.model.get_weights())

        self.actor_variables = self.actor.model.trainable_variables
        self.critic_variables = self.critic.model.trainable_variables

        self.actor_opt = tf.keras.optimizers\
            .Adam(learning_rate=self.actor_learning_rate)
        self.critic_opt = tf.keras.optimizers\
            .Adam(learning_rate=self.critic_learning_rate)

    def run_episode(self):
        done = False
        self.eps = self.eps + 1
        self.episode_length = 0
        state = self.env.reset()
        reward_sum = 0
        success_count = 0
        while not done:
            self.episode_length += 1
            action = self.actor.get_action(state)
            action = action + tf.random\
                .normal([2], mean=0.0,
                        stddev=self.exploration_value,
                        dtype=tf.dtypes.float64)
            action = tf.clip_by_value(action,
                                      clip_value_min=self.low_action,
                                      clip_value_max=self.high_action)
            next_state, reward, done, _ = self.env.step(action)
            self.memory.remember(state, action, reward, done, next_state)

            reward_sum = reward_sum + reward
            success = self.train()
            if success:
                success_count = success_count + 1
            state = next_state

        self.env.close()
        self.memory.check()
        return reward_sum/self.episode_length, \
            success_count/self.episode_length, \
            self.episode_length

    def train(self):
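        # One gradient step from a replay batch: the critic minimizes the squared
        # TD error against the target networks; after burn_in_eps episodes the
        # actor is updated to maximize the critic's Q (by minimizing its negative).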
        success = True
        if self.memory.full():
            success = False
            states, actions, rewards, done, next_states = self.memory.sample()

            with tf.GradientTape() as actor_tape, \
                    tf.GradientTape() as critic_tape:
                Q_loss = self\
                    .Q_loss(states, actions, rewards, next_states, done)
                if self.eps > self.burn_in_eps:
                    action_loss = tf.math.negative(self.action_loss(states))

            if self.eps > self.burn_in_eps:
                actor_grads = actor_tape\
                    .gradient(action_loss, self.actor_variables)
                self.actor_opt \
                    .apply_gradients(zip(actor_grads, self.actor_variables))

            critic_grads = critic_tape\
                .gradient(Q_loss, self.critic_variables)
            self.critic_opt \
                .apply_gradients(zip(critic_grads,
                                 self.critic_variables))

            if self.eps > self.burn_in_eps:
                updated_action_loss = tf.math\
                    .negative(self.action_loss(states)).numpy()
                if updated_action_loss > action_loss:
                    success = True

            self.target_actor.track_weights(self.tau, self.actor.model)
            self.target_critic.track_weights(self.tau, self.critic.model)
        return success

    def Q_loss(self, states, actions, rewards, next_states, done):
        next_actions = self.target_actor.model(next_states)
        Q_input = tf.concat([next_states, next_actions], axis=1)
        # (1 - done) is reshaped to a column so it broadcasts per sample;
        # this assumes Memory returns done as a flat array like rewards.
        y = rewards[:, None] + self.discount_factor * (1 - done)[:, None] \
            * self.target_critic.model(Q_input)
        Q_input = tf.concat([states, actions], axis=1)
        td_error = tf.stop_gradient(y) - self.critic.model(Q_input)
        squared_error = tf.pow(td_error, 2)
        return tf.reduce_mean(squared_error)

    def action_loss(self, states):
        actions = self.actor.model(states)
        Q_input = tf.concat([states, actions], axis=1)
        mean = tf.reduce_mean(self.critic.model(Q_input))
        return mean
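
Both target networks above are updated through track_weights(self.tau, online_model), a method of the Actor/Critic wrappers that is not shown in these examples. A minimal Polyak-averaging sketch of such a method follows, assuming each wrapper exposes its Keras network as .model; the body is an assumption, not the original implementation.

    def track_weights(self, tau, other_model):
        # Soft target update: target <- tau * online + (1 - tau) * target.
        new_weights = [tau * online + (1.0 - tau) * target
                       for online, target in zip(other_model.get_weights(),
                                                 self.model.get_weights())]
        self.model.set_weights(new_weights)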
Example #9
    def __init__(self,
                 tau=0.005,
                 burn_in_eps=0,
                 critics=[None, None],
                 actor=None):
        self.env = gym.make('LunarLanderContinuous-v2')
        self.memory = Memory(batch_size=64)
        self.tau = tau
        self.burn_in_eps = burn_in_eps
        self.eps = 0
        self.actions_dim = self.env.action_space.shape[0]
        self.discount_factor = 0.999
        self.episode_length = 0
        self.actor_learning_rate = 0.00001
        self.critic_learning_rate = 0.0001
        self.exploration_value = 0.2
        self.smoothing_var = 0.05
        self.clipping_val = 0.4
        self.low_action = -1
        self.high_action = 1
        self.policy_freq = 4
        self.actor_hidden_layers = 2
        self.critic_hidden_layers = 2
        self.layer_units = 200
        self.max_ep_steps = 300

        self.actor = ContinuousActor(model=actor) if actor else \
            ContinuousActor.init_model(self.actor_hidden_layers,
                                       self.env.observation_space.shape[0],
                                       self.layer_units,
                                       self.env.action_space.shape[0])

        self.critic_1 = Critic(model=critics[0]) if all(critics) else \
            Critic.init_model(self.critic_hidden_layers,
                              self.env.observation_space.shape[0]
                              + self.env.action_space.shape[0],
                              self.layer_units)

        self.critic_2 = Critic(model=critics[1]) if all(critics) else \
            Critic.init_model(self.critic_hidden_layers,
                              self.env.observation_space.shape[0]
                              + self.env.action_space.shape[0],
                              self.layer_units)

        self.target_actor = ContinuousActor \
            .init_model(self.actor_hidden_layers,
                        self.env.observation_space.shape[0],
                        self.layer_units, self.env.action_space.shape[0])
        self.target_actor.model.set_weights(self.actor.model.get_weights())

        self.target_critic_1 = Critic \
            .init_model(self.critic_hidden_layers,
                        self.env.observation_space.shape[0]
                        + self.env.action_space.shape[0], self.layer_units)
        self.target_critic_1.model\
            .set_weights(self.critic_1.model.get_weights())

        self.target_critic_2 = Critic \
            .init_model(self.critic_hidden_layers,
                        self.env.observation_space.shape[0]
                        + self.env.action_space.shape[0], self.layer_units)
        self.target_critic_2.model\
            .set_weights(self.critic_2.model.get_weights())

        self.actor_variables = self.actor.model.trainable_variables
        self.critic_1_variables = self.critic_1.model.trainable_variables
        self.critic_2_variables = self.critic_2.model.trainable_variables

        self.actor_opt = tf.keras.optimizers\
            .Adam(learning_rate=self.actor_learning_rate)
        self.critic_opt = tf.keras.optimizers\
            .Adam(learning_rate=self.critic_learning_rate)
Example #10
class Trainer:
    def __init__(self,
                 tau=0.005,
                 burn_in_eps=0,
                 critics=[None, None],
                 actor=None):
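        # TD3-style trainer: twin critics trained against a shared target,
        # target-policy smoothing with clipped Gaussian noise (smoothing_var /
        # clipping_val), and actor/target updates gated by policy_freq after a
        # burn-in period.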
        self.env = gym.make('LunarLanderContinuous-v2')
        self.memory = Memory(batch_size=64)
        self.tau = tau
        self.burn_in_eps = burn_in_eps
        self.eps = 0
        self.actions_dim = self.env.action_space.shape[0]
        self.discount_factor = 0.999
        self.episode_length = 0
        self.actor_learning_rate = 0.00001
        self.critic_learning_rate = 0.0001
        self.exploration_value = 0.2
        self.smoothing_var = 0.05
        self.clipping_val = 0.4
        self.low_action = -1
        self.high_action = 1
        self.policy_freq = 4
        self.actor_hidden_layers = 2
        self.critic_hidden_layers = 2
        self.layer_units = 200
        self.max_ep_steps = 300

        self.actor = ContinuousActor(model=actor) if actor else \
            ContinuousActor.init_model(self.actor_hidden_layers,
                                       self.env.observation_space.shape[0],
                                       self.layer_units,
                                       self.env.action_space.shape[0])

        self.critic_1 = Critic(model=critics[0]) if all(critics) else \
            Critic.init_model(self.critic_hidden_layers,
                              self.env.observation_space.shape[0]
                              + self.env.action_space.shape[0],
                              self.layer_units)

        self.critic_2 = Critic(model=critics[1]) if all(critics) else \
            Critic.init_model(self.critic_hidden_layers,
                              self.env.observation_space.shape[0]
                              + self.env.action_space.shape[0],
                              self.layer_units)

        self.target_actor = ContinuousActor \
            .init_model(self.actor_hidden_layers,
                        self.env.observation_space.shape[0],
                        self.layer_units, self.env.action_space.shape[0])
        self.target_actor.model.set_weights(self.actor.model.get_weights())

        self.target_critic_1 = Critic \
            .init_model(self.critic_hidden_layers,
                        self.env.observation_space.shape[0]
                        + self.env.action_space.shape[0], self.layer_units)
        self.target_critic_1.model\
            .set_weights(self.critic_1.model.get_weights())

        self.target_critic_2 = Critic \
            .init_model(self.critic_hidden_layers,
                        self.env.observation_space.shape[0]
                        + self.env.action_space.shape[0], self.layer_units)
        self.target_critic_2.model\
            .set_weights(self.critic_2.model.get_weights())

        self.actor_variables = self.actor.model.trainable_variables
        self.critic_1_variables = self.critic_1.model.trainable_variables
        self.critic_2_variables = self.critic_2.model.trainable_variables

        self.actor_opt = tf.keras.optimizers\
            .Adam(learning_rate=self.actor_learning_rate)
        self.critic_opt = tf.keras.optimizers\
            .Adam(learning_rate=self.critic_learning_rate)

    def run_episode(self):
        done = False
        self.eps = self.eps + 1
        self.episode_length = 0
        state = self.env.reset()
        reward_sum = 0

        while not done and self.episode_length < self.max_ep_steps:
            self.episode_length += 1
            action = self.actor.get_action(state)
            action = action + tf.random\
                .normal([2], mean=0.0,
                        stddev=self.exploration_value,
                        dtype=tf.dtypes.float64)
            action = tf.clip_by_value(action,
                                      clip_value_min=self.low_action,
                                      clip_value_max=self.high_action)
            next_state, reward, done, _ = self.env.step(action)
            self.memory.remember(state, action, reward, done, next_state)

            reward_sum = reward_sum + reward
            self.train()
            state = next_state
        self.env.close()

        return reward_sum, self.episode_length

    def train(self):
        if self.memory.full():
            states, actions, rewards, done, next_states = self.memory.sample()

            with tf.GradientTape() as actor_tape, \
                    tf.GradientTape() as critic_tape_1, \
                    tf.GradientTape() as critic_tape_2:
                y = self.compute_target(states, actions, rewards, next_states,
                                        done)

                Q_input = tf.concat([states, actions], axis=1)

                td_err_1 = tf.stop_gradient(y) - self.critic_1.model(Q_input)
                squared_error_1 = tf.pow(td_err_1, 2)
                Q_loss_1 = tf.reduce_mean(squared_error_1)

                td_err_2 = tf.stop_gradient(y) - self.critic_2.model(Q_input)
                squared_error_2 = tf.pow(td_err_2, 2)
                Q_loss_2 = tf.reduce_mean(squared_error_2)

                if self.update_policy:
                    action_loss = tf.math.negative(self.action_loss(states))

            critic_1_grads = critic_tape_1\
                .gradient(Q_loss_1, self.critic_1_variables)
            critic_2_grads = critic_tape_2\
                .gradient(Q_loss_2, self.critic_2_variables)

            self.critic_opt \
                .apply_gradients(zip(critic_1_grads,
                                 self.critic_1_variables))
            self.critic_opt \
                .apply_gradients(zip(critic_2_grads,
                                 self.critic_2_variables))

            if self.update_policy:
                actor_grads = actor_tape\
                    .gradient(action_loss, self.actor_variables)
                self.actor_opt \
                    .apply_gradients(zip(actor_grads, self.actor_variables))

                self.target_actor.track_weights(self.tau, self.actor.model)
                self.target_critic_1 \
                    .track_weights(self.tau, self.critic_1.model)
                self.target_critic_2 \
                    .track_weights(self.tau, self.critic_2.model)

            if self.update_policy:
                return Q_loss_1.numpy(), Q_loss_2.numpy(), action_loss
            else:
                return Q_loss_1.numpy(), Q_loss_2.numpy(), 0
        return 0, 0, 0

    @property
    def update_policy(self):
        return self.eps > self.burn_in_eps and \
                self.episode_length % self.policy_freq == 0

    def compute_target(self, states, actions, rewards, next_states, done):
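        # TD3 target: perturb the target actor's action with clipped Gaussian
        # noise, then bootstrap from the smaller of the two target critics,
        # i.e. y = r + gamma * (1 - done) * min(Q1', Q2').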
        next_actions = self.target_actor.model(next_states)
        smoothing_noise = tf.random\
            .normal(actions.shape, mean=0.0,
                    stddev=self.smoothing_var,
                    dtype=tf.dtypes.float64)
        clipped_smoothing_noise = \
            tf.clip_by_value(smoothing_noise,
                             clip_value_min=-self.clipping_val,
                             clip_value_max=self.clipping_val)
        next_actions = tf.clip_by_value(clipped_smoothing_noise + next_actions,
                                        clip_value_min=self.low_action,
                                        clip_value_max=self.high_action)

        Q_input = tf.concat([next_states, next_actions], axis=1)
        Q_1_val = self.target_critic_1.model(Q_input)
        Q_2_val = self.target_critic_2.model(Q_input)
        Q_val = tf.math.minimum(Q_1_val, Q_2_val)
        # (1 - done) reshaped to a column, assuming done is a flat array like rewards.
        y = rewards[:, None] \
            + self.discount_factor * (1 - done)[:, None] * Q_val
        return y

    def action_loss(self, states):
        actions = self.actor.model(states)
        Q_input = tf.concat([states, actions], axis=1)
        mean = tf.reduce_mean(self.critic_1.model(Q_input))
        return mean
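
A minimal training loop for this Trainer is sketched below; the episode count, the 20-episode running average, and the printout are assumptions rather than part of the original example.

if __name__ == '__main__':
    # Assumes the same imports the example relies on (gym, numpy, tensorflow).
    trainer = Trainer()
    recent_rewards = []
    for episode in range(1000):  # episode count is an assumption
        reward_sum, steps = trainer.run_episode()
        recent_rewards = (recent_rewards + [reward_sum])[-20:]
        print('episode', episode, 'steps', steps,
              'reward', round(reward_sum, 1),
              'avg(20)', round(sum(recent_rewards) / len(recent_rewards), 1))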