Example #1
def _save_experience(experience: Experience, memory: Memory,
                     next_state: np.ndarray):
    """
    Save a new experience by replaced next state of agent
    :param next_state: nest state of agent
    :param experience: Experience of made action
    :param memory: memory of robot with all saved experiences
    :return: next state which agent will make
    """
    experience = experience._replace(next_state=next_state)
    memory.add(experience)
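
A minimal usage sketch for the helper above. It assumes the project's Experience is a namedtuple-like record with a next_state field and that Memory exposes add(), as the example implies; the tiny stand-in classes below are hypothetical and exist only to make the call runnable in isolation.

from collections import namedtuple

import numpy as np

# Hypothetical stand-ins for the project's Experience and Memory types.
Experience = namedtuple("Experience", ["state", "action", "reward", "next_state"])

class Memory:
    def __init__(self):
        self.entries = []

    def add(self, experience):
        self.entries.append(experience)

memory = Memory()
exp = Experience(state=np.zeros(4), action=0, reward=1.0, next_state=None)
# Once the follow-up state has been observed, patch it in and store the experience.
_save_experience(exp, memory, next_state=np.ones(4))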
Example #2
File: ddpg.py Project: dkd58/DeepRL
class DDPG(Agent):
    def __init__(self, env, monitor_path: str, **usercfg) -> None:
        super(DDPG, self).__init__(**usercfg)
        self.env = env
        self.monitor_path: str = monitor_path

        self.config.update(
            n_episodes=100000,
            n_timesteps=env.spec.tags.get(
                "wrapper_config.TimeLimit.max_episode_steps"),
            actor_learning_rate=1e-4,
            critic_learning_rate=1e-3,
            ou_theta=0.15,
            ou_sigma=0.2,
            gamma=0.99,
            batch_size=64,
            tau=0.001,
            l2_loss_coef=1e-2,
            n_actor_layers=2,
            n_hidden_units=64,
            actor_layer_norm=True,
            critic_layer_norm=False,  # Batch norm for critic does not seem to work
            replay_buffer_size=1e6,
            replay_start_size=10000  # Required number of replay buffer entries to start training
        )
        self.config.update(usercfg)

        self.state_shape: list = list(env.observation_space.shape)
        self.n_actions: int = env.action_space.shape[0]
        self.states = tf.placeholder(tf.float32, [None] + self.state_shape,
                                     name="states")
        self.actions_taken = tf.placeholder(tf.float32, [None, self.n_actions],
                                            name="actions_taken")
        self.critic_target = tf.placeholder(tf.float32, [None, 1],
                                            name="critic_target")
        self.is_training = tf.placeholder(tf.bool, name="is_training")

        with tf.variable_scope("actor"):
            self.action_output, self.actor_vars = self.build_actor_network()

        self.target_action_output, actor_target_update = self.build_target_actor_network(
            self.actor_vars)

        self.q_gradient_input = tf.placeholder("float", [None, self.n_actions],
                                               name="q_grad_input")
        self.actor_policy_gradients = tf.gradients(self.action_output,
                                                   self.actor_vars,
                                                   -self.q_gradient_input,
                                                   name="actor_gradients")
        self.actor_train_op = tf.train.AdamOptimizer(
            self.config["actor_learning_rate"],
            name="actor_optimizer").apply_gradients(
                list(zip(self.actor_policy_gradients, self.actor_vars)))

        with tf.variable_scope("critic"):
            self.q_value_output, self.critic_vars = self.build_critic_network()

        self.target_q_value_output, critic_target_update = self.build_target_critic_network(
            self.critic_vars)

        l2_loss = tf.add_n([
            self.config["l2_loss_coef"] * tf.nn.l2_loss(var)
            for var in self.critic_vars
        ])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.critic_target - self.q_value_output)) + l2_loss
        self.critic_train_op = tf.train.AdamOptimizer(
            self.config["critic_learning_rate"],
            name="critic_optimizer").minimize(self.critic_loss)
        self.action_gradients = tf.gradients(self.q_value_output,
                                             self.actions_taken,
                                             name="action_gradients")

        summaries = []
        for v in self.actor_vars + self.critic_vars:
            summaries.append(tf.summary.histogram(v.name, v))
        self.model_summary_op = tf.summary.merge(summaries)

        self.update_targets_op = tf.group(actor_target_update,
                                          critic_target_update,
                                          name="update_targets")

        self.init_op = tf.global_variables_initializer()

        self.action_noise = OrnsteinUhlenbeckActionNoise(
            self.n_actions, self.config["ou_sigma"], self.config["ou_theta"])

        self.replay_buffer = Memory(int(self.config["replay_buffer_size"]))

        self.n_updates = 0

        self.summary_writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"),
            tf.get_default_graph())

    def build_actor_network(self):
        layer1_size = 400
        layer2_size = 300

        x = self.states
        if self.config["actor_layer_norm"]:
            x = batch_norm_layer(x,
                                 training_phase=self.is_training,
                                 scope_bn="batch_norm_0",
                                 activation=tf.identity)
        with tf.variable_scope("L1"):
            x, l1_vars = linear_fan_in(x, layer1_size)
            if self.config["actor_layer_norm"]:
                x = batch_norm_layer(x,
                                     training_phase=self.is_training,
                                     scope_bn="batch_norm_1",
                                     activation=tf.nn.relu)
        with tf.variable_scope("L2"):
            x, l2_vars = linear_fan_in(x, layer2_size)
            if self.config["actor_layer_norm"]:
                x = batch_norm_layer(x,
                                     training_phase=self.is_training,
                                     scope_bn="batch_norm_2",
                                     activation=tf.nn.relu)

        with tf.variable_scope("L3"):
            W3 = tf.Variable(tf.random_uniform([layer2_size, self.n_actions],
                                               -3e-3, 3e-3),
                             name="w")
            b3 = tf.Variable(tf.random_uniform([self.n_actions], -3e-3, 3e-3),
                             name="b")
            action_output = tf.tanh(tf.nn.xw_plus_b(x, W3, b3))
            l3_vars = [W3, b3]

        return action_output, l1_vars + l2_vars + l3_vars

    def build_target_actor_network(self, actor_vars: list):
        ema = tf.train.ExponentialMovingAverage(decay=1 - self.config["tau"])
        target_update = ema.apply(actor_vars)
        target_net = [ema.average(v) for v in actor_vars]

        x = self.states
        if self.config["actor_layer_norm"]:
            x = batch_norm_layer(x,
                                 training_phase=self.is_training,
                                 scope_bn="target_batch_norm_0",
                                 activation=tf.identity)

        x = tf.nn.xw_plus_b(x, target_net[0], target_net[1])
        if self.config["actor_layer_norm"]:
            x = batch_norm_layer(x,
                                 training_phase=self.is_training,
                                 scope_bn="target_batch_norm_1",
                                 activation=tf.nn.relu)
        x = tf.nn.xw_plus_b(x, target_net[2], target_net[3])
        if self.config["actor_layer_norm"]:
            x = batch_norm_layer(x,
                                 training_phase=self.is_training,
                                 scope_bn="target_batch_norm_2",
                                 activation=tf.nn.relu)

        action_output = tf.tanh(
            tf.nn.xw_plus_b(x, target_net[4], target_net[5]))

        return action_output, target_update

    def build_critic_network(self):
        layer1_size = 400
        layer2_size = 300

        x = self.states
        with tf.variable_scope("L1"):
            if self.config["critic_layer_norm"]:  # Defaults to False (= don't use it)
                x = batch_norm_layer(x,
                                     training_phase=self.is_training,
                                     scope_bn="batch_norm_0",
                                     activation=tf.identity)
            x, l1_vars = linear_fan_in(x, layer1_size)
            x = tf.nn.relu(x)
        with tf.variable_scope("L2"):
            W2 = tf.get_variable(
                "w", [layer1_size, layer2_size],
                initializer=fan_in_initializer(layer1_size + self.n_actions))
            W2_action = tf.get_variable(
                "w_action", [self.n_actions, layer2_size],
                initializer=fan_in_initializer(layer1_size + self.n_actions))
            b2 = tf.get_variable(
                "b", [layer2_size],
                initializer=fan_in_initializer(layer1_size + self.n_actions))
            x = tf.nn.relu(
                tf.matmul(x, W2) + tf.matmul(self.actions_taken, W2_action) +
                b2)
        with tf.variable_scope("L3"):
            W3 = tf.Variable(tf.random_uniform([layer2_size, 1], -3e-3, 3e-3),
                             name="w")
            b3 = tf.Variable(tf.random_uniform([1], -3e-3, 3e-3), name="b")
            q_value_output = tf.nn.xw_plus_b(x, W3, b3, name="q_value")

        return q_value_output, l1_vars + [W2, W2_action, b2, W3, b3]

    def build_target_critic_network(self, critic_vars: list):

        ema = tf.train.ExponentialMovingAverage(decay=1 - self.config["tau"])
        target_update = ema.apply(critic_vars)
        target_net = [ema.average(v) for v in critic_vars]

        x = self.states
        if self.config["critic_layer_norm"]:
            x = batch_norm_layer(x,
                                 training_phase=self.is_training,
                                 scope_bn="batch_norm_0",
                                 activation=tf.identity)
        x = tf.nn.relu(tf.nn.xw_plus_b(x, target_net[0], target_net[1]))
        x = tf.nn.relu(
            tf.matmul(x, target_net[2]) +
            tf.matmul(self.actions_taken, target_net[3]) + target_net[4])
        q_value_output = tf.nn.xw_plus_b(x, target_net[5], target_net[6])

        return q_value_output, target_update

    def actor_gradients(self, state_batch: np.ndarray,
                        action_batch: np.ndarray):
        q, grads = tf.get_default_session().run(
            [self.q_value_output, self.action_gradients],
            feed_dict={
                self.states: state_batch,
                self.actions_taken: action_batch,
                self.is_training: False
            })
        summary = tf.Summary()
        summary.value.add(tag="model/actor_loss",
                          simple_value=float(-np.mean(q)))
        self.summary_writer.add_summary(summary, self.n_updates)
        return grads[0]

    def target_q(self, states: np.ndarray, actions: np.ndarray):
        return tf.get_default_session().run(self.target_q_value_output,
                                            feed_dict={
                                                self.states: states,
                                                self.actions_taken: actions,
                                                self.is_training: False
                                            })

    def q_value(self, states: np.ndarray, actions: np.ndarray):
        return tf.get_default_session().run(self.q_value_output,
                                            feed_dict={
                                                self.states: states,
                                                self.actions_taken: actions,
                                                self.is_training: False
                                            })

    def actions(self, states: np.ndarray) -> np.ndarray:
        """Get the actions for a batch of states."""
        return tf.get_default_session().run(self.action_output,
                                            feed_dict={
                                                self.states: states,
                                                self.is_training: True
                                            })

    def action(self, state: np.ndarray) -> np.ndarray:
        """Get the action for a single state."""
        return tf.get_default_session().run(self.action_output,
                                            feed_dict={
                                                self.states: [state],
                                                self.is_training: False
                                            })[0]

    def target_actions(self, states: np.ndarray) -> np.ndarray:
        """Get the actions for a batch of states using the target actor network."""
        return tf.get_default_session().run(self.target_action_output,
                                            feed_dict={
                                                self.states: states,
                                                self.is_training: True
                                            })

    def train(self):
        sample = self.replay_buffer.get_batch(self.config["batch_size"])

        # Resize actions (stored as scalars when n_actions == 1) to (batch_size, n_actions)
        action_batch = np.resize(sample["actions"],
                                 [self.config["batch_size"], self.n_actions])

        # Calculate critic targets
        next_action_batch = self.target_actions(sample["states1"])
        q_value_batch = self.target_q(sample["states1"], next_action_batch)
        critic_targets = sample["rewards"] + (1 - sample["terminals1"]) * \
            self.config["gamma"] * q_value_batch.squeeze()
        critic_targets = np.resize(
            critic_targets, [self.config["batch_size"], 1]).astype(np.float32)
        # Update critic weights
        fetches = [self.q_value_output, self.critic_loss, self.critic_train_op]
        predicted_q, critic_loss, _ = tf.get_default_session().run(
            fetches,
            feed_dict={
                self.critic_target: critic_targets,
                self.states: sample["states0"],
                self.actions_taken: action_batch,
                self.is_training: True
            })

        summary = tf.Summary()
        summary.value.add(tag="model/critic_loss",
                          simple_value=float(critic_loss))
        summary.value.add(tag="model/predicted_q_mean",
                          simple_value=np.mean(predicted_q))
        summary.value.add(tag="model/predicted_q_std",
                          simple_value=np.std(predicted_q))
        self.summary_writer.add_summary(summary, self.n_updates)

        # Update the actor using the sampled gradient:
        action_batch_for_gradients = self.actions(sample["states0"])
        q_gradient_batch = self.actor_gradients(sample["states0"],
                                                action_batch_for_gradients)

        tf.get_default_session().run(self.actor_train_op,
                                     feed_dict={
                                         self.q_gradient_input: q_gradient_batch,
                                         self.states: sample["states0"],
                                         self.is_training: True
                                     })

        # Update the target networks
        tf.get_default_session().run(
            [self.update_targets_op, self.model_summary_op])
        self.n_updates += 1

    def noise_action(self, state: np.ndarray):
        """Choose an action based on the actor and exploration noise."""
        action = self.action(state)
        return action + self.action_noise()

    def learn(self):
        max_action = self.env.action_space.high
        with tf.Session() as sess, sess.as_default():
            sess.run(self.init_op)
            for episode in range(self.config["n_episodes"]):
                state = self.env.reset()
                episode_reward = 0
                episode_length = 0
                for _ in range(self.config["n_timesteps"]):
                    action = self.noise_action(state)
                    new_state, reward, done, _ = self.env.step(action *
                                                               max_action)
                    episode_length += 1
                    episode_reward += reward
                    self.replay_buffer.add(state, action, reward, new_state,
                                           done)
                    if self.replay_buffer.n_entries > self.config[
                            "replay_start_size"]:
                        self.train()
                    state = new_state
                    if done:
                        self.action_noise.reset()
                        summary = tf.Summary()
                        summary.value.add(tag="global/Episode_length",
                                          simple_value=float(episode_length))
                        summary.value.add(tag="global/Reward",
                                          simple_value=float(episode_reward))
                        self.summary_writer.add_summary(summary, episode)
                        self.summary_writer.flush()
                        break
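
A hedged sketch of how the class above might be driven, assuming the old Gym API it was written against (env.spec.tags, four-element step() returns) and a continuous-action environment; the environment id, monitor path and episode count are placeholders, not values from the repository.

import gym

# Hypothetical driver for the DDPG agent defined above (TF1 graph mode).
env = gym.make("Pendulum-v0")                     # any continuous-action environment
agent = DDPG(env, monitor_path="/tmp/ddpg", n_episodes=500)

# learn() opens its own tf.Session, runs init_op, then loops over episodes:
# act with exploration noise, store transitions in the replay buffer, and call
# train() once replay_start_size entries are available. TensorBoard summaries
# are written under <monitor_path>/summaries.
agent.learn()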
Example #3
File: MbPA.py Project: esgl/MbPA
class MbPA:
    def __init__(self, sess, args):
        with tf.variable_scope(args.model_name):
            self.args = args
            self.learning_rate = args.learning_rate
            self.session = sess

            self.x = tf.placeholder(tf.float32, shape=[None, 784], name="x")
            self.y = tf.placeholder(tf.float32, shape=[None, 10], name="y")
            # self.trainable = tf.placeholder(tf.int32, shape=(), name="trainable")
            self.memory_sample_batch = tf.placeholder(
                tf.int16, shape=(), name="memory_sample_batch")

            self.embed = self.embedding(self.x)

            self.M = Memory(args.memory_size,
                            self.embed.get_shape()[-1],
                            self.y.get_shape()[-1])
            embs_and_values = tf.py_func(self.get_memory_sample,
                                         [self.memory_sample_batch],
                                         [tf.float64, tf.float64])

            self.memory_batch_x = tf.to_float(embs_and_values[0])
            self.memory_batch_y = tf.to_float(embs_and_values[1])
            self.xa = tf.concat(values=[self.embed, self.memory_batch_x],
                                axis=0)
            self.ya = tf.concat(values=[self.y, self.memory_batch_y], axis=0)

            self.y_ = self.output_network(self.xa)

            self.cross_entropy = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=self.ya,
                                                        logits=self.y_))
            self.optim = tf.train.GradientDescentOptimizer(
                self.learning_rate).minimize(self.cross_entropy)
            self.correct_prediction = tf.equal(tf.argmax(self.ya, 1),
                                               tf.argmax(self.y_, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(self.correct_prediction, tf.float32))

            self.session.run(tf.global_variables_initializer())

    def train(self, xs, ys, memory_sample_batch):
        # print(memory_sample_batch)
        embeds, _ = self.session.run([self.embed, self.optim],
                                     feed_dict={
                                         self.x: xs,
                                         self.y: ys,
                                         self.memory_sample_batch: memory_sample_batch
                                     })
        return embeds

    def test(self, xs_test, ys_test):
        acc = self.session.run(self.accuracy,
                               feed_dict={
                                   self.x: xs_test,
                                   self.y: ys_test,
                                   self.memory_sample_batch: 0
                               })
        return acc

    def get_memory_sample(self, batch_size):
        x, y = self.M.sample(batch_size)
        return x, y

    def add_to_memory(self, xs, ys):
        if self.args.sample_add == "normal":
            self.M.add(xs, ys)
        elif self.args.sample_add == "lru":
            self.M.add_lru(xs, ys)
        elif self.args.sample_add == "rand":
            self.M.add_rand(xs, ys)
        elif self.args.sample_add == "knn":
            self.M.add_knn(xs, ys)
        elif self.args.sample_add == "knn_lru":
            self.M.add_knn_lru(xs, ys)
        else:
            raise Exception(
                "Unknown sample_add type; please choose one of "
                "['normal', 'lru', 'rand', 'knn', 'knn_lru']"
            )

    @staticmethod
    def embedding(x):
        out = tf.reshape(x, [-1, 28, 28, 1])
        # convs = [(16, 8, 4), (32, 4, 2)]
        # with tf.variable_scope("conv1"):
        # out = layers.convolution2d(inputs=out,
        #                            num_outputs=16,
        #                            kernel_size=8,
        #                            stride=4,
        #                            trainable=trainable)
        # out = tf.nn.relu(out)
        # out = tf.nn.max_pool(out, ksize=[1, 2, 3, 1], strides=[1, 2, 2, 1], padding="SAME")
        with tf.variable_scope("conv2"):
            # out = layers.convolution2d(inputs=out,
            #                            num_outputs=32,
            #                            kernel_size=4,
            #                            stride=2,
            #                            trainable=trainable)
            # out = tf.nn.relu(out)
            # out = tf.nn.max_pool(out, ksize=[1, 2, 3, 1], strides=[1, 2, 2, 1], padding="SAME")
            embed = layers.flatten(out)
        return embed

    @staticmethod
    def output_network(embed):
        out = embed
        with tf.variable_scope("fc_1"):
            out = layers.fully_connected(inputs=out, num_outputs=1024)
            out = tf.nn.relu(out)
        with tf.variable_scope("fc_2"):
            out = layers.fully_connected(inputs=out, num_outputs=10)
        return out
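
A short sketch of the intended train / store / test cycle, assuming MNIST-style inputs (flattened 784-dimensional images, 10-class one-hot labels) as the placeholders above imply. The Args object and the random batch are hypothetical; only the attributes the class actually reads are provided.

import numpy as np
import tensorflow as tf

class Args:  # hypothetical: just the attributes MbPA reads
    model_name = "mbpa"
    learning_rate = 0.1
    memory_size = 10000
    sample_add = "normal"

with tf.Session() as sess:
    model = MbPA(sess, Args())
    # Random stand-in batch shaped like flattened MNIST digits and one-hot labels.
    xs = np.random.rand(32, 784).astype(np.float32)
    ys = np.eye(10)[np.random.randint(0, 10, size=32)].astype(np.float32)
    embeds = model.train(xs, ys, memory_sample_batch=0)  # 0: no replay from memory yet
    model.add_to_memory(xs, ys)                          # dispatched by args.sample_add
    print("accuracy:", model.test(xs, ys))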
Example #4
class MbPA_KNN_Test:
    def __init__(self, sess, args):
        self.args = args
        self.session = sess
        self.w = {}
        self.eval_w = {}
        with tf.variable_scope(self.args.model_name):
            self.x = tf.placeholder(tf.float32, shape=[None, 784], name="x")
            self.y = tf.placeholder(tf.float32, shape=[None, 10], name="y")
            self.memory_sample_batch = tf.placeholder(
                tf.int16, shape=(), name="memory_sample_batch")
            with tf.variable_scope("training"):
                with tf.variable_scope("embedding"):
                    self.out = tf.reshape(self.x, [-1, 28, 28, 1])
                    with tf.variable_scope("conv"):
                        #         # self.out, self.w["l1_w"], self.w["l1_b"] = conv2d(
                        #         #     x=self.out,
                        #         #     output_dim=16,
                        #         #     kernel_size=[8, 8],
                        #         #     stride=[4, 4],
                        #         #     activation_fn=tf.nn.relu,
                        #         #     name="conv1"
                        #         # )
                        #         # self.out, self.w["l2_w"], self.w["l2_b"] = conv2d(
                        #         #     x=self.out,
                        #         #     output_dim=32,
                        #         #     kernel_size=[4, 4],
                        #         #     stride=[2, 2],
                        #         #     activation_fn=tf.nn.relu,
                        #         #     name="conv2"
                        #         # )
                        self.embed = layers.flatten(self.out)
                #         self.embed_dim = self.embed.get_shape()[-1]
                self.M = Memory(self.args.memory_size,
                                self.x.get_shape()[-1],
                                self.y.get_shape()[-1])
                embs_and_values = tf.py_func(self.get_memory_sample,
                                             [self.memory_sample_batch],
                                             [tf.float64, tf.float64])
                self.memory_batch_x = tf.to_float(embs_and_values[0])
                self.memory_batch_y = tf.to_float(embs_and_values[1])
                self.xa = tf.concat(values=[self.x, self.memory_batch_x],
                                    axis=0)
                self.ya = tf.concat(values=[self.y, self.memory_batch_y],
                                    axis=0)
                with tf.variable_scope("fc"):
                    self.out = self.xa
                    # self.out, self.w["l3_w"], self.w["l3_b"] = linear(
                    #     input_=self.out,
                    #     output_size=1024,
                    #     activation_fn=tf.nn.relu,
                    #     name="fc_1"
                    # )
                    self.out, self.w["l4_w"], self.w["l4_b"] = linear(
                        input_=self.out, output_size=10, name="fc_2")
                    self.ya_ = self.out

                self.cross_entropy = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(labels=self.ya,
                                                            logits=self.ya_))

                self.optim = tf.train.GradientDescentOptimizer(
                    self.args.learning_rate).minimize(self.cross_entropy)
                self.correct_prediction = tf.equal(tf.argmax(self.ya, 1),
                                                   tf.argmax(self.ya_, 1))
                self.accuracy = tf.reduce_mean(
                    tf.cast(self.correct_prediction, tf.float32))

            self.session.run(tf.global_variables_initializer())

    def update_training_to_prediction(self):
        for name in self.eval_w.keys():
            self.t_w_assign_op[name].eval(
                {self.t_w_input[name]: self.w[name].eval()})

    def train(self, xs, ys, memory_sample_batch):
        embeds, _ = self.session.run([self.embed, self.optim],
                                     feed_dict={
                                         self.x: xs,
                                         self.y: ys,
                                         self.memory_sample_batch: memory_sample_batch
                                     })
        return embeds

    def get_memory_sample(self, batch_size):
        xs, ys = self.M.sample(batch_size)
        return xs, ys

    def add_to_memory(self, xs, ys):
        if self.args.sample_add == "normal":
            self.M.add(xs, ys)
        elif self.args.sample_add == "lru":
            self.M.add_lru(xs, ys)
        elif self.args.sample_add == "rand":
            self.M.add_rand(xs, ys)
        elif self.args.sample_add == "knn":
            self.M.add_knn(xs, ys)
        elif self.args.sample_add == "knn_lru":
            self.M.add_knn_lru(xs, ys)
        else:
            raise Exception(
                "Unknown sample_add type; please choose one of "
                "['normal', 'lru', 'rand', 'knn', 'knn_lru']"
            )

    def test(self, xs_test, ys_test):
        # self.update_training_to_prediction()
        acc = self.session.run(self.accuracy,
                               feed_dict={
                                   self.x: xs_test,
                                   self.y: ys_test,
                                   self.memory_sample_batch: 0
                               })
        return acc

    @property
    def memory_length(self):
        return self.M.length
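
MbPA_KNN_Test is driven the same way; the visible differences are that its Memory is sized for the raw 784-dimensional inputs (self.x) rather than the embedding, and that it exposes a memory_length property. A compact hedged sketch with a hypothetical args object selecting the knn_lru insertion path:

import numpy as np
import tensorflow as tf

class Args:  # hypothetical: just the attributes MbPA_KNN_Test reads
    model_name = "mbpa_knn_test"
    learning_rate = 0.1
    memory_size = 5000
    sample_add = "knn_lru"   # add_to_memory() will call Memory.add_knn_lru

with tf.Session() as sess:
    model = MbPA_KNN_Test(sess, Args())
    xs = np.random.rand(16, 784).astype(np.float32)
    ys = np.eye(10)[np.random.randint(0, 10, size=16)].astype(np.float32)
    model.train(xs, ys, memory_sample_batch=0)
    model.add_to_memory(xs, ys)
    print("entries stored:", model.memory_length)        # wraps Memory.length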