Example no. 1
import datetime
import random
import time

import gym
import numpy as np
import tensorflow as tf
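Both variants below also rely on project-specific components that are not shown here: the Actor and Critic wrappers (which expose .model, .target and .adam_optimizer), the Replay_Buffer, and the update_target_variables helper used to initialize and soft-update the target networks. The two sketches that follow are illustrative stand-ins only, with their interfaces inferred from how they are called in the classes below; the project's own implementations (for example tf2rl's update_target_variables) should be preferred when available.

def update_target_variables(target_variables, source_variables, tau=1.0):
    # Minimal stand-in (assumed semantics): Polyak averaging
    # target <- tau * source + (1 - tau) * target; tau=1.0 is a hard copy,
    # as used when the target networks are first created.
    for target_var, source_var in zip(target_variables, source_variables):
        target_var.assign(tau * source_var + (1.0 - tau) * target_var)

The replay buffer is used through add_experience(state, action, reward, next_state, done) and sample_batch(state_dim, action_dim); a minimal uniform-sampling sketch matching that interface (again an assumption, not the original implementation) is:

from collections import deque

class Replay_Buffer:
    def __init__(self, max_buffer_size, batch_size, dtype='float32'):
        self.batch_size = batch_size
        self.dtype = dtype
        self.storage = deque(maxlen=max_buffer_size)  # bounded FIFO of transitions

    def add_experience(self, state, action, reward, next_state, done):
        self.storage.append((state, action, reward, next_state, done))

    def sample_batch(self, state_dim, action_dim):
        # Uniformly sample a mini-batch and pack it into dense arrays.
        batch = random.sample(list(self.storage),
                              min(self.batch_size, len(self.storage)))
        states = np.array([e[0] for e in batch], dtype=self.dtype).reshape(-1, state_dim)
        actions = np.array([e[1] for e in batch], dtype=self.dtype).reshape(-1, action_dim)
        rewards = np.array([e[2] for e in batch], dtype=self.dtype).reshape(-1, 1)
        next_states = np.array([e[3] for e in batch], dtype=self.dtype).reshape(-1, state_dim)
        dones = np.array([e[4] for e in batch], dtype=self.dtype).reshape(-1, 1)
        return states, actions, rewards, next_states, dones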
class DDPG:
    def __init__(self,
                 env,
                 agentId,
                 logger_folder,
                 max_action=1,
                 max_buffer_size=1000000,
                 batch_size=200,
                 tow=0.005,
                 discount_factor=0.99,
                 actor_learning_rate=0.001,
                 critic_learning_rate=0.001,
                 dtype='float32',
                 timestamp=10000,
                 max_epsiode_steps=1,
                 n_warmup=200,
                 sigma=0.1):

        self.logger_folder = logger_folder
        self.max_buffer_size = max_buffer_size
        self.batch_size = batch_size
        self.max_epsiode_steps = max_epsiode_steps
        self.timestamp = timestamp

        self.n_warmup = n_warmup

        # 80% of the time is used for exploration
        self.dflt_dtype = dtype

        self.sigma = sigma
        self.tow = tow
        self.gamma = discount_factor
        self.max_action = max_action
        np.random.seed(0)
        random.seed(0)

        self.env = env
        self.agentId = agentId

        if isinstance(self.env, gym.Env):
            self.state_dim = env.observation_space.shape[
                0] if env.observation_space.shape != tuple() else 1
            self.action_dim = env.action_space.shape[
                0] if env.action_space.shape != tuple() else 1
        else:
            self.state_dim = env.state_dim
            self.action_dim = env.action_dim

        self.actor = Actor(self.state_dim, self.action_dim,
                           actor_learning_rate, max_action)

        self.critic = Critic(self.state_dim, self.action_dim,
                             critic_learning_rate)

        update_target_variables(self.actor.target.weights,
                                self.actor.model.weights,
                                tau=1.0)

        update_target_variables(self.critic.target.weights,
                                self.critic.model.weights,
                                tau=1.0)

        self.device = '/cpu:0'

    def get_action(self, state):
        """
		Predicting the action with the actor model from the state
		C2: ||Wk||<=1 so we always divide by its norm so ||Wk||==1
		"""
        is_single_state = len(state.shape) == 1

        state = np.expand_dims(state, axis=0).astype(
            np.float32) if is_single_state else state

        action = self._get_action_body(tf.constant(state), self.sigma)

        return action.numpy()[0] if is_single_state else action.numpy()

    @tf.function
    def _get_action_body(self, state, sigma):
        with tf.device(self.device):
            action = self.actor.model(state)
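            # Gaussian exploration noise; the result is clipped to [0, max_action]
            # and rescaled so that its norm does not exceed 1 (constraint C2).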
            action += tf.random.normal(shape=action.shape,
                                       mean=0,
                                       stddev=sigma,
                                       dtype=tf.float32)

            return tf.clip_by_norm(
                tf.clip_by_value(action, 0, self.max_action), 1)

    def train_step(self, states_batch, actions_batch, rewards_batch,
                   next_states_batch, done_batch):
        """
		Performing the update of the models (Actor and Critic) using the gradients on batches with size
		by default == 64
		"""
        actor_loss, critic_loss, td_error = self._train_step_body(
            states_batch, actions_batch, next_states_batch, rewards_batch,
            done_batch)

        if actor_loss is not None:
            tf.summary.scalar(name='Training/actor_loss', data=actor_loss)

        tf.summary.scalar(name='Training/critic_loss', data=critic_loss)

        return td_error

    @tf.function
    def _train_step_body(self, states, actions, next_states, rewards, dones):
        with tf.device(self.device):
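            # Critic update: minimize the mean squared TD error, i.e. regress
            # Q(s, a) towards the bootstrapped target computed in _compute_td_error_body.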
            with tf.GradientTape() as tape:
                td_errors = self._compute_td_error_body(
                    states, actions, next_states, rewards, dones)
                critic_loss = tf.reduce_mean(tf.square(td_errors))

            critic_grad = tape.gradient(critic_loss,
                                        self.critic.model.trainable_weights)
            self.critic.adam_optimizer.apply_gradients(
                zip(critic_grad, self.critic.model.trainable_weights))

            with tf.GradientTape() as tape:
                # Actor update: maximize Q(s, pi(s)) by minimizing -E[Q(s, pi(s))].
                action = self.actor.model(states)
                actor_loss = -tf.reduce_mean(
                    self.critic.model([states, action]))

            actor_grad = tape.gradient(actor_loss,
                                       self.actor.model.trainable_variables)
            self.actor.adam_optimizer.apply_gradients(
                zip(actor_grad, self.actor.model.trainable_variables))

            update_target_variables(self.actor.target.weights,
                                    self.actor.model.weights,
                                    tau=self.tow)

            update_target_variables(self.critic.target.weights,
                                    self.critic.model.weights,
                                    tau=self.tow)

            return actor_loss, critic_loss, td_errors

    @tf.function
    def _compute_td_error_body(self, states, actions, next_states, rewards,
                               dones):
        with tf.device(self.device):
            not_dones = 1. - dones
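            # TD target: y = r + gamma * (1 - done) * Q_target(s', pi_target(s'));
            # the TD error is y - Q(s, a).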
            target_Q = self.critic.target(
                [next_states, self.actor.target(next_states)])
            target_Q = rewards + (not_dones * self.gamma * target_Q)
            target_Q = tf.stop_gradient(target_Q)
            current_Q = self.critic.model([states, actions])
            td_errors = target_Q - current_Q
        return td_errors

    def train(self, action_queue, matrix_queue):

        total_steps = 0
        tf.summary.experimental.set_step(total_steps)
        episode_steps = 0
        episode_return = 0
        episode_start_time = time.perf_counter()
        n_episode = 0

        writer = tf.summary.create_file_writer('logs/' + self.logger_folder +
                                               '/' + str(self.agentId))
        writer.set_as_default()

        replay_buffer = Replay_Buffer(self.max_buffer_size, self.batch_size)

        old_state = self.env.reset()
        max_reward = 0
        best_action = None  # best action found so far (sent to the coordinator at episode end)

        while total_steps < self.timestamp:
            if total_steps < self.n_warmup:
                action = np.random.uniform(low=0, high=1, size=self.action_dim)
                action = action / np.linalg.norm(action)

            else:
                action = self.get_action(old_state)

#			action = normalize(action.reshape(self.env.M, self.env.K)).reshape(self.action_dim)
            new_state, reward, done, action = self.env.step(
                action, self.agentId)
            if reward > max_reward:
                best_action = action
                max_reward = reward

            episode_steps += 1
            episode_return += reward
            total_steps += 1
            tf.summary.experimental.set_step(total_steps)

            replay_buffer.add_experience(old_state, action, reward, new_state,
                                         done)

            done_flag = done

            old_state = new_state

            tf.summary.scalar(name="Episode/Reward", data=reward)

            if done or episode_steps == self.max_epsiode_steps:
                fps = episode_steps / (time.perf_counter() - episode_start_time)
                print(
                    "Agent: {5: 3}, Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                    .format(n_episode, total_steps, episode_steps,
                            episode_return, fps, self.agentId))

                n_episode += 1

                episode_steps = 0
                episode_return = 0
                episode_start_time = time.perf_counter()

                action_queue.put(best_action)
                W = matrix_queue.get()
                self.env.set_W(W)

            if total_steps < self.n_warmup:
                continue

            states_batch, actions_batch, rewards_batch, next_states_batch, done_batch = replay_buffer.sample_batch(
                self.state_dim, self.action_dim)

            self.train_step(states_batch, actions_batch, rewards_batch,
                            next_states_batch, done_batch)
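A minimal usage sketch for the queue-based agent above. The environment factory, agent id and logger folder are placeholders, and a separate coordinator process is assumed to consume action_queue and feed matrix_queue (the class itself only shows env.set_W(W) and action_queue.put(best_action)); any name below that does not appear above is purely illustrative.

# Illustrative wiring of one queue-based agent (all names below are placeholders).
from multiprocessing import Queue

action_queue = Queue()   # agent -> coordinator: best action found in the episode
matrix_queue = Queue()   # coordinator -> agent: updated matrix W, applied via env.set_W(W)

env = make_beamforming_env()                        # hypothetical environment factory
agent = DDPG(env, agentId=0, logger_folder='run1')
agent.train(action_queue, matrix_queue)             # blocks on matrix_queue.get() at each episode end

The second variant below reuses the DDPG name but runs standalone, without the coordinator queues, and returns the trained actor and critic weights from train().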
class DDPG:
    def __init__(self,
                 env,
                 actor_weights=None,
                 critic_weights=None,
                 max_action=1,
                 max_buffer_size=1000000,
                 batch_size=200,
                 tow=0.005,
                 discount_factor=0.99,
                 actor_learning_rate=0.001,
                 critic_learning_rate=0.001,
                 dtype='float32',
                 timestamp=100000,
                 max_epsiode_steps=1000,
                 n_warmup=200,
                 sigma=0.1):

        self.max_epsiode_steps = max_epsiode_steps
        self.timestamp = timestamp

        self.n_warmup = n_warmup

        # 80% of the time is used for exploration
        self.dflt_dtype = dtype

        self.max_buffer_size = max_buffer_size
        self.batch_size = batch_size

        self.sigma = sigma
        self.tow = tow
        self.gamma = discount_factor
        self.max_action = max_action
        np.random.seed(0)
        random.seed(0)

        self.env = env

        if isinstance(self.env, gym.Env):
            self.state_dim = env.observation_space.shape[
                0] if env.observation_space.shape != tuple() else 1
            self.action_dim = env.action_space.shape[
                0] if env.action_space.shape != tuple() else 1
        else:
            self.state_dim = env.state_dim
            self.action_dim = env.action_dim

        self.actor = Actor(self.state_dim, self.action_dim,
                           actor_learning_rate, max_action)

        self.critic = Critic(self.state_dim, self.action_dim,
                             critic_learning_rate)

        #		if actor_weights:
        #			update_target_variables(
        #				self.actor.model.weights, actor_weights, tau=1.0)

        #		if critic_weights:
        #			update_target_variables(
        #				self.critic.model.weights, critic_weights, tau=1.0)

        update_target_variables(self.actor.target.weights,
                                self.actor.model.weights,
                                tau=1.0)

        update_target_variables(self.critic.target.weights,
                                self.critic.model.weights,
                                tau=1.0)

        # Setting the buffer
        self.buffer = Replay_Buffer(self.max_buffer_size, self.batch_size,
                                    self.dflt_dtype)

        self.device = '/gpu:1'

    def get_action(self, state):
        """
		Predicting the action with the actor model from the state
		C2: ||Wk||<=1 so we always divide by its norm so ||Wk||==1
		"""
        is_single_state = len(state.shape) == 1

        state = np.expand_dims(state, axis=0).astype(
            np.float32) if is_single_state else state

        action = self._get_action_body(tf.constant(state), self.sigma)
        return action.numpy()[0] if is_single_state else action.numpy()

    @tf.function
    def _get_action_body(self, state, sigma):
        with tf.device(self.device):
            action = self.actor.model(state)
            action += tf.random.normal(shape=action.shape,
                                       mean=0,
                                       stddev=sigma,
                                       dtype=tf.float32)

            return tf.clip_by_value(action, 0, self.max_action)

    def train_step(self, states_batch, actions_batch, rewards_batch,
                   next_states_batch, done_batch):
        """
		Performing the update of the models (Actor and Critic) using the gradients on batches with size
		by default == 64
		"""
        actor_loss, critic_loss, td_error = self._train_step_body(
            states_batch, actions_batch, next_states_batch, rewards_batch,
            done_batch)

        if actor_loss is not None:
            tf.summary.scalar(name='Training/actor_loss', data=actor_loss)

        tf.summary.scalar(name='Training/critic_loss', data=critic_loss)

        return td_error

    @tf.function
    def _train_step_body(self, states, actions, next_states, rewards, dones):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                td_errors = self._compute_td_error_body(
                    states, actions, next_states, rewards, dones)
                critic_loss = tf.reduce_mean(tf.square(td_errors))

            critic_grad = tape.gradient(critic_loss,
                                        self.critic.model.trainable_weights)
            self.critic.adam_optimizer.apply_gradients(
                zip(critic_grad, self.critic.model.trainable_weights))

            with tf.GradientTape() as tape:
                action = self.actor.model(states)
                actor_loss = -tf.reduce_mean(
                    self.critic.model([states, action]))

            actor_grad = tape.gradient(actor_loss,
                                       self.actor.model.trainable_variables)
            self.actor.adam_optimizer.apply_gradients(
                zip(actor_grad, self.actor.model.trainable_variables))

            update_target_variables(self.actor.target.weights,
                                    self.actor.model.weights,
                                    tau=self.tow)

            update_target_variables(self.critic.target.weights,
                                    self.critic.model.weights,
                                    tau=self.tow)

            return actor_loss, critic_loss, td_errors

    @tf.function
    def _compute_td_error_body(self, states, actions, next_states, rewards,
                               dones):
        with tf.device(self.device):
            not_dones = 1. - dones
            target_Q = self.critic.target(
                [next_states, self.actor.target(next_states)])
            target_Q = rewards + (not_dones * self.gamma * target_Q)
            target_Q = tf.stop_gradient(target_Q)
            current_Q = self.critic.model([states, actions])
            td_errors = target_Q - current_Q
        return td_errors

    def train(self):

        total_steps = 0
        tf.summary.experimental.set_step(total_steps)
        episode_steps = 0
        episode_return = 0
        episode_start_time = time.perf_counter()
        n_episode = 0

        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        train_log_dir = 'logs/' + current_time
        #		train_log_dir = 'logs/gradient_tape/' + "AP="+str(self.env.M)+";Us="+str(self.env.K)+";Pr="+str(self.env.transmission_power)
        writer = tf.summary.create_file_writer(train_log_dir)
        writer.set_as_default()

        old_state = self.env.reset()
        max_reward_episode = 0
        start_state = old_state

        last_start = old_state

        fps = 0
        while total_steps < self.timestamp:
            if total_steps < self.n_warmup:
                action = np.random.uniform(low=0, high=1, size=self.action_dim)
#				action = (action.reshape(15,5)/np.linalg.norm(action.reshape(15,5),axis=1).reshape(15,1)).reshape(self.action_dim)

            else:
                action = self.get_action(old_state)

#			action = normalize(action.reshape(self.env.M, self.env.K)).reshape(self.action_dim)
            new_state, reward, done, _ = self.env.step(action)
            episode_steps += 1
            episode_return += reward
            total_steps += 1
            tf.summary.experimental.set_step(total_steps)

            self.buffer.add_experience(old_state, action, reward, new_state,
                                       done)

            done_flag = done

            old_state = new_state

            tf.summary.scalar(name="Episode/Reward", data=reward)

            if done or episode_steps == self.max_epsiode_steps:
                fps = episode_steps / (time.perf_counter() -
                                       episode_start_time)
                print(
                    "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}"
                    .format(n_episode, total_steps, episode_steps,
                            episode_return, fps))

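                # Restart heuristic: if this episode improved on the best return so far,
                # keep going from the current state (and remember it); otherwise roll the
                # starting state back to the previous episode's start.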
                if episode_return > max_reward_episode:
                    print("Start From Old")
                    max_reward_episode = episode_return
                    start_state = old_state
                else:
                    old_state = last_start

                last_start = old_state

                #				old_state = self.env.reset()
                episode_start_time = time.perf_counter()

                n_episode += 1

                episode_steps = 0
                episode_return = 0

            if total_steps < self.n_warmup:
                continue

            states_batch, actions_batch, rewards_batch, next_states_batch, done_batch = self.buffer.sample_batch(
                self.state_dim, self.action_dim)

            self.train_step(states_batch, actions_batch, rewards_batch,
                            next_states_batch, done_batch)

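        # Final greedy rollout: run the learned policy without exploration noise,
        # tracking the best reward seen and the time at which it was reached.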
        state = self.env.reset()
        state = np.expand_dims(state, axis=0).astype(np.float32)
        step = 0
        max_reward = 0
        s = time.time()
        while step < 1000:
            step += 1
            action = self.actor.model(state)
            # The actor output is batched; pass a single action to env.step as during training.
            state, reward, _, _ = self.env.step(action.numpy()[0])
            if reward > max_reward:
                max_reward = reward
                fps = time.time() - s
            state = np.expand_dims(state, axis=0).astype(np.float32)
            print(
                "Steps: {0: 7} Reward: {1: 5.4f} time to converge: {2: 3.7f}".format(
                    step, reward, fps))

        return self.actor.model.weights, self.critic.model.weights
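A similarly illustrative usage sketch for this standalone variant, using a standard Gym environment as a stand-in; the environment name and hyperparameters are placeholders, and the [0, max_action] clipping in _get_action_body (as well as the '/gpu:1' device) may need adapting to the actual setup.

# Illustrative only: train the standalone variant and collect the learned weights.
env = gym.make('Pendulum-v1')          # placeholder environment
agent = DDPG(env,
             batch_size=200,
             timestamp=100000,
             max_epsiode_steps=1000,
             n_warmup=200,
             sigma=0.1)
actor_weights, critic_weights = agent.train()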