Example #1
import datetime
import json
import random
import threading

import numpy as np
import tensorflow as tf

# NormalDistribution and ppo_loss_continuous are assumed to be defined elsewhere
# in this project; they are not TensorFlow built-ins.


class PPOModel:
    def __init__(self,
            num_states,
            num_actions=6,
            should_load_models=False,
            hidden_size=64,
            num_hidden_layers=2,
            ep_clip=0.2,
            gamma=0.99,
            lam=0.95,
            entropy_coeff=0.0,
            epochs=5,
            batch_size=16,
            learning_rate=3e-4,
            var=1.0,  # action-distribution variance; assumed default (var was undefined in the original)
            use_conv=False):
        self.training_json = "model_weights/training.json"
        self.num_states = num_states
        self.should_load_models = should_load_models
        self.num_actions = num_actions
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.num_hidden_layers = num_hidden_layers
        self.learning_rate = learning_rate
        self.epsilon_clip = ep_clip
        # Mirror the clip value into a module-level global, presumably so that the
        # external ppo_loss_continuous loss factory can read it.
        global epsilon_clip
        epsilon_clip = ep_clip
        self.distribution = NormalDistribution(num_actions=num_actions, var=var)
        self.use_conv = use_conv
        self.build_actor_and_critic()
        self.gamma = gamma
        self.lam = lam
        self.entropy_coeff = entropy_coeff
        self.epochs = epochs
        self.train_lock = threading.Lock()
        # Placeholder inputs for the actor's advantage/old-prediction heads when the
        # model is only being queried for an action.
        self.dummy_action = np.zeros((1, self.num_actions))
        self.dummy_value = np.zeros((1, 1))
        self.create_summary_writers()
        
    def build_actor_and_critic(self):
        self.build_actor()
        self.build_critic()        
        if self.should_load_models:
            self.load_model_weights()
        else:
            self.training_info = {"episode" : 0}
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)

    def build_actor(self):
        inputs = tf.keras.Input(shape=(self.num_states,))
        advantage = tf.keras.Input(shape=(1,))
        old_pred = tf.keras.Input(shape=(self.num_actions,))
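        # advantage and old_pred are never used by the network graph itself; they are extra
        # inputs that the ppo_loss_continuous closure captures at compile time (a common
        # Keras pattern for losses that need more than y_true and y_pred).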

        x = tf.keras.layers.Dense(self.hidden_size, activation="relu", kernel_initializer=tf.random_normal_initializer())(inputs)
        for _ in range(self.num_hidden_layers - 1):
            x = tf.keras.layers.Dense(self.hidden_size, activation="relu", kernel_initializer=tf.random_normal_initializer())(x)
        out_actor = tf.keras.layers.Dense(self.num_actions, kernel_initializer=tf.random_normal_initializer())(x)
        self.actor = tf.keras.models.Model(inputs=[inputs, advantage, old_pred], outputs=[out_actor])
        self.actor.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
                loss=[ppo_loss_continuous(
                    advantage=advantage,
                    old_pred=old_pred)],
                experimental_run_tf_function=False)
        self.actor.summary()

    def build_critic(self):
        inputs = tf.keras.Input(shape=(self.num_states,))
        x = tf.keras.layers.Dense(self.hidden_size, activation="relu", kernel_initializer=tf.random_normal_initializer())(inputs)
        for _ in range(self.num_hidden_layers - 1):
            x = tf.keras.layers.Dense(self.hidden_size, activation="relu", kernel_initializer=tf.random_normal_initializer())(x)
        out_critic = tf.keras.layers.Dense(1, kernel_initializer=tf.random_normal_initializer())(x)
        self.critic = tf.keras.models.Model(inputs=[inputs], outputs=[out_critic])
        self.critic.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate), loss="mse")
        self.critic.summary()

    def build_actor_conv(self):
        # Note: these convolutional variants (this and build_critic_conv) are never called by
        # build_actor_and_critic in this example and, as written, would need an image-shaped
        # input (an 8x8 kernel cannot fit a width-1 input) plus a Flatten before the final
        # Dense layer to produce flat outputs.
        inputs = tf.keras.Input(shape=(self.num_states, 1, 1))
        x = tf.keras.layers.Conv2D(filters=16, kernel_size=(8,8), strides=(4,4), padding="VALID", activation="relu")(inputs)
        x = tf.keras.layers.Conv2D(filters=32, kernel_size=(4,4), strides=(2,2), padding="VALID", activation="relu")(x)
        out_actor = tf.keras.layers.Dense(self.num_actions, activation="linear", kernel_initializer=tf.random_normal_initializer())(x)
        self.actor = tf.keras.models.Model(inputs=[inputs], outputs=[out_actor])

    def build_critic_conv(self):
        inputs = tf.keras.Input(shape=(self.num_states,1,1))
        x = tf.keras.layers.Conv2D(filters=16, kernel_size=(8,8), strides=(4,4), padding="VALID", activation="relu")(inputs)
        x = tf.keras.layers.Conv2D(filters=32, kernel_size=(4,4), strides=(2,2), padding="VALID", activation="relu")(x)
        out_critic = tf.keras.layers.Dense(1, activation="linear", kernel_initializer=tf.random_normal_initializer())(x)
        self.critic = tf.keras.models.Model(inputs=[inputs], outputs=[out_critic])

    def create_summary_writers(self):
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        actor_log_dir= "logs/fit/" + current_time + "/actor"
        critic_log_dir= "logs/fit/" + current_time + "/critic"
        returns_log_dir= "logs/fit/" + current_time + "/returns"
        self.actor_summary_writer = tf.summary.create_file_writer(actor_log_dir)
        self.critic_summary_writer = tf.summary.create_file_writer(critic_log_dir)
        self.returns_summary_writer = tf.summary.create_file_writer(returns_log_dir) 
    
    def next_action_and_value(self, observ):
        # The actor takes (observation, advantage, old prediction) inputs, so pass
        # placeholders for the two extra heads; the critic only takes the observation.
        self.distribution.mean = self.actor([observ, self.dummy_value, self.dummy_action])
        return self.distribution.sample(), self.critic(observ)
    

    def add_vtarg_and_adv(self, ep_dic):
        """
        Compute target value using TD(lambda) estimator, and advantage with GAE(lambda)
        """
        T = len(ep_dic["rewards"])
        ep_dic["values"] = np.append(ep_dic["values"], 0)
        ep_dic["adv"] = gaelam = np.empty(T, 'float32')
        lastgaelam = 0
        for t in range(T-1, -1, -1):
            nonterminal = 0 if t == T-1 else 1
            delta = ep_dic["rewards"][t] + self.gamma * ep_dic["values"][t+1] * nonterminal - ep_dic["values"][t]  # TD error
            gaelam[t] = lastgaelam = delta + self.gamma * self.lam * lastgaelam * nonterminal
        ep_dic["values"] = np.delete(ep_dic["values"], -1)
        ep_dic["tdlamret"] = ep_dic["adv"] + ep_dic["values"]
        ep_dic["adv"] = (ep_dic["adv"] - ep_dic["adv"].mean()) / ep_dic["adv"].std()
    
    def add_ret_and_adv(self, ep_dic):
        T = len(ep_dic["rewards"])
        ep_dic["adv"] = np.empty(T, 'float32')
        ep_dic["returns"] = np.empty(T, 'float32')
        for t in range(T-1, -1, -1):
            ep_dic["returns"][t] = ep_dic["rewards"][t]
            if t < T-1:
                ep_dic["returns"][t] += ep_dic["returns"][t+1] * self.gamma
        ep_dic["adv"] = ep_dic["returns"] - ep_dic["values"]
        #ep_dic["adv"] = (ep_dic["adv"] - ep_dic["adv"].mean()) / ep_dic["adv"].std()

    def train(self, ep_dic):
        self.add_vtarg_and_adv(ep_dic)
        with self.train_lock:
            print("Training")
            #print("Rewards: ", ep_dic["rewards"])
            
            #print("tdlamret: ", ep_dic["tdlamret"])
            #print("Values: ", ep_dic["values"])
            epoch_bonus = 0  # 5 if ep_dic["rewards"][-1] > 0 else 0
            
            #print("REWARD: ", ep_dic["rewards"][-1])
            #self.shuffle_ep_dic(ep_dic)
            print("Episode Return: ", ep_dic["episode_return"])
            # Each stored observation/mean/action carries a leading batch axis of 1;
            # collapse everything into flat (T, ...) arrays for Keras.
            observ_arr = np.array(ep_dic["observations"])
            observ_arr = np.reshape(observ_arr, (observ_arr.shape[0], observ_arr.shape[2]))
            ep_dic["adv"] = np.reshape(ep_dic["adv"], (ep_dic["adv"].shape[0], 1))
            ep_dic["means"] = np.array([mean.numpy()[0] for mean in ep_dic["means"]])
            ep_dic["actions"] = np.array([action.numpy()[0] for action in ep_dic["actions"]])
           
            actor_history = self.actor.fit(
                    [observ_arr, ep_dic["adv"], ep_dic["means"]],
                    [ep_dic["actions"]],
                    batch_size=self.batch_size,
                    epochs=self.epochs,
                    shuffle=True)
            critic_history = self.critic.fit(
                    observ_arr,
                    [ep_dic["tdlamret"]],
                    batch_size=self.batch_size,
                    epochs=self.epochs,
                    shuffle=True)
            
            with self.actor_summary_writer.as_default():
                tf.summary.scalar("loss", actor_history.history["loss"][-1], step=self.training_info["episode"])
            
            with self.critic_summary_writer.as_default():
                tf.summary.scalar("loss", critic_history.history["loss"][-1], step=self.training_info["episode"])
            
            with self.returns_summary_writer.as_default():
                tf.summary.scalar("Return", ep_dic["episode_return"], step=self.training_info["episode"])

            self.training_info["episode"] += 1
            self.save_model_weights()
            print("Done Training")
            return self.actor.get_weights(), self.critic.get_weights()

    def shuffle_ep_dic(self, ep_dic):
        seed = random.random()
        for k in ep_dic:
            if isinstance(ep_dic[k], list):
                random.seed(seed)
                random.shuffle(ep_dic[k])

    def value_loss(self, value, ret):
        return tf.reduce_mean(tf.square(ret - value))

    def ppo_loss(self, ep_dic, index):
        _, cur_val = self.next_action_and_value(ep_dic["observations"][index])

        old_distribution = NormalDistribution(mean=ep_dic["means"][index])
        kl_divergence = self.distribution.kl(old_distribution)
        entropy = self.distribution.entropy()
        mean_kl = tf.reduce_mean(kl_divergence)
        mean_entropy = tf.reduce_mean(entropy)
        policy_entropy_pen = -self.entropy_coeff * mean_entropy
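        # The next few lines implement the clipped PPO surrogate:
        #   ratio_t = pi_theta(a_t | s_t) / pi_theta_old(a_t | s_t)
        #   L_clip  = E[ min(ratio_t * A_t, clip(ratio_t, 1 - eps, 1 + eps) * A_t) ]
        # negated here because the optimizer minimizes, plus the entropy penalty above.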
        ratio = tf.exp(self.distribution.logp(ep_dic["actions"][index]) - old_distribution.logp(ep_dic["actions"][index]))
        surrogate_1 = ratio * ep_dic["adv"][index]
        surrogate_2 = tf.clip_by_value(ratio, 1.0 - self.epsilon_clip, 1.0 + self.epsilon_clip) * ep_dic["adv"][index]
        policy_surrogate = -tf.reduce_mean(tf.minimum(surrogate_1, surrogate_2))
        #value_fn_loss = tf.reduce_mean(tf.square(ep_dic["tdlamret"][index] - cur_val))
        
        #value_fn_loss = tf.reduce_mean(tf.square(tf.convert_to_tensor(ep_dic["returns"][index]) - cur_val[0][0]))

        total_loss = policy_surrogate + policy_entropy_pen
        return total_loss  # , value_fn_loss
    
    def save_models(self):
        if self.use_conv:
            self.actor.save("actor_model_conv.h5")
            self.critic.save("critic_model_conv.h5")
        else:
            self.actor.save("actor_model.h5")
            self.critic.save("critic_model.h5")
        with open(self.training_json, "w") as f:
            json.dump(self.training_info, f)

    def load_models(self):
        print("LOADING MODELS")
        if self.use_conv:
            self.actor = tf.keras.models.load_model("actor_model_conv.h5")
            self.critic = tf.keras.models.load_model("critic_model_conv.h5")
        else:
            self.actor = tf.keras.models.load_model("actor_model.h5")
            self.critic = tf.keras.models.load_model("critic_model.h5")
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        self.actor.summary()
        self.critic.summary()

        with open(self.training_json, "r") as f:
            self.training_info = json.load(f)

    def save_model_weights(self):
        self.actor.save_weights("model_weights/actor_model")
        self.critic.save_weights("model_weights/critic_model")
        with open(self.training_json, "w") as f:
            json.dump(self.training_info, f)


    def load_model_weights(self):
        print("LOADING MODEL")
        self.actor.load_weights("model_weights/actor_model")
        self.critic.load_weights("model_weights/critic_model")
        
        with open(self.training_json, "r") as f:
            self.training_info = json.load(f)
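
The actor above is compiled with ppo_loss_continuous, which is defined elsewhere in the project and not shown in this example. Purely as an illustrative sketch (not the project's actual implementation), a Keras loss factory of this shape for a fixed-variance Gaussian policy could look like the code below; epsilon_clip stands in for the module-level global that PPOModel.__init__ sets, and var is a hypothetical fixed variance.

import tensorflow as tf

epsilon_clip = 0.2  # normally mirrored from PPOModel.__init__ via its global
var = 1.0           # hypothetical fixed policy variance

def ppo_loss_continuous(advantage, old_pred):
    """Illustrative sketch of a clipped-surrogate Keras loss factory."""
    def loss(y_true, y_pred):
        # y_true: actions taken, y_pred: new policy means, old_pred: old policy means.
        # For a fixed-variance Gaussian the normalizing constants cancel in the ratio.
        logp_new = -0.5 * tf.reduce_sum(tf.square(y_true - y_pred) / var, axis=-1)
        logp_old = -0.5 * tf.reduce_sum(tf.square(y_true - old_pred) / var, axis=-1)
        ratio = tf.exp(logp_new - logp_old)
        adv = tf.squeeze(advantage, axis=-1)
        surrogate_1 = ratio * adv
        surrogate_2 = tf.clip_by_value(ratio, 1.0 - epsilon_clip, 1.0 + epsilon_clip) * adv
        return -tf.reduce_mean(tf.minimum(surrogate_1, surrogate_2))
    return loss

Because the extra advantage and old_pred tensors are captured by the closure, Keras only ever sees the usual (y_true, y_pred) loss signature.
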
Example #2
import json
import socket

import numpy as np
import tensorflow as tf

# NormalDistribution is assumed to be defined elsewhere in this project.


class AgentWorker:
    def __init__(self, ppo_model):
        self.ppo_model = ppo_model
        self.local_actor = tf.keras.models.clone_model(self.ppo_model.actor)
        self.local_critic = tf.keras.models.clone_model(self.ppo_model.critic)
        self.local_actor.set_weights(self.ppo_model.actor.get_weights())
        self.local_critic.set_weights(self.ppo_model.critic.get_weights())
        self.header_len = 8
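        # Wire protocol used below: every message in either direction is preceded by a
        # fixed 8-character ASCII header holding the payload length, left-padded with
        # zeros (e.g. "00000042"), followed by a JSON payload.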
        self.distribution = NormalDistribution()

    def start_training_agent(self, conn):
        with conn:
            self.set_should_explore(conn)
            while True:
                ep_dic = self.run_episode(conn)

                global_actor_weights, global_critic_weights = self.ppo_model.train(
                    ep_dic)
                self.local_actor.set_weights(global_actor_weights)
                self.local_critic.set_weights(global_critic_weights)

                end_msg = "0" * 3
                endLenStr = str(len(end_msg))
                endLenStr = (self.header_len -
                             len(endLenStr)) * "0" + endLenStr
                conn.send(endLenStr.encode())
                conn.send(end_msg.encode())

    def run_episode(self, conn):
        observs = []
        rewards = []
        val_preds = []
        actions = []
        means = []
        #prev_acts = []
        ep_ret = 0
        while True:
            # Read the fixed-size length header, then the JSON payload.
            data_header = conn.recv(self.header_len)
            if not data_header:
                break
            data_len = int(data_header.decode().rstrip("\x00"))
            #print(data_len)
            data = conn.recv(data_len, socket.MSG_WAITALL)

            if not data:
                break

            env_dic = json.loads(data)
            state_arr = np.array(env_dic["state"])
            state_arr = np.reshape(state_arr, (1, state_arr.shape[0]))
            observs.append(state_arr)
            rewards.append(env_dic["reward"])
            ep_ret += env_dic["reward"]

            #print(env_dic["reward"])
            action, val = self.next_action_and_value(state_arr)
            actions.append(action)
            #print("ACTION: ",  action)
            action = action.numpy()[0]
            # print("action: ", action)
            #print("value:", val)

            val_preds.append(val)
            means.append(self.distribution.mean)

            if env_dic["done"]:
                # Tell the game to restart the episode
                # conn.close()
                return {
                    "observations": observs,
                    "rewards": rewards,
                    "values": val_preds,
                    "actions": actions,
                    "means": means,
                    "episode_return": ep_ret,
                }
            else:
                # Send next action
                action = self.format_action(action.tolist())
                actionLenStr = str(len(action))
                actionLenStr = (self.header_len -
                                len(actionLenStr)) * "0" + actionLenStr
                conn.send(actionLenStr.encode())
                conn.send(action.encode())

    def next_action_and_value(self, observ):
        # The cloned actor keeps the three-input signature of the global actor, so feed
        # it the global model's placeholder advantage/old-prediction inputs.
        self.distribution.mean = self.local_actor(
            [observ, self.ppo_model.dummy_value, self.ppo_model.dummy_action])
        if self.should_explore:
            return self.distribution.sample(), self.local_critic(observ)
        else:
            return self.distribution.mean, self.local_critic(observ)

    def format_action(self, action):
        action_dic = {
            "vertical": action[0],
            "horizontal": action[1],
            "pitch": action[2],
            "yaw": action[3],
            "jump": action[4],
            "attack": action[5]
        }
        return json.dumps(action_dic)

    def set_should_explore(self, conn):
        data_header = conn.recv(self.header_len)
        if not data_header:
            return
        data_len = int(data_header.decode().rstrip("\x00"))
        data = conn.recv(data_len, socket.MSG_WAITALL)
        if not data:
            return
        explore_dic = json.loads(data)
        self.should_explore = explore_dic["explore"]
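
A minimal sketch of how the two examples might be wired together; the host, port, observation size, and one-thread-per-connection loop are assumptions rather than part of the original code.

import socket
import threading

ppo_model = PPOModel(num_states=100, num_actions=6)  # num_states=100 is a placeholder

server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server.bind(("0.0.0.0", 9999))  # assumed host/port
server.listen()

while True:
    conn, _addr = server.accept()
    worker = AgentWorker(ppo_model)
    # One worker thread per connected game client.
    threading.Thread(target=worker.start_training_agent,
                     args=(conn,), daemon=True).start()

Since PPOModel.train acquires train_lock, several AgentWorker threads can safely push episodes into the same global model.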