Example #1
    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, num_actions = get_env_info(
            self.env_id)

        # seeding
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        if env_continuous:
            self.policy_net = Policy(num_states, num_actions).to(device)
        else:
            self.policy_net = DiscretePolicy(num_states,
                                             num_actions).to(device)

        self.value_net = Value(num_states).to(device)

        self.ac_net = Actor_Critic(self.policy_net, self.value_net).to(device)

        self.running_state = ZFilter((num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_a2c.p".format(self.env_id))
            self.ac_net, self.running_state = pickle.load(
                open('{}/{}_a2c.p'.format(self.model_path, self.env_id), "rb"))

        self.collector = MemoryCollector(self.env,
                                         self.ac_net,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        self.optimizer_ac = optim.Adam(self.ac_net.parameters(), lr=self.lr_ac)
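The ZFilter used here (and in most of the examples below) is a running-state normalizer: it tracks an online mean and standard deviation of the observations and clips the normalized result to ±5. A minimal sketch of such a filter, assuming Welford-style updates (the class name and fields are illustrative, not the repo's actual implementation):

import numpy as np

class RunningStateFilter:
    """Illustrative ZFilter-style observation normalizer (sketch)."""

    def __init__(self, shape, clip=5.0):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)   # sum of squared deviations from the mean
        self.clip = clip
        self.fix = False            # freeze statistics at evaluation time

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        if not self.fix:
            # Welford's online update of mean and variance
            self.n += 1
            delta = x - self.mean
            self.mean += delta / self.n
            self.m2 += delta * (x - self.mean)
        std = np.sqrt(self.m2 / max(self.n - 1, 1)) + 1e-8
        return np.clip((x - self.mean) / std, -self.clip, self.clip)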
Example #2
    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, num_actions = get_env_info(
            self.env_id)
        tf.keras.backend.set_floatx('float64')
        # seeding
        np.random.seed(self.seed)
        tf.random.set_seed(self.seed)
        self.env.seed(self.seed)

        if env_continuous:
            self.policy_net = Policy(num_states, num_actions)  # current policy
        else:
            self.policy_net = DiscretePolicy(num_states, num_actions)

        self.running_state = ZFilter((num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_reinforce_tf2.p".format(self.env_id))
            self.running_state = pickle.load(
                open(
                    '{}/{}_reinforce_tf2.p'.format(self.model_path,
                                                   self.env_id), "rb"))
            self.policy_net.load_weights("{}/{}_reinforce_tf2".format(
                self.model_path, self.env_id))

        self.collector = MemoryCollector(self.env,
                                         self.policy_net,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        self.optimizer_p = optim.Adam(lr=self.lr_p, clipnorm=20)
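Unlike the PyTorch examples, the TF2 variants pickle only the running-state filter and keep the network weights in TensorFlow's own checkpoint format, since Keras models do not pickle reliably. A hypothetical save-side counterpart to the loading branch above (the method name and paths are assumptions that simply mirror the load convention):

    def _save_model(self, save_path):
        """Hypothetical counterpart of the loading branch above (sketch)."""
        # pickle only the running-state filter ...
        with open('{}/{}_reinforce_tf2.p'.format(save_path, self.env_id), 'wb') as f:
            pickle.dump(self.running_state, f)
        # ... and store the Keras weights separately
        self.policy_net.save_weights('{}/{}_reinforce_tf2'.format(save_path, self.env_id))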
Example #3
    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, self.num_actions = get_env_info(
            self.env_id)
        assert not env_continuous, "DQN is only applicable to discrete action spaces!"

        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        # initialize networks
        self.value_net = QNet_dqn(num_states, self.num_actions).to(device)
        self.value_net_target = QNet_dqn(num_states,
                                         self.num_actions).to(device)

        self.running_state = ZFilter((num_states, ), clip=5)

        # load model if necessary
        if self.model_path:
            print("Loading Saved Model {}_dqn.p".format(self.env_id))
            self.value_net, self.running_state = pickle.load(
                open('{}/{}_dqn.p'.format(self.model_path, self.env_id), "rb"))

        self.value_net_target.load_state_dict(self.value_net.state_dict())

        self.optimizer = optim.Adam(self.value_net.parameters(), lr=self.lr_q)
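The target network created here is normally kept in sync with the online Q-network during training by a periodic hard copy; a minimal sketch of that step (update_step and target_update_interval are assumed names):

        # inside the training loop (sketch): periodic hard update of the target net
        if update_step % target_update_interval == 0:
            self.value_net_target.load_state_dict(self.value_net.state_dict())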
Example #4
    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, self.num_actions = get_env_info(
            self.env_id)
        assert not env_continuous, "Double DQN is only applicable to discrete action spaces!"

        tf.keras.backend.set_floatx('float64')

        # seeding
        np.random.seed(self.seed)
        tf.random.set_seed(self.seed)
        self.env.seed(self.seed)

        # initialize networks
        self.value_net = QNet_dqn(num_states, self.num_actions)
        self.value_net_target = QNet_dqn(num_states, self.num_actions)
        self.running_state = ZFilter((num_states, ), clip=5)

        # load model if necessary
        if self.model_path:
            print("Loading Saved Model {}_double_dqn_tf2.p".format(
                self.env_id))
            self.running_state = pickle.load(
                open(
                    '{}/{}_double_dqn_tf2.p'.format(self.model_path,
                                                    self.env_id), "rb"))
            self.value_net.load_weights("{}/{}_double_dqn_tf2".format(
                self.model_path, self.env_id))

        self.value_net_target.set_weights(self.value_net.get_weights())

        self.optimizer = optim.Adam(lr=self.lr_q)
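What makes this a Double DQN rather than the plain DQN of Example #3 is only how the bootstrap target is computed: the online network selects the greedy next action and the target network evaluates it. A hedged TF2 sketch of that target, assuming a sampled batch of next_states, rewards and dones and a discount self.gamma:

        # sketch of the Double DQN target for a sampled batch
        q_next_online = self.value_net(next_states)            # [batch, num_actions]
        next_actions = tf.argmax(q_next_online, axis=1)        # action choice: online net
        q_next_target = self.value_net_target(next_states)     # action value: target net
        q_selected = tf.gather(q_next_target, next_actions, axis=1, batch_dims=1)
        targets = rewards + self.gamma * (1.0 - dones) * q_selected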
Example #5
    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, self.num_actions = get_env_info(self.env_id)
        assert env_continuous, "SAC is only applicable to continuous action spaces!"

        self.action_low, self.action_high = self.env.action_space.low[0], self.env.action_space.high[0]
        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(num_states, self.num_actions, max_action=self.action_high).double().to(device)

        self.value_net = Value(num_states).double().to(device)
        self.value_net_target = Value(num_states).double().to(device)

        self.q_net_1 = QValue(num_states, self.num_actions).double().to(device)
        self.q_net_2 = QValue(num_states, self.num_actions).double().to(device)

        self.running_state = ZFilter((num_states,), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_sac.p".format(self.env_id))
            self.policy_net, self.value_net, self.q_net_1, self.q_net_2, self.running_state \
                = pickle.load(open('{}/{}_sac.p'.format(self.model_path, self.env_id), "rb"))

        self.value_net_target.load_state_dict(self.value_net.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(), lr=self.lr_v)
        self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(), lr=self.lr_q)
        self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(), lr=self.lr_q)
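During training the value target network initialized above is usually tracked with Polyak averaging rather than hard copies; a minimal sketch, assuming a smoothing coefficient self.tau:

        # soft (Polyak) update of the value target network (sketch)
        with torch.no_grad():
            for target_param, param in zip(self.value_net_target.parameters(),
                                           self.value_net.parameters()):
                target_param.data.mul_(1.0 - self.tau)
                target_param.data.add_(self.tau * param.data)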
Example #6
def main(env_id, n_trajs, model_path, data_path, render, seed):
    """
    Collect trajectories from pre-trained models by PPO
    """
    env, _, num_states, num_actions = get_env_info(env_id)

    # seed
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    states, actions, rewards, ep_rewards = [], [], [], []

    model = pickle.load(open(model_path, 'rb'))
    model.running_state.fix = True
    for i_iter in range(1, n_trajs + 1):

        state = env.reset()
        ep_reward = 0
        n_step = 0

        while True:
            if render:
                env.render()
            normalized_state = model.running_state(state)
            action, _ = model.choose_action(normalized_state)
            action = action.cpu().numpy()[0]
            next_state, reward, done, _ = env.step(action)

            ep_reward += reward
            n_step += 1

            # store the state the action was taken in, not the successor returned by env.step
            states.append(state)
            actions.append(action)
            rewards.append(reward)

            if done:
                ep_rewards.append(ep_reward)
                print(
                    f"Iter: {i_iter}, step: {n_step}, episode Reward: {ep_reward}"
                )
                break
            state = next_state

    env.close()

    states = np.r_[states].reshape((-1, num_states))
    actions = np.r_[actions].reshape((-1, num_actions))
    rewards = np.r_[rewards].reshape((-1, 1))
    ep_rewards = np.r_[ep_rewards].reshape((n_trajs, -1))

    numpy_dict = {
        'state': states,
        'action': actions,
        'reward': rewards,
        'ep_reward': ep_rewards,
    }  # type: Dict[str, np.ndarray]

    if data_path is not None:
        np.savez(f"{data_path}/{env_id}.npz", **numpy_dict)
Example #7
    def _init_model(self):
        # seeding
        seed = self.config["train"]["general"]["seed"]
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.env, env_continuous, num_states, num_actions = get_env_info(
            self.env_id)

        # check env
        assert num_states == self.expert_dataset.num_states and num_actions == self.expert_dataset.num_actions, \
            "Expert dataset and environment must have matching state/action dimensions"

        dim_dict = {"dim_state": num_states, "dim_action": num_actions}

        self.config["value"].update(dim_dict)
        self.config["policy"].update(dim_dict)
        self.config["discriminator"].update(dim_dict)

        self.value = Value(dim_state=self.config["value"]["dim_state"],
                           dim_hidden=self.config["value"]["dim_hidden"],
                           activation=resolve_activate_function(
                               self.config["value"]["activation"]))
        self.policy = Policy(config=self.config["policy"])

        self.discriminator = Discriminator(
            dim_state=self.config["discriminator"]["dim_state"],
            dim_action=self.config["discriminator"]["dim_action"],
            dim_hidden=self.config["discriminator"]["dim_hidden"],
            activation=resolve_activate_function(
                self.config["discriminator"]["activation"]))

        self.discriminator_func = nn.BCELoss()
        self.running_state = None

        self.collector = MemoryCollector(self.env,
                                         self.policy,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        print("Model Structure")
        print(self.policy)
        print(self.value)
        print(self.discriminator)
        print()

        self.optimizer_policy = optim.Adam(
            self.policy.parameters(),
            lr=self.config["policy"]["learning_rate"])
        self.optimizer_value = optim.Adam(
            self.value.parameters(), lr=self.config["value"]["learning_rate"])
        self.optimizer_discriminator = optim.Adam(
            self.discriminator.parameters(),
            lr=self.config["discriminator"]["learning_rate"])

        to_device(self.value, self.policy, self.discriminator,
                  self.discriminator_func)
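The nn.BCELoss chosen above is the usual adversarial imitation-learning discriminator objective: expert state-action pairs are pushed toward label 1 and pairs generated by the current policy toward label 0. A hedged sketch of one discriminator update (the batch names, label convention, and the discriminator's call signature are assumptions):

        # sketch of one discriminator step
        expert_prob = self.discriminator(expert_states, expert_actions)
        policy_prob = self.discriminator(gen_states, gen_actions)
        d_loss = self.discriminator_func(expert_prob, torch.ones_like(expert_prob)) + \
                 self.discriminator_func(policy_prob, torch.zeros_like(policy_prob))
        self.optimizer_discriminator.zero_grad()
        d_loss.backward()
        self.optimizer_discriminator.step()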
Example #8
    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, self.num_actions = get_env_info(
            self.env_id)
        assert env_continuous, "SAC is only applicable to continuous action spaces!"

        self.action_low, self.action_high = self.env.action_space.low[
            0], self.env.action_space.high[0]
        self.target_entropy = -np.prod(self.env.action_space.shape)
        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(num_states,
                                 self.num_actions,
                                 max_action=self.action_high,
                                 use_sac=True).double().to(device)

        self.q_net_1 = QValue(num_states, self.num_actions).double().to(device)
        self.q_net_target_1 = QValue(num_states,
                                     self.num_actions).double().to(device)
        self.q_net_2 = QValue(num_states, self.num_actions).double().to(device)
        self.q_net_target_2 = QValue(num_states,
                                     self.num_actions).double().to(device)

        # entropy temperature alpha: initialized to exp(0) = 1 and learned jointly
        self.alpha = torch.exp(torch.zeros(
            1, device=device).double()).requires_grad_()

        self.running_state = ZFilter((num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_sac_alpha.p".format(self.env_id))
            self.policy_net, self.q_net_1, self.q_net_2, self.running_state \
                = pickle.load(open('{}/{}_sac_alpha.p'.format(self.model_path, self.env_id), "rb"))

        self.q_net_target_1.load_state_dict(self.q_net_1.state_dict())
        self.q_net_target_2.load_state_dict(self.q_net_2.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_a = optim.Adam([self.alpha], lr=self.lr_a)
        self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(),
                                        lr=self.lr_q)
        self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(),
                                        lr=self.lr_q)
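The learnable temperature self.alpha and self.target_entropy set up here feed the standard SAC temperature loss, which raises alpha when the policy's entropy falls below the target and lowers it otherwise; a minimal sketch of that update (log_prob, the batch of action log-probabilities, is an assumed name):

        # sketch of the entropy-temperature update
        alpha_loss = -(self.alpha * (log_prob + self.target_entropy).detach()).mean()
        self.optimizer_a.zero_grad()
        alpha_loss.backward()
        self.optimizer_a.step()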
Example #9
    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, num_actions = get_env_info(
            self.env_id
        )

        tf.keras.backend.set_floatx("float64")

        # seeding
        np.random.seed(self.seed)
        tf.random.set_seed(self.seed)
        self.env.seed(self.seed)

        if env_continuous:
            self.policy_net = Policy(num_states, num_actions)
        else:
            self.policy_net = DiscretePolicy(num_states, num_actions)

        self.value_net = Value(num_states, l2_reg=1e-3)
        self.running_state = ZFilter((num_states,), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_trpo_tf2.p".format(self.env_id))
            self.running_state = pickle.load(
                open(
                    "{}/{}_trpo_tf2.p".format(self.model_path, self.env_id),
                    "rb",
                )
            )
            self.policy_net.load_weights(
                "{}/{}_trpo_tf2_p".format(self.model_path, self.env_id)
            )
            self.value_net.load_weights(
                "{}/{}_trpo_tf2_v".format(self.model_path, self.env_id)
            )

        self.collector = MemoryCollector(
            self.env,
            self.policy_net,
            render=self.render,
            running_state=self.running_state,
            num_process=self.num_process,
        )

        self.optimizer_v = optim.Adam(lr=self.lr_v)
Example #10
def main(env_id, n_trajs, model_path, data_path, render, seed, obs_type):
    """
    Collect trajectories from pre-trained models by PPO
    """
    if data_path is not None:
        check_path(data_path)

    env, _, num_states, num_actions = get_env_info(env_id)

    # seed
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    model = pickle.load(open(model_path, 'rb'))
    model.running_state.fix = True
    states, actions, rewards, dones, next_states = [], [], [], [], []

    for i_iter in range(1, n_trajs + 1):
        state = env.reset()
        ep_reward = 0
        n_step = 0

        ep_states, ep_actions, ep_rewards, ep_dones, ep_next_states = [], [], [], [], []
        while True:
            if render:
                env.render()
            normalized_state = model.running_state(state)
            action = model.choose_action(normalized_state)
            next_state, reward, done, _ = env.step(action)
            normalized_next_state = model.running_state(next_state)

            ep_reward += reward
            n_step += 1

            ep_states.append(state if obs_type == 0 else normalized_state)
            ep_actions.append(action)
            ep_rewards.append(reward)
            ep_dones.append(done)
            ep_next_states.append(
                next_state if obs_type == 0 else normalized_next_state)

            if done:
                states.extend(ep_states)
                actions.extend(ep_actions)
                rewards.extend(ep_rewards)
                dones.extend(ep_dones)
                next_states.extend(ep_next_states)
                print(
                    f"Iter: {i_iter}, step: {n_step}, episode Reward: {ep_reward}")
                break
            state = next_state

    env.close()

    states = np.r_[states].reshape((-1, num_states))
    next_states = np.r_[next_states].reshape((-1, num_states))
    actions = np.r_[actions].reshape((-1, 1))
    rewards = np.r_[rewards].reshape((-1, 1))
    dones = np.r_[dones].reshape((-1, 1))

    numpy_dict = {
        'obs': states,
        'action': actions,
        'reward': rewards,
        'done': dones,
        'next_obs': next_states
    }  # type: Dict[str, np.ndarray]

    save_path = f"{data_path}/{env_id}" if data_path is not None else env_id
    np.savez(f"{save_path}.npz", **numpy_dict)