Example 1
def save(self, save_path):
    """save model"""
    check_path(save_path)
    pickle.dump(
        (self.policy_net, self.value_net, self.running_state),
        open("{}/{}_trpo.p".format(save_path, self.env_id), "wb"),
    )
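A model saved this way can presumably be restored by unpickling the same tuple. A minimal sketch of a hypothetical load counterpart, assuming the classes behind policy_net, value_net, and running_state are importable so pickle can reconstruct them:

import pickle

# Hypothetical counterpart to save(): unpickles the tuple in the same
# order it was dumped above.
def load(save_path, env_id):
    with open("{}/{}_trpo.p".format(save_path, env_id), "rb") as f:
        policy_net, value_net, running_state = pickle.load(f)
    return policy_net, value_net, running_state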
Example 2
def save(self, save_path):
    """save model"""
    check_path(save_path)

    pickle.dump(
        (self.policy_net, self.q_net_1, self.q_net_2, self.running_state),
        open('{}/{}_sac_alpha.p'.format(save_path, self.env_id), 'wb'))
Example 3
def save_model(self, save_path):
    check_path(save_path)
    # torch.save((self.discriminator, self.policy, self.value), f"{save_path}/{self.exp_name}.pt")
    torch.save(self.discriminator,
               f"{save_path}/{self.env_id}_Discriminator.pt")
    torch.save(self.policy, f"{save_path}/{self.env_id}_Policy.pt")
    torch.save(self.value, f"{save_path}/{self.env_id}_Value.pt")
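The three .pt files saved above can presumably be restored one by one with torch.load. A minimal sketch of a hypothetical load_model counterpart; since whole modules (not state dicts) were saved, the original module classes must be importable:

import torch

# Hypothetical counterpart to save_model(): loads the three networks
# that were saved as whole module objects, one file each.
# On recent PyTorch releases you may need weights_only=False, because
# full modules rather than plain state dicts were pickled.
def load_model(save_path, env_id, device="cpu"):
    discriminator = torch.load(f"{save_path}/{env_id}_Discriminator.pt", map_location=device)
    policy = torch.load(f"{save_path}/{env_id}_Policy.pt", map_location=device)
    value = torch.load(f"{save_path}/{env_id}_Value.pt", map_location=device)
    return discriminator, policy, value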
Example 4
def save(self, save_path):
    """save model"""
    check_path(save_path)
    pickle.dump(self.running_state,
                open('{}/{}_dueling_dqn_tf2.p'.format(save_path, self.env_id), 'wb'))
    self.value_net.save_weights(
        "{}/{}_dueling_dqn_tf2".format(save_path, self.env_id))
Example 5
def save(self, save_path):
    """save model"""
    check_path(save_path)
    pickle.dump(
        self.running_state,
        open('{}/{}_reinforce_tf2.p'.format(save_path, self.env_id), 'wb'))
    self.policy_net.save_weights("{}/{}_reinforce_tf2".format(
        save_path, self.env_id))
Example 6
def save(self, save_path):
    """save model"""
    check_path(save_path)
    pickle.dump(
        self.running_state,
        open("{}/{}_trpo_tf2.p".format(save_path, self.env_id), "wb"),
    )
    self.policy_net.save_weights(
        "{}/{}_trpo_tf2_p".format(save_path, self.env_id)
    )
    self.value_net.save_weights(
        "{}/{}_trpo_tf2_v".format(save_path, self.env_id)
    )
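Restoring would mirror this: unpickle the running-state normalizer and call load_weights on freshly built networks. A minimal sketch of a hypothetical restore helper, assuming the agent's policy_net and value_net have already been constructed with the same architecture:

import pickle

# Hypothetical restore counterpart for the TF2 save() above. The agent's
# policy_net/value_net must already be built with the same architecture,
# since load_weights() only fills in parameter values.
def load(agent, save_path):
    with open("{}/{}_trpo_tf2.p".format(save_path, agent.env_id), "rb") as f:
        agent.running_state = pickle.load(f)
    agent.policy_net.load_weights("{}/{}_trpo_tf2_p".format(save_path, agent.env_id))
    agent.value_net.load_weights("{}/{}_trpo_tf2_v".format(save_path, agent.env_id))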
Example 7
def save(self, save_path):
    """save model"""
    check_path(save_path)
    pickle.dump((self.policy_net, self.value_net, self.running_state),
                open('{}/{}_ppo.p'.format(save_path, self.env_id), 'wb'))
Example 8
def save(self, save_path):
    """save model"""
    check_path(save_path)
    pickle.dump((self.ac_net, self.running_state),
                open('{}/{}_a2c.p'.format(save_path, self.env_id), 'wb'))
Example 9
def main(env_id, n_trajs, model_path, data_path, render, seed, obs_type):
    """
    Collect trajectories from a pre-trained PPO model.
    """
    if data_path is not None:
        check_path(data_path)

    env, _, num_states, num_actions = get_env_info(env_id)

    # seed
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    model = pickle.load(open(model_path, 'rb'))
    model.running_state.fix = True
    states, actions, rewards, dones, next_states = [], [], [], [], []

    for i_iter in range(1, n_trajs + 1):
        state = env.reset()
        ep_reward = 0
        n_step = 0

        ep_states, ep_actions, ep_rewards, ep_dones, ep_next_states = [], [], [], [], []
        while True:
            if render:
                env.render()
            normalized_state = model.running_state(state)
            action = model.choose_action(normalized_state)
            next_state, reward, done, _ = env.step(action)
            normalized_next_state = model.running_state(next_state)

            ep_reward += reward
            n_step += 1

            ep_states.append(state if obs_type == 0 else normalized_state)
            ep_actions.append(action)
            ep_rewards.append(reward)
            ep_dones.append(done)
            ep_next_states.append(
                next_state if obs_type == 0 else normalized_next_state)

            if done:
                states.extend(ep_states)
                actions.extend(ep_actions)
                rewards.extend(ep_rewards)
                dones.extend(ep_dones)
                next_states.extend(ep_next_states)
                print(
                    f"Iter: {i_iter}, step: {n_step}, episode Reward: {ep_reward}")
                break
            state = next_state

    env.close()

    states = np.r_[states].reshape((-1, num_states))
    next_states = np.r_[next_states].reshape((-1, num_states))
    actions = np.r_[actions].reshape((-1, 1))
    rewards = np.r_[rewards].reshape((-1, 1))
    dones = np.r_[dones].reshape((-1, 1))

    numpy_dict = {
        'obs': states,
        'action': actions,
        'reward': rewards,
        'done': dones,
        'next_obs': next_states
    }  # type: Dict[str, np.ndarray]

    save_path = f"{data_path}/{env_id}" if data_path is not None else env_id
    np.savez(f"{save_path}.npz", **numpy_dict)
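The .npz written here stores one array per dictionary key, so a downstream imitation-learning script can presumably read the transitions back with np.load. A minimal sketch; the file name is a placeholder following the f"{save_path}.npz" pattern above:

import numpy as np

# Hypothetical consumer of the trajectory file produced by main();
# each key maps to a 2-D array with one row per transition.
data = np.load("expert_traj/Hopper-v2.npz")  # placeholder path
states, actions = data["obs"], data["action"]
rewards, dones, next_states = data["reward"], data["done"], data["next_obs"]
print(states.shape, actions.shape)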