def _init_model(self):
        """Init env, policy network, state filter, sampler and optimizer.

        Builds the policy head matching the action space, seeds all RNGs,
        optionally restores a saved running-state filter and policy weights
        from ``self.model_path``, and wires up the memory collector.
        """
        self.env, env_continuous, num_states, num_actions = get_env_info(
            self.env_id)
        tf.keras.backend.set_floatx('float64')
        # seeding for reproducibility
        np.random.seed(self.seed)
        tf.random.set_seed(self.seed)
        self.env.seed(self.seed)

        # Gaussian policy for continuous actions, categorical otherwise
        if env_continuous:
            self.policy_net = Policy(num_states, num_actions)  # current policy
        else:
            self.policy_net = DiscretePolicy(num_states, num_actions)

        self.running_state = ZFilter((num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_reinforce_tf2.p".format(self.env_id))
            # context manager so the checkpoint file handle is always closed
            # NOTE(review): pickle.load can execute arbitrary code — only load
            # trusted checkpoint files
            with open(
                    '{}/{}_reinforce_tf2.p'.format(self.model_path,
                                                   self.env_id), "rb") as f:
                self.running_state = pickle.load(f)
            self.policy_net.load_weights("{}/{}_reinforce_tf2".format(
                self.model_path, self.env_id))

        self.collector = MemoryCollector(self.env,
                                         self.policy_net,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        # clipnorm bounds the global gradient norm for training stability
        self.optimizer_p = optim.Adam(lr=self.lr_p, clipnorm=20)
# Example #2
    def _init_model(self):
        """Init env, actor/critic networks, temperature and optimizers for SAC.

        Requires a continuous action space. Restores policy, both Q networks
        and the running-state filter from ``self.model_path`` when set, then
        syncs the target Q networks from the live ones.
        """
        self.env, env_continuous, num_states, self.num_actions = get_env_info(
            self.env_id)
        assert env_continuous, "SAC is only applicable to continuous environment !!!!"

        # action bounds taken from the first dimension (assumes symmetric box)
        self.action_low, self.action_high = self.env.action_space.low[
            0], self.env.action_space.high[0]
        # standard SAC heuristic: target entropy = -|A|
        self.target_entropy = -np.prod(self.env.action_space.shape)
        # seeding for reproducibility
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(num_states,
                                 self.num_actions,
                                 max_action=self.action_high,
                                 use_sac=True).double().to(device)

        # twin Q networks (clipped double-Q) with their targets
        self.q_net_1 = QValue(num_states, self.num_actions).double().to(device)
        self.q_net_target_1 = QValue(num_states,
                                     self.num_actions).double().to(device)
        self.q_net_2 = QValue(num_states, self.num_actions).double().to(device)
        self.q_net_target_2 = QValue(num_states,
                                     self.num_actions).double().to(device)

        # learnable temperature, initialized to exp(0) = 1.0
        self.alpha = torch.exp(torch.zeros(
            1, device=device).double()).requires_grad_()

        self.running_state = ZFilter((num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_sac_alpha.p".format(self.env_id))
            # context manager so the checkpoint file handle is always closed
            # NOTE(review): pickle.load can execute arbitrary code — only load
            # trusted checkpoint files
            with open(
                    '{}/{}_sac_alpha.p'.format(self.model_path, self.env_id),
                    "rb") as f:
                self.policy_net, self.q_net_1, self.q_net_2, \
                    self.running_state = pickle.load(f)

        # hard-copy live Q weights into the targets
        self.q_net_target_1.load_state_dict(self.q_net_1.state_dict())
        self.q_net_target_2.load_state_dict(self.q_net_2.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_a = optim.Adam([self.alpha], lr=self.lr_a)
        self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(),
                                        lr=self.lr_q)
        self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(),
                                        lr=self.lr_q)
# Example #3
    def _init_model(self):
        """Init env, policy/value networks, state filter, sampler for TRPO.

        Restores the running-state filter, policy weights and value weights
        from ``self.model_path`` when a checkpoint path is configured.
        """
        self.env, env_continuous, num_states, num_actions = get_env_info(
            self.env_id
        )

        tf.keras.backend.set_floatx("float64")

        # seeding for reproducibility
        np.random.seed(self.seed)
        tf.random.set_seed(self.seed)
        self.env.seed(self.seed)

        # Gaussian policy for continuous actions, categorical otherwise
        if env_continuous:
            self.policy_net = Policy(num_states, num_actions)
        else:
            self.policy_net = DiscretePolicy(num_states, num_actions)

        self.value_net = Value(num_states, l2_reg=1e-3)
        self.running_state = ZFilter((num_states,), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_trpo_tf2.p".format(self.env_id))
            # context manager so the checkpoint file handle is always closed
            # NOTE(review): pickle.load can execute arbitrary code — only load
            # trusted checkpoint files
            with open(
                "{}/{}_trpo_tf2.p".format(self.model_path, self.env_id),
                "rb",
            ) as f:
                self.running_state = pickle.load(f)
            self.policy_net.load_weights(
                "{}/{}_trpo_tf2_p".format(self.model_path, self.env_id)
            )
            self.value_net.load_weights(
                "{}/{}_trpo_tf2_v".format(self.model_path, self.env_id)
            )

        self.collector = MemoryCollector(
            self.env,
            self.policy_net,
            render=self.render,
            running_state=self.running_state,
            num_process=self.num_process,
        )

        # TRPO updates the policy with a trust-region step, so only the
        # value network gets a gradient-descent optimizer
        self.optimizer_v = optim.Adam(lr=self.lr_v)
    def _init_model(self):
        """Init env, actor/critic networks, targets and optimizers for TD3.

        Requires a continuous action space. Restores the policy, both value
        networks and the running-state filter from ``self.model_path`` when
        set, then hard-syncs every target network from its live counterpart.
        """
        self.env, env_continuous, num_states, self.num_actions = get_env_info(
            self.env_id)
        assert env_continuous, "TD3 is only applicable to continuous environment !!!!"

        # action bounds taken from the first dimension (assumes symmetric box)
        self.action_low, self.action_high = self.env.action_space.low[
            0], self.env.action_space.high[0]
        # seeding for reproducibility
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(num_states, self.num_actions,
                                 self.action_high).double().to(device)
        self.policy_net_target = Policy(num_states, self.num_actions,
                                        self.action_high).double().to(device)

        # twin critics (clipped double-Q) with their targets
        self.value_net_1 = Value(num_states,
                                 self.num_actions).double().to(device)
        self.value_net_target_1 = Value(num_states,
                                        self.num_actions).double().to(device)
        self.value_net_2 = Value(num_states,
                                 self.num_actions).double().to(device)
        self.value_net_target_2 = Value(num_states,
                                        self.num_actions).double().to(device)

        self.running_state = ZFilter((num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_td3.p".format(self.env_id))
            # context manager so the checkpoint file handle is always closed
            # NOTE(review): pickle.load can execute arbitrary code — only load
            # trusted checkpoint files
            with open('{}/{}_td3.p'.format(self.model_path, self.env_id),
                      "rb") as f:
                self.policy_net, self.value_net_1, self.value_net_2, \
                    self.running_state = pickle.load(f)

        # hard-copy live weights into the target networks
        self.policy_net_target.load_state_dict(self.policy_net.state_dict())
        self.value_net_target_1.load_state_dict(self.value_net_1.state_dict())
        self.value_net_target_2.load_state_dict(self.value_net_2.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v_1 = optim.Adam(self.value_net_1.parameters(),
                                        lr=self.lr_v)
        self.optimizer_v_2 = optim.Adam(self.value_net_2.parameters(),
                                        lr=self.lr_v)
# Example #5
    def _init_model(self):
        """Init env, current/old policy, value network and optimizers for PPO.

        Restores the policy, value network and running-state filter from
        ``self.model_path`` when set, then syncs the old policy (used by the
        sampler) from the current one.
        """
        self.env, env_continuous, num_states, num_actions = get_env_info(
            self.env_id)

        # seeding for reproducibility
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        # Gaussian policy for continuous actions, categorical otherwise;
        # PPO keeps a frozen "old" copy to compute probability ratios
        if env_continuous:
            self.policy_net = Policy(num_states, num_actions).double().to(
                device)  # current policy
            self.policy_net_old = Policy(num_states, num_actions).double().to(
                device)  # old policy
        else:
            self.policy_net = DiscretePolicy(num_states,
                                             num_actions).double().to(device)
            self.policy_net_old = DiscretePolicy(
                num_states, num_actions).double().to(device)

        self.value_net = Value(num_states).double().to(device)
        self.running_state = ZFilter((num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_ppo.p".format(self.env_id))
            # context manager so the checkpoint file handle is always closed
            # NOTE(review): pickle.load can execute arbitrary code — only load
            # trusted checkpoint files
            with open('{}/{}_ppo.p'.format(self.model_path, self.env_id),
                      "rb") as f:
                self.policy_net, self.value_net, self.running_state = \
                    pickle.load(f)

        # samples are drawn with the old policy, updated at each iteration
        self.policy_net_old.load_state_dict(self.policy_net.state_dict())
        self.collector = MemoryCollector(self.env,
                                         self.policy_net_old,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(),
                                      lr=self.lr_v)
ppo_epochs = 10

num_iters = 2000
env = gym.make(env_id)
# env = env.unwrapped

# action-space size: number of discrete actions, or action-vector dimension
num_states = env.observation_space.shape[0]
# isinstance (not `type(...) ==`) is the idiomatic check and also
# accepts subclasses of Discrete
if isinstance(env.action_space, Discrete):
    num_actions = env.action_space.n
else:
    num_actions = env.action_space.shape[0]

actor = ActorContinuous(num_states, num_actions).double().to(device)
critic = Critic(num_states).double().to(device)

# normalize observations with a clipped running mean/std filter
running_state = ZFilter((num_states,), clip=5)
agent = MemoryCollector(env, actor, running_state=running_state, num_process=4)

opt_p = opt.Adam(actor.parameters(), lr=lr)
opt_v = opt.Adam(critic.parameters(), lr=lr)


def train(memory):
    batch = memory.sample()
    batch_states = DOUBLE(batch.state).to(device)
    batch_actions = DOUBLE(batch.action).to(device)
    batch_log_probs = DOUBLE(batch.log_prob).to(device)
    batch_masks = DOUBLE(batch.mask).to(device)
    batch_rewards = DOUBLE(batch.reward).to(device)
    batch_size = batch_states.shape[0]