def _init_model(self): """init model from parameters""" self.env, env_continuous, num_states, num_actions = get_env_info( self.env_id) # seeding torch.manual_seed(self.seed) self.env.seed(self.seed) if env_continuous: self.policy_net = Policy(num_states, num_actions).to(device) else: self.policy_net = DiscretePolicy(num_states, num_actions).to(device) self.value_net = Value(num_states).to(device) self.ac_net = Actor_Critic(self.policy_net, self.value_net).to(device) self.running_state = ZFilter((num_states, ), clip=5) if self.model_path: print("Loading Saved Model {}_a2c.p".format(self.env_id)) self.ac_net, self.running_state = pickle.load( open('{}/{}_a2c.p'.format(self.model_path, self.env_id), "rb")) self.collector = MemoryCollector(self.env, self.ac_net, render=self.render, running_state=self.running_state, num_process=self.num_process) self.optimizer_ac = optim.Adam(self.ac_net.parameters(), lr=self.lr_ac)
def _init_model(self): """init model from parameters""" self.env, env_continuous, num_states, num_actions = get_env_info( self.env_id) tf.keras.backend.set_floatx('float64') # seeding np.random.seed(self.seed) tf.random.set_seed(self.seed) self.env.seed(self.seed) if env_continuous: self.policy_net = Policy(num_states, num_actions) # current policy else: self.policy_net = DiscretePolicy(num_states, num_actions) self.running_state = ZFilter((num_states, ), clip=5) if self.model_path: print("Loading Saved Model {}_reinforce_tf2.p".format(self.env_id)) self.running_state = pickle.load( open( '{}/{}_reinforce_tf2.p'.format(self.model_path, self.env_id), "rb")) self.policy_net.load_weights("{}/{}_reinforce_tf2".format( self.model_path, self.env_id)) self.collector = MemoryCollector(self.env, self.policy_net, render=self.render, running_state=self.running_state, num_process=self.num_process) self.optimizer_p = optim.Adam(lr=self.lr_p, clipnorm=20)
def _init_model(self): """init model from parameters""" self.env, env_continuous, num_states, self.num_actions = get_env_info( self.env_id) assert not env_continuous, "DQN is only applicable to discontinuous environment !!!!" # seeding np.random.seed(self.seed) torch.manual_seed(self.seed) self.env.seed(self.seed) # initialize networks self.value_net = QNet_dqn(num_states, self.num_actions).to(device) self.value_net_target = QNet_dqn(num_states, self.num_actions).to(device) self.running_state = ZFilter((num_states, ), clip=5) # load model if necessary if self.model_path: print("Loading Saved Model {}_dqn.p".format(self.env_id)) self.value_net, self.running_state = pickle.load( open('{}/{}_dqn.p'.format(self.model_path, self.env_id), "rb")) self.value_net_target.load_state_dict(self.value_net.state_dict()) self.optimizer = optim.Adam(self.value_net.parameters(), lr=self.lr_q)
def _init_model(self): """init model from parameters""" self.env, env_continuous, num_states, self.num_actions = get_env_info( self.env_id) assert not env_continuous, "DoubleQN is only applicable to discontinuous environment !!!!" tf.keras.backend.set_floatx('float64') # seeding np.random.seed(self.seed) tf.random.set_seed(self.seed) self.env.seed(self.seed) # initialize networks self.value_net = QNet_dqn(num_states, self.num_actions) self.value_net_target = QNet_dqn(num_states, self.num_actions) self.running_state = ZFilter((num_states, ), clip=5) # load model if necessary if self.model_path: print("Loading Saved Model {}_double_dqn_tf2.p".format( self.env_id)) self.running_state = pickle.load( open( '{}/{}_double_dqn_tf2.p'.format(self.model_path, self.env_id), "rb")) self.value_net.load_weights("{}/{}_double_dqn_tf2".format( self.model_path, self.env_id)) self.value_net_target.set_weights(self.value_net.get_weights()) self.optimizer = optim.Adam(lr=self.lr_q)
def _init_model(self): """init model from parameters""" self.env, env_continuous, num_states, self.num_actions = get_env_info(self.env_id) assert env_continuous, "SAC is only applicable to continuous environment !!!!" self.action_low, self.action_high = self.env.action_space.low[0], self.env.action_space.high[0] # seeding np.random.seed(self.seed) torch.manual_seed(self.seed) self.env.seed(self.seed) self.policy_net = Policy(num_states, self.num_actions, max_action=self.action_high).double().to(device) self.value_net = Value(num_states).double().to(device) self.value_net_target = Value(num_states).double().to(device) self.q_net_1 = QValue(num_states, self.num_actions).double().to(device) self.q_net_2 = QValue(num_states, self.num_actions).double().to(device) self.running_state = ZFilter((num_states,), clip=5) if self.model_path: print("Loading Saved Model {}_sac.p".format(self.env_id)) self.policy_net, self.value_net, self.q_net_1, self.q_net_2, self.running_state \ = pickle.load(open('{}/{}_sac.p'.format(self.model_path, self.env_id), "rb")) self.value_net_target.load_state_dict(self.value_net.state_dict()) self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p) self.optimizer_v = optim.Adam(self.value_net.parameters(), lr=self.lr_v) self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(), lr=self.lr_q) self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(), lr=self.lr_q)
def main(env_id, n_trajs, model_path, data_path, render, seed):
    """
    Collect trajectories from pre-trained models by PPO
    """
    env, _, num_states, num_actions = get_env_info(env_id)

    # seed
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    states, actions, rewards, ep_rewards = [], [], [], []

    model = pickle.load(open(model_path, 'rb'))
    model.running_state.fix = True

    for i_iter in range(1, n_trajs + 1):
        state = env.reset()
        ep_reward = 0
        n_step = 0

        while True:
            if render:
                env.render()
            state = model.running_state(state)
            action, _ = model.choose_action(state)
            action = action.cpu().numpy()[0]
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            n_step += 1

            states.append(state)
            actions.append(action)
            rewards.append(reward)

            if done:
                ep_rewards.append(ep_reward)
                print(
                    f"Iter: {i_iter}, step: {n_step}, episode Reward: {ep_reward}"
                )
                break

    env.close()

    states = np.r_[states].reshape((-1, num_states))
    actions = np.r_[actions].reshape((-1, num_actions))
    rewards = np.r_[rewards].reshape((-1, 1))
    ep_rewards = np.r_[ep_rewards].reshape((n_trajs, -1))

    numpy_dict = {
        'state': states,
        'action': actions,
        'reward': rewards,
        'ep_reward': ep_rewards,
    }  # type: Dict[str, np.ndarray]

    if data_path is not None:
        np.savez(f"{data_path}/{env_id}.npz", **numpy_dict)
def _init_model(self):
    # seeding
    seed = self.config["train"]["general"]["seed"]
    torch.manual_seed(seed)
    np.random.seed(seed)

    self.env, env_continuous, num_states, num_actions = get_env_info(
        self.env_id)

    # check env
    assert num_states == self.expert_dataset.num_states and num_actions == self.expert_dataset.num_actions, \
        "Expected corresponding expert dataset and env"

    dim_dict = {"dim_state": num_states, "dim_action": num_actions}
    self.config["value"].update(dim_dict)
    self.config["policy"].update(dim_dict)
    self.config["discriminator"].update(dim_dict)

    self.value = Value(dim_state=self.config["value"]["dim_state"],
                       dim_hidden=self.config["value"]["dim_hidden"],
                       activation=resolve_activate_function(
                           self.config["value"]["activation"]))
    self.policy = Policy(config=self.config["policy"])
    self.discriminator = Discriminator(
        dim_state=self.config["discriminator"]["dim_state"],
        dim_action=self.config["discriminator"]["dim_action"],
        dim_hidden=self.config["discriminator"]["dim_hidden"],
        activation=resolve_activate_function(
            self.config["discriminator"]["activation"]))
    self.discriminator_func = nn.BCELoss()

    self.running_state = None
    self.collector = MemoryCollector(self.env,
                                     self.policy,
                                     render=self.render,
                                     running_state=self.running_state,
                                     num_process=self.num_process)

    print("Model Structure")
    print(self.policy)
    print(self.value)
    print(self.discriminator)
    print()

    self.optimizer_policy = optim.Adam(
        self.policy.parameters(),
        lr=self.config["policy"]["learning_rate"])
    self.optimizer_value = optim.Adam(
        self.value.parameters(),
        lr=self.config["value"]["learning_rate"])
    self.optimizer_discriminator = optim.Adam(
        self.discriminator.parameters(),
        lr=self.config["discriminator"]["learning_rate"])

    to_device(self.value, self.policy, self.discriminator,
              self.discriminator_func)
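
# A minimal sketch (an assumption, not the original update loop) of how the
# discriminator initialized above can be trained with self.discriminator_func
# (BCELoss): generated state-action pairs are pushed toward label 0 and expert
# pairs toward label 1. The call signature `self.discriminator(states, actions)`
# and the argument names are assumptions for illustration.
def _update_discriminator_sketch(self, gen_states, gen_actions,
                                 expert_states, expert_actions):
    gen_out = self.discriminator(gen_states, gen_actions)
    expert_out = self.discriminator(expert_states, expert_actions)
    loss = self.discriminator_func(gen_out, torch.zeros_like(gen_out)) + \
        self.discriminator_func(expert_out, torch.ones_like(expert_out))
    self.optimizer_discriminator.zero_grad()
    loss.backward()
    self.optimizer_discriminator.step()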
def _init_model(self): """init model from parameters""" self.env, env_continuous, num_states, self.num_actions = get_env_info( self.env_id) assert env_continuous, "SAC is only applicable to continuous environment !!!!" self.action_low, self.action_high = self.env.action_space.low[ 0], self.env.action_space.high[0] self.target_entropy = -np.prod(self.env.action_space.shape) # seeding np.random.seed(self.seed) torch.manual_seed(self.seed) self.env.seed(self.seed) self.policy_net = Policy(num_states, self.num_actions, max_action=self.action_high, use_sac=True).double().to(device) self.q_net_1 = QValue(num_states, self.num_actions).double().to(device) self.q_net_target_1 = QValue(num_states, self.num_actions).double().to(device) self.q_net_2 = QValue(num_states, self.num_actions).double().to(device) self.q_net_target_2 = QValue(num_states, self.num_actions).double().to(device) # self.alpha init self.alpha = torch.exp(torch.zeros( 1, device=device).double()).requires_grad_() self.running_state = ZFilter((num_states, ), clip=5) if self.model_path: print("Loading Saved Model {}_sac_alpha.p".format(self.env_id)) self.policy_net, self.q_net_1, self.q_net_2, self.running_state \ = pickle.load(open('{}/{}_sac_alpha.p'.format(self.model_path, self.env_id), "rb")) self.q_net_target_1.load_state_dict(self.q_net_1.state_dict()) self.q_net_target_2.load_state_dict(self.q_net_2.state_dict()) self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p) self.optimizer_a = optim.Adam([self.alpha], lr=self.lr_a) self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(), lr=self.lr_q) self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(), lr=self.lr_q)
def _init_model(self): """init model from parameters""" self.env, env_continuous, num_states, num_actions = get_env_info( self.env_id ) tf.keras.backend.set_floatx("float64") # seeding np.random.seed(self.seed) tf.random.set_seed(self.seed) self.env.seed(self.seed) if env_continuous: self.policy_net = Policy(num_states, num_actions) else: self.policy_net = DiscretePolicy(num_states, num_actions) self.value_net = Value(num_states, l2_reg=1e-3) self.running_state = ZFilter((num_states,), clip=5) if self.model_path: print("Loading Saved Model {}_trpo_tf2.p".format(self.env_id)) self.running_state = pickle.load( open( "{}/{}_trpo_tf2.p".format(self.model_path, self.env_id), "rb", ) ) self.policy_net.load_weights( "{}/{}_trpo_tf2_p".format(self.model_path, self.env_id) ) self.value_net.load_weights( "{}/{}_trpo_tf2_v".format(self.model_path, self.env_id) ) self.collector = MemoryCollector( self.env, self.policy_net, render=self.render, running_state=self.running_state, num_process=self.num_process, ) self.optimizer_v = optim.Adam(lr=self.lr_v)
def main(env_id, n_trajs, model_path, data_path, render, seed, obs_type):
    """
    Collect trajectories from pre-trained models by PPO
    """
    if data_path is not None:
        check_path(data_path)

    env, _, num_states, num_actions = get_env_info(env_id)

    # seed
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    model = pickle.load(open(model_path, 'rb'))
    model.running_state.fix = True

    states, actions, rewards, dones, next_states = [], [], [], [], []

    for i_iter in range(1, n_trajs + 1):
        state = env.reset()
        ep_reward = 0
        n_step = 0
        ep_states, ep_actions, ep_rewards, ep_dones, ep_next_states = [], [], [], [], []

        while True:
            if render:
                env.render()
            normalized_state = model.running_state(state)
            action = model.choose_action(normalized_state)
            next_state, reward, done, _ = env.step(action)
            normalized_next_state = model.running_state(next_state)
            ep_reward += reward
            n_step += 1

            ep_states.append(state if obs_type == 0 else normalized_state)
            ep_actions.append(action)
            ep_rewards.append(reward)
            ep_dones.append(done)
            ep_next_states.append(
                next_state if obs_type == 0 else normalized_next_state)

            if done:
                states.extend(ep_states)
                actions.extend(ep_actions)
                rewards.extend(ep_rewards)
                dones.extend(ep_dones)
                next_states.extend(ep_next_states)
                print(
                    f"Iter: {i_iter}, step: {n_step}, episode Reward: {ep_reward}")
                break

            state = next_state

    env.close()

    states = np.r_[states].reshape((-1, num_states))
    next_states = np.r_[next_states].reshape((-1, num_states))
    actions = np.r_[actions].reshape((-1, 1))
    rewards = np.r_[rewards].reshape((-1, 1))
    dones = np.r_[dones].reshape((-1, 1))

    numpy_dict = {
        'obs': states,
        'action': actions,
        'reward': rewards,
        'done': dones,
        'next_obs': next_states
    }  # type: Dict[str, np.ndarray]

    save_path = f"{data_path}/{env_id}" if data_path is not None else env_id
    np.savez(f"{save_path}.npz", **numpy_dict)
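
# A minimal sketch of reading the saved trajectory file back, e.g. when
# building an expert dataset for imitation learning. The file path below is an
# illustrative assumption; the keys match the numpy_dict saved above.
expert_data = np.load("data/BipedalWalker-v3.npz")
obs = expert_data["obs"]            # (N, num_states)
actions = expert_data["action"]     # (N, 1)
rewards = expert_data["reward"]     # (N, 1)
dones = expert_data["done"]         # (N, 1)
next_obs = expert_data["next_obs"]  # (N, num_states)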