def __init__(self, base_model_paths, switch_path, device, soft_choice=False):
    super(SwitchController, self).__init__()
    # Load each frozen base controller (actor network) from disk.
    self.base_models = []
    for base_model_path in base_model_paths:
        base_model = Actor(state_size=2, action_size=1, seed=0,
                           fc1_units=25).to(device)
        base_model.load_state_dict(
            torch.load(base_model_path, map_location=device))
        base_model.eval()
        self.base_models.append(base_model)
    # Load the DQN that switches between the base controllers.
    self.switch_model = DQN(2, 2).to(device)
    self.switch_model.load_state_dict(
        torch.load(switch_path, map_location=device))
    self.switch_model.eval()
    self.soft_choice = soft_choice
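# The fragment above only shows the constructor. Below is a minimal sketch of
# how the switching step could work, assuming the DQN assigns one score per
# base controller and that soft_choice means a softmax-weighted blend of the
# base actions; the helper name and these semantics are assumptions, not taken
# from the source.
import torch


def switch_action(controller, state):
    # Hypothetical helper: pick (or blend) the base controllers' actions
    # according to the switch network's scores.
    with torch.no_grad():
        scores = controller.switch_model(state)
        actions = torch.stack([m(state) for m in controller.base_models])
        if controller.soft_choice:
            weights = torch.softmax(scores, dim=-1).unsqueeze(-1)
            return (weights * actions).sum(dim=0)
        return actions[scores.argmax()]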
# This file records the NN controller parameters into a txt file, to be used
# for Bernstein polynomial approximation by the ReachNN tool.
from Model import IndividualModel, Actor
import torch
import numpy as np

# NAME = 'direct_distill'
# trained_model = IndividualModel(state_size=3, action_size=1, seed=0, fc1_units=25)
# trained_model.load_state_dict(torch.load('./' + NAME + '.pth'))
# trained_model.eval()

trained_model = Actor(state_size=3, action_size=1, seed=0, fc1_units=25)
trained_model.load_state_dict(torch.load("./actors/actor_0.43600.pth"))
trained_model.eval()

# Collect the weight and bias tensors of every layer, in order.
bias_list = []
weight_list = []
for name, param in trained_model.named_parameters():
    if 'bias' in name:
        bias_list.append(param.detach().cpu().numpy())
    if 'weight' in name:
        weight_list.append(param.detach().cpu().numpy())

print(len(weight_list), np.linalg.norm(weight_list[0]),
      np.linalg.norm(weight_list[1]))
# assert False

# Flatten the parameters layer by layer: for each neuron, its incoming
# weights followed by its bias.
all_param = []
for i in range(len(bias_list)):
    for j in range(len(bias_list[i])):
        for k in range(weight_list[i].shape[1]):
            all_param.append(weight_list[i][j, k])
        all_param.append(bias_list[i][j])
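# The fragment above stops after building all_param and never shows the write
# step. A minimal sketch of that step, assuming one parameter per line in a
# plain-text file (the file name and exact layout ReachNN expects are
# assumptions here):
with open("nn_controller_params.txt", "w") as f:
    for p in all_param:
        f.write("{}\n".format(float(p)))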
USE_CUDA = torch.cuda.is_available()
Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)

batch_size = 128
gamma = 0.99
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 3000
replay_buffer = ReplayBuffer(int(5e3))
epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_1 = Actor(state_size=3, action_size=1, seed=0, fc1_units=25).to(device)
model_1.load_state_dict(torch.load("./actors/actor_0.43600.pth"))
model_1.eval()

# model_2 = IndividualModel(state_size=3, action_size=1, seed=0, fc1_units=50).to(device)
# model_2.load_state_dict(torch.load("./actors/actor_1.0_2800.pth"))
# model_2.eval()


def MController(state):
    action = 0.634 * state[0] - 0.296 * state[1] - 0.153 * state[2] \
        + 0.053 * state[0] ** 2 - 1.215 * state[0] ** 3
    return action


Individual = IndividualModel(state_size=3, action_size=1, seed=0,
                             fc1_units=25).to(device)
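# Quick sanity check of the exploration schedule defined above: epsilon starts
# at 1.0 and decays exponentially toward epsilon_final = 0.01 with a time
# constant of epsilon_decay = 3000 frames.
for frame in (0, 3000, 9000, 30000):
    print(frame, round(epsilon_by_frame(frame), 3))
# Prints roughly: 1.0, 0.374, 0.059, 0.01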
    def __len__(self):
        return len(self.buffer)


USE_CUDA = torch.cuda.is_available()
Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)

batch_size = 128
gamma = 0.99
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 3000
replay_buffer = ReplayBuffer(int(5e3))
epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_1 = Actor(state_size=2, action_size=1, seed=0, fc1_units=25,
                fc2_units=None).to(device)
model_1.load_state_dict(torch.load("./models/actor_2800.pth"))
model_1.eval()

model_2 = Actor(state_size=2, action_size=1, seed=0, fc1_units=25).to(device)
model_2.load_state_dict(torch.load("./0731actors/actor_2400.pth"))
model_2.eval()

Individual = Individualtanh(state_size=2, action_size=1, seed=0,
                            fc1_units=25).to(device)
agent = Agent(state_size=2, action_size=2, random_seed=0, fc1_units=None,
              fc2_units=None, weighted=True)
ppo = PPO(2, 2, method='clip')
ppo.load_model(3000, 1)


def mkdir(path):
    # Create the directory if it does not already exist.
    folder = os.path.exists(path)
    if not folder:
        os.makedirs(path)
USE_CUDA = torch.cuda.is_available()
Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)

batch_size = 128
gamma = 0.99
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 3000
replay_buffer = ReplayBuffer(int(5e3))
epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_1 = Actor(state_size=4, action_size=1, seed=0).to(device)
model_1.load_state_dict(torch.load("./actor5000_1.pth"))
model_1.eval()

model_2 = Actor(state_size=4, action_size=1, seed=0).to(device)
model_2.load_state_dict(torch.load("./actor4850_1.pth"))
model_2.eval()

Individual = Individualtanh(state_size=4, action_size=1, seed=0,
                            fc1_units=50).to(device)
agent = Agent(state_size=4, action_size=2, random_seed=0)
ppo = PPO(4, 2, method='penalty')
ppo.load_model(5499, 1)
import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Uniform

# The Actor and Critic network definitions are imported from elsewhere in
# this repo.


class DDPG:
    def __init__(self, env, tau=1e-3, gamma=0.99, batch_size=64,
                 depsilon=50000):
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        self.policy = Actor(self.num_states, self.num_actions).train()
        self.policy_target = Actor(self.num_states, self.num_actions).eval()
        self.hard_update(self.policy, self.policy_target)

        self.critic = Critic(self.num_states, self.num_actions).train()
        self.critic_target = Critic(self.num_states, self.num_actions).eval()
        self.hard_update(self.critic, self.critic_target)

        self.critic_loss = nn.MSELoss()
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.epsilon = 1.0
        self.depsilon = 1.0 / float(depsilon)

        self.opt_critic = torch.optim.Adam(self.critic.parameters(), lr=1e-3)
        self.opt_policy = torch.optim.Adam(self.policy.parameters(), lr=1e-4)

        self.policy.cuda()
        self.policy_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def train(self, buffer):
        b_state, b_action, b_reward, b_state_next, b_term = buffer.sample(
            self.batch_size)

        # Bootstrapped target value from the target networks.
        with torch.no_grad():
            action_target = self.policy_target(b_state_next)
            Q_prime = self.critic_target(b_state_next, action_target)

        # Critic update: regress Q(s, a) toward r + gamma * Q'(s', pi'(s')).
        self.opt_critic.zero_grad()
        Q = self.critic(b_state, b_action)
        L_critic = self.critic_loss(
            Q, b_reward + self.gamma * Q_prime * (1.0 - b_term))
        L_critic.backward()
        self.opt_critic.step()

        # Policy update: ascend the critic's estimate of Q(s, pi(s)).
        self.opt_policy.zero_grad()
        action = self.policy(b_state)
        L_Q = -1.0 * self.critic(b_state, action).mean()
        L_Q.backward()
        self.opt_policy.step()

        self.soft_update(self.critic, self.critic_target)
        self.soft_update(self.policy, self.policy_target)
        return L_critic.item(), L_Q.item()

    def get_entropy(self, buffer, m=5, n=100):
        # Estimate the entropy of the policy's action distribution over
        # uniformly sampled pendulum states with an m-spacing estimator.
        # b_state, b_action, b_reward, b_state_next, b_term = buffer.sample(n)
        b_angle = torch.rand(n) * np.pi * 2.0
        b_speed = 2.0 * (torch.rand(n) - 0.5) * 8.0
        b_state = torch.stack(
            [torch.cos(b_angle), torch.sin(b_angle), b_speed],
            dim=1).to(device='cuda', dtype=torch.float32)
        coef = torch.zeros(n, dtype=b_state.dtype, device=b_state.device)
        with torch.no_grad():
            action = self.policy(b_state)
        X, ind = torch.sort(action, dim=0)
        for i in range(n):
            if i < m:
                c = 1
                a = X[i + m]
                b = X[0]
            elif i >= m and i < n - m:
                c = 2
                a = X[i + m]
                b = X[i - m]
            else:
                c = 1
                a = X[n - 1]
                b = X[i - m]
            coef[i] = float(n) * float(c) / float(m) * (a - b + 1E-5)
        S = torch.log(coef).mean()
        return S.item()

    def get_value(self, state, action):
        with torch.no_grad():
            return self.critic(state, action).item()

    def select_action(self, state, random_process):
        with torch.no_grad():
            action = self.policy(state)
        # Add exploration noise whose scale is annealed via epsilon.
        noise = max(self.epsilon, 0.0) * random_process.sample()
        self.epsilon -= self.depsilon
        action += torch.from_numpy(noise).to(device=action.device,
                                             dtype=action.dtype)
        action = torch.clamp(action, -1, 1)
        return action

    def random_action(self):
        m = Uniform(torch.tensor([-1.0 for i in range(self.num_actions)]),
                    torch.tensor([1.0 for i in range(self.num_actions)]))
        return m.sample()

    def soft_update(self, src, dst):
        # Polyak averaging: dst <- tau * src + (1 - tau) * dst.
        with torch.no_grad():
            for src_param, dst_param in zip(src.parameters(),
                                            dst.parameters()):
                dst_param.copy_(self.tau * src_param +
                                (1.0 - self.tau) * dst_param)

    def hard_update(self, src, dst):
        with torch.no_grad():
            for src_param, dst_param in zip(src.parameters(),
                                            dst.parameters()):
                dst_param.copy_(src_param.clone())

    def load_weights(self, path):
        self.policy.load_state_dict(torch.load('{}/policy.pkl'.format(path)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(path)))

    def save_model(self, path):
        torch.save(self.policy.to(device='cpu').state_dict(),
                   '{}/policy.pkl'.format(path))
        torch.save(self.critic.to(device='cpu').state_dict(),
                   '{}/critic.pkl'.format(path))
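# For orientation, a minimal sketch of how this agent might be driven. The
# env, buffer, random_process, and num_episodes names below are illustrative
# assumptions, not objects defined in this file; the buffer is assumed to
# expose push(), sample(), and __len__ as used by DDPG.train().
def run_training(env, buffer, random_process, num_episodes=100):
    agent = DDPG(env)
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.select_action(state, random_process)
            state_next, reward, done, _ = env.step(action.cpu().numpy())
            buffer.push(state, action, reward, state_next, done)
            if len(buffer) >= agent.batch_size:
                agent.train(buffer)
            state = state_next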
num_train = 200000
num_eval = 0
buffer_length = 600000

# env = NormalizedEnv(gym.make('Pendulum-v0'))
GODOT_BIN_PATH = "InvPendulum/InvPendulum.x86_64"
env_abs_path = "InvPendulum/InvPendulum.pck"
env = NormalizedEnv(
    InvPendulumEnv(exec_path=GODOT_BIN_PATH, env_path=env_abs_path,
                   render=True))
num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

policy = Actor(num_states, num_actions)
policy.load_state_dict(torch.load('./policy.pkl'))

state = env.reset()
state = state.to(dtype=torch.float32)

traced_policy = torch.jit.trace(policy, state)
print(traced_policy.graph)
print(traced_policy.code)
traced_policy.save('ddpg_policy.jit')

for step in range(1000):
    action = policy(state)
    # torch.tensor([1.0 for i in range(num_actions)])).sample().to(device='cuda')
    time.sleep(0.02)
    # state_next, reward, term, _ = env.step(action.cpu().numpy())
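# Once ddpg_policy.jit is written, the traced module can be reloaded without
# the Actor class definition. A minimal sketch of the deployment side (not
# shown in this repo); the example state shape is an assumption:
import torch

loaded_policy = torch.jit.load('ddpg_policy.jit')
loaded_policy.eval()
with torch.no_grad():
    example_state = torch.zeros(3, dtype=torch.float32)  # shape assumed
    action = loaded_policy(example_state)
print(action)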