def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.epsilon = EPSILON_MAX

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed, mu=0, theta=0.15, sigma=0.2)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    # Make sure the target networks start with the same weights as the source networks
    self.hard_update(self.actor_target, self.actor_local)
    self.hard_update(self.critic_target, self.critic_local)

    self.t_step = 0
def play(env: gym.Env, path):
    state_shape = env.observation_space.shape
    action_shape = env.action_space.shape
    ubound = env.action_space.high
    lbound = env.action_space.low

    actor = Actor(state_shape, action_shape, ubound)
    critic = Critic(state_shape, action_shape)
    agent = DDPG(actor, critic)
    agent.load(path + "/whole_model/save400")

    print(actor.model.summary())
    print(critic.model.summary())
    print(actor.target_model.summary())
    print(critic.target_model.summary())

    for time in range(EPISODES):
        state = env.reset()
        score = 0
        for _ in range(1000):
            env.render()
            action = agent.choose_action(np.array([state]))[0]
            state, reward, done, info = env.step(action)
            score += reward
        print(score)
    env.close()
def train(env: gym.Env, path):
    state_shape = env.observation_space.shape
    action_shape = env.action_space.shape
    ubound = env.action_space.high
    lbound = env.action_space.low
    # print(state_shape, action_shape)

    actor = Actor(state_shape, action_shape, ubound)
    critic = Critic(state_shape, action_shape)
    agent = DDPG(actor, critic)

    print(actor.model.summary())
    print(critic.model.summary())
    print(actor.target_model.summary())
    print(critic.target_model.summary())

    for time in range(EPISODES):
        state = env.reset()
        score = 0
        agent.noise_generator.reset()
        for _ in range(1000):
            if time > EPISODES / 5:  # for learning optimal solution
                action = agent.choose_action(np.array([state]))[0]
            else:
                action = np.clip(
                    agent.get_action_with_noise(np.array([state])), lbound, ubound)
            next_state, reward, done, info = env.step(action)
            score += reward
            if not done:
                agent.memory.add([state, action, reward, next_state, 1.])
                state = next_state
                if len(agent.memory.memory) > BATCH_SIZE:
                    samples = agent.memory.sample(BATCH_SIZE)
                    agent.train(samples)
            else:
                agent.memory.add([state, action, reward, next_state, 0.])
                break
        print(time, score)
        agent.update_all_target()
        if time % 500 == 0:
            agent.save(path + "/whole_model/save" + str(time))
def __init__(self, base_model_paths, switch_path, device, soft_choice=False):
    super(SwitchController, self).__init__()
    self.base_models = []
    for base_model_path in base_model_paths:
        base_model = Actor(state_size=2, action_size=1, seed=0, fc1_units=25).to(device)
        base_model.load_state_dict(
            torch.load(base_model_path, map_location=device))
        base_model.eval()
        self.base_models.append(base_model)
    self.switch_model = DQN(2, 2).to(device)
    self.switch_model.load_state_dict(
        torch.load(switch_path, map_location=device))
    self.switch_model.eval()
    self.soft_choice = soft_choice
USE_CUDA = torch.cuda.is_available()
Variable = (lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda()
            if USE_CUDA else autograd.Variable(*args, **kwargs))

batch_size = 128
gamma = 0.99
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 3000

replay_buffer = ReplayBuffer(int(5e3))
epsilon_by_frame = lambda frame_idx: epsilon_final + \
    (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_1 = Actor(state_size=3, action_size=1, seed=0, fc1_units=25).to(device)
model_1.load_state_dict(torch.load("./actors/actor_0.43600.pth"))
model_1.eval()

# model_2 = IndividualModel(state_size=3, action_size=1, seed=0, fc1_units=50).to(device)
# model_2.load_state_dict(torch.load("./actors/actor_1.0_2800.pth"))
# model_2.eval()


def MController(state):
    action = (0.634 * state[0] - 0.296 * state[1] - 0.153 * state[2]
              + 0.053 * state[0]**2 - 1.215 * state[0]**3)
    return action


Individual = IndividualModel(state_size=3, action_size=1, seed=0,
def __len__(self):
    return len(self.buffer)


USE_CUDA = torch.cuda.is_available()
Variable = (lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda()
            if USE_CUDA else autograd.Variable(*args, **kwargs))

batch_size = 128
gamma = 0.99
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 3000

replay_buffer = ReplayBuffer(int(5e3))
epsilon_by_frame = lambda frame_idx: epsilon_final + \
    (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_1 = Actor(state_size=2, action_size=1, seed=0, fc1_units=25, fc2_units=None).to(device)
model_1.load_state_dict(torch.load("./models/actor_2800.pth"))
model_1.eval()

model_2 = Actor(state_size=2, action_size=1, seed=0, fc1_units=25).to(device)
model_2.load_state_dict(torch.load("./0731actors/actor_2400.pth"))
model_2.eval()

Individual = Individualtanh(state_size=2, action_size=1, seed=0, fc1_units=25).to(device)

agent = Agent(state_size=2, action_size=2, random_seed=0,
              fc1_units=None, fc2_units=None, weighted=True)

ppo = PPO(2, 2, method='clip')
ppo.load_model(3000, 1)


def mkdir(path):
USE_CUDA = torch.cuda.is_available()
Variable = (lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda()
            if USE_CUDA else autograd.Variable(*args, **kwargs))

batch_size = 128
gamma = 0.99
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 3000

replay_buffer = ReplayBuffer(int(5e3))
epsilon_by_frame = lambda frame_idx: epsilon_final + \
    (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_1 = Actor(state_size=4, action_size=1, seed=0).to(device)
model_1.load_state_dict(torch.load("./actor5000_1.pth"))
model_1.eval()

model_2 = Actor(state_size=4, action_size=1, seed=0).to(device)
model_2.load_state_dict(torch.load("./actor4850_1.pth"))
model_2.eval()

Individual = Individualtanh(state_size=4, action_size=1, seed=0, fc1_units=50).to(device)

agent = Agent(state_size=4, action_size=2, random_seed=0)

ppo = PPO(4, 2, method='penalty')
ppo.load_model(5499, 1)
class Agent():
    """The Agent interacts with and learns from the Environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.
        ----------------------------------------
        Parameters
        ----------------------------------------
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.eps = EPS_START
        self.eps_decay = 1 / (EPS_EP_END * LEARN_NUM)
        self.timestep = 0

        # <--------------- Actor Network ----------->
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # <--------------- Critic Network ---------->
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # <--------------- Noise --------------->
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # <----------- Replay Memory ------------->
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory and sample randomly from the buffer to learn."""
        self.timestep += 1
        # <------- Save experience --------->
        self.memory.add(state, action, reward, next_state, done)
        # <-------- Learn at the given interval, if enough samples are available in memory ----------->
        if len(self.memory) > BATCH_SIZE and self.timestep % LEARN_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, agent_number)

    # <------- Obtain action ------------>
    def act(self, states, add_noise):
        """Return actions for both agents, given their respective states, based on the current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            # Obtain the action for each agent and concatenate them
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        # Add noise to the actions
        if add_noise:
            actions += self.eps * self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update the policy and value parameters using the given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        -----------------------------------------
        Parameters
        -----------------------------------------
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done)
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # <----------------------- Update the Critic -------------------->
        # Get predicted next-state actions and Q-values from the target models
        actions_next = self.actor_target(next_states)
        # Construct the next-actions vector relative to the agent
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        # Compute Q targets for current states (y_i)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute the critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # <----------------------- Update the Actor -------------------->
        actions_pred = self.actor_local(states)
        # Construct the predicted-actions vector relative to each agent
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        # Compute the actor loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # <----------------------- Update the Target Networks -------------------->
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # <----------------------- Update the noise -------------------->
        self.eps -= self.eps_decay
        self.eps = max(self.eps, EPS_FINAL)
        self.noise.reset()

    # <----------------------- Perform Soft Update -------------------->
    def soft_update(self, local_model, target_model, tau):
        """Soft-update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target
        ---------------------------
        Parameters
        ---------------------------
            local_model: PyTorch model whose weights will be copied from
            target_model: PyTorch model whose weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
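# The Agent class above reads several module-level hyperparameters that are defined elsewhere
# in the repository. A sketch of plausible values so the class can be exercised standalone
# (every number below is an assumption, not taken from the original configuration):
BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # L2 weight decay for the critic optimizer
LEARN_EVERY = 1          # learning interval (timesteps)
LEARN_NUM = 5            # number of learning passes per interval
EPS_START = 1.0          # initial scale of the exploration noise
EPS_EP_END = 300         # episode after which the noise scale stops decaying
EPS_FINAL = 0.0          # final noise scale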
class Agent():

    def __init__(self, state_size, action_size, random_seed, num_agents=1):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.num_agents = num_agents

        # Raw and target actor networks
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
        # Copy the weights of the raw network to the target network
        for target, local in zip(self.actor_target.parameters(),
                                 self.actor_local.parameters()):
            target.data.copy_(local.data)

        # Raw and target critic networks
        self.critic_local = Critic(state_size * num_agents, action_size * num_agents,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size * num_agents, action_size * num_agents,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)
        # Copy the weights of the raw network to the target network
        for target, local in zip(self.critic_target.parameters(),
                                 self.critic_local.parameters()):
            target.data.copy_(local.data)

        # Noise process
        self.noise = OrnUhlNoise(action_size, random_seed)

        # Replay memory; in MADDPG, the replay buffer is shared by all agents
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Shared memory: save experiences in replay memory, and use a random sample
        from the buffer to learn."""
        # Check that this is accurately implemented when training the MADDPG agent
        self.memory.add(state, action, reward, next_state, done)
        # Start learning once the buffer holds at least one batch
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, noise=0.0):
        """Use the current policy to output the next action."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if ADD_OU_NOISE:
            action += self.noise.sample() * noise
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Only used when training the DDPG agent, not for MADDPG.

        Updates the policy and value parameters using a given batch of experience tuples."""
        states, actions, rewards, next_states, dones = experiences

        # ################ Update critic ################################
        next_actions = self.actor_target(next_states)
        next_Q_targets = self.critic_target(next_states, next_actions)
        # Q targets for current states
        Q_targets = rewards + (gamma * next_Q_targets * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ############### Update actor ##################################
        # Compute the actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ############### Update target networks ######################
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDPG:

    def __init__(self, env, tau=1e-3, gamma=0.99, batch_size=64, depsilon=50000):
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        self.policy = Actor(self.num_states, self.num_actions).train()
        self.policy_target = Actor(self.num_states, self.num_actions).eval()
        self.hard_update(self.policy, self.policy_target)

        self.critic = Critic(self.num_states, self.num_actions).train()
        self.critic_target = Critic(self.num_states, self.num_actions).eval()
        self.hard_update(self.critic, self.critic_target)

        self.critic_loss = nn.MSELoss()
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.epsilon = 1.0
        self.depsilon = 1.0 / float(depsilon)

        self.opt_critic = torch.optim.Adam(self.critic.parameters(), lr=1e-3)
        self.opt_policy = torch.optim.Adam(self.policy.parameters(), lr=1e-4)

        self.policy.cuda()
        self.policy_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def train(self, buffer):
        b_state, b_action, b_reward, b_state_next, b_term = buffer.sample(self.batch_size)

        with torch.no_grad():
            action_target = self.policy_target(b_state_next)
            Q_prime = self.critic_target(b_state_next, action_target)

        # Critic update: regress Q(s, a) toward the one-step TD target
        self.opt_critic.zero_grad()
        Q = self.critic(b_state, b_action)
        L_critic = self.critic_loss(Q, b_reward + self.gamma * Q_prime * (1.0 - b_term))
        L_critic.backward()
        self.opt_critic.step()

        # Policy update: maximize the critic's value of the policy's actions
        self.opt_policy.zero_grad()
        action = self.policy(b_state)
        L_Q = -1.0 * self.critic(b_state, action).mean()
        L_Q.backward()
        self.opt_policy.step()

        self.soft_update(self.critic, self.critic_target)
        self.soft_update(self.policy, self.policy_target)

        return L_critic.item(), L_Q.item()

    def get_entropy(self, buffer, m=5, n=100):
        # b_state, b_action, b_reward, b_state_next, b_term = buffer.sample(n)
        b_angle = torch.rand(n) * np.pi * 2.0
        b_speed = 2.0 * (torch.rand(n) - 0.5) * 8.0
        b_state = torch.stack(
            [torch.cos(b_angle), torch.sin(b_angle), b_speed],
            dim=1).to(device='cuda', dtype=torch.float32)
        coef = torch.zeros(n, dtype=b_state.dtype, device=b_state.device)
        with torch.no_grad():
            action = self.policy(b_state)
        X, ind = torch.sort(action, dim=0)
        for i in range(n):
            if i < m:
                c = 1
                a = X[i + m]
                b = X[0]
            elif i >= m and i < n - m:
                c = 2
                a = X[i + m]
                b = X[i - m]
            else:
                c = 1
                a = X[n - 1]
                b = X[i - m]
            coef[i] = float(n) * float(c) / float(m) * (a - b + 1E-5)
        S = torch.log(coef).mean()
        return S.item()

    def get_value(self, state, action):
        with torch.no_grad():
            return self.critic(state, action).item()

    def select_action(self, state, random_process):
        with torch.no_grad():
            action = self.policy(state)
            noise = max(self.epsilon, 0.0) * random_process.sample()
            self.epsilon -= self.depsilon
            action += torch.from_numpy(noise).to(device=action.device, dtype=action.dtype)
            action = torch.clamp(action, -1, 1)
        return action

    def random_action(self):
        m = Uniform(torch.tensor([-1.0 for i in range(self.num_actions)]),
                    torch.tensor([1.0 for i in range(self.num_actions)]))
        return m.sample()

    def soft_update(self, src, dst):
        with torch.no_grad():
            for src_param, dst_param in zip(src.parameters(), dst.parameters()):
                dst_param.copy_(self.tau * src_param + (1.0 - self.tau) * dst_param)

    def hard_update(self, src, dst):
        with torch.no_grad():
            for src_param, dst_param in zip(src.parameters(), dst.parameters()):
                dst_param.copy_(src_param.clone())

    def load_weights(self, path):
        # The networks are stored as self.policy / self.critic in this class.
        self.policy.load_state_dict(torch.load('{}/actor.pkl'.format(path)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(path)))

    def save_model(self, path):
        torch.save(self.policy.state_dict(), '{}/actor.pkl'.format(path))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(path))
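# A minimal sketch of how the DDPG class above could be driven. The replay buffer and the
# Ornstein-Uhlenbeck random_process are assumptions about the surrounding code: the buffer is
# expected to return CUDA tensors from sample() and accept add(), and the noise process to
# return a NumPy array from sample(). This is not the repository's actual training loop.
import torch


def run_training(env, agent, buffer, random_process, num_warmup=1000, num_train=10000):
    state = torch.tensor(env.reset(), device='cuda', dtype=torch.float32)
    for step in range(num_warmup + num_train):
        if step < num_warmup:
            action = agent.random_action().to(device='cuda')     # uniform exploration
        else:
            action = agent.select_action(state, random_process)  # policy + decayed OU noise
        next_state, reward, done, _ = env.step(action.cpu().numpy())
        next_state = torch.tensor(next_state, device='cuda', dtype=torch.float32)
        buffer.add(state, action, reward, next_state, float(done))
        state = next_state
        if step >= num_warmup:
            critic_loss, policy_loss = agent.train(buffer)  # one gradient step per env step
        if done:
            state = torch.tensor(env.reset(), device='cuda', dtype=torch.float32)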
# This file records the NN controller parameters into a txt file to be used
# for Bernstein polynomial approximation by the ReachNN tool.
from Model import IndividualModel, Actor
import torch
import numpy as np

# NAME = 'direct_distill'
# trained_model = IndividualModel(state_size=3, action_size=1, seed=0, fc1_units=25)
# trained_model.load_state_dict(torch.load('./'+ NAME +'.pth'))
# trained_model.eval()

trained_model = Actor(state_size=3, action_size=1, seed=0, fc1_units=25)
trained_model.load_state_dict(torch.load("./actors/actor_0.43600.pth"))
trained_model.eval()

bias_list = []
weight_list = []
for name, param in trained_model.named_parameters():
    if 'bias' in name:
        bias_list.append(param.detach().cpu().numpy())
    if 'weight' in name:
        weight_list.append(param.detach().cpu().numpy())

print(len(weight_list), np.linalg.norm(weight_list[0]), np.linalg.norm(weight_list[1]))
# assert False

# Flatten the parameters layer by layer: for each neuron, its incoming weights followed by its bias
all_param = []
for i in range(len(bias_list)):
    for j in range(len(bias_list[i])):
        for k in range(weight_list[i].shape[1]):
            all_param.append(weight_list[i][j, k])
        all_param.append(bias_list[i][j])
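# The snippet above stops after building all_param, while its header comment says the goal is a
# txt file for ReachNN. A minimal sketch of that final step; the file name and the
# one-value-per-line layout are assumptions, not taken from the original script:
with open("nn_controller_params.txt", "w") as f:  # hypothetical output path
    for value in all_param:
        f.write("{}\n".format(value))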
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        self.hard_copy_weights(self.actor_target, self.actor_local)
        self.hard_copy_weights(self.critic_target, self.critic_local)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def hard_copy_weights(self, target, source):
        """Copy weights from the source to the target network (part of initialization)."""
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
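# The agent classes in this section call OUNoise(...).sample() / .reset(), but the noise class
# itself is not shown here. A minimal Ornstein-Uhlenbeck process sketch matching the interface
# used above; the Gaussian increment and the default mu/theta/sigma values are assumptions:
import copy
import random
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the internal state by one OU step and return it as the noise sample."""
        dx = self.theta * (self.mu - self.state) + \
            self.sigma * np.random.standard_normal(self.state.shape)
        self.state = self.state + dx
        return self.state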
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed, fc1_units, fc2_units,
                 weighted=False, individual=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON_MAX

        # Actor Network (w/ Target Network)
        if weighted:
            self.actor_local = Weight_adapter(state_size, action_size).to(device)
            self.actor_target = Weight_adapter(state_size, action_size).to(device)
        elif individual:
            self.actor_local = IndividualModel(state_size, action_size, random_seed,
                                               fc1_units).to(device)
            self.actor_target = IndividualModel(state_size, action_size, random_seed,
                                                fc1_units).to(device)
        else:
            self.actor_local = Actor(state_size, action_size, random_seed,
                                     fc1_units, fc2_units).to(device)
            self.actor_target = Actor(state_size, action_size, random_seed,
                                      fc1_units, fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, mu=0, theta=0.15, sigma=0.2)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        # Make sure the target networks start with the same weights as the source networks
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > LEARN_START:
            # Learn every UPDATE_EVERY time steps.
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step == 0:
                # Learn, if enough samples are available in memory
                if len(self.memory) > BATCH_SIZE:
                    for _ in range(UPDATES_PER_STEP):
                        experiences = self.memory.sample()
                        self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
            # print(action)
        self.actor_local.train()
        if add_noise:
            tem_noise = self.noise.sample()
            action += self.epsilon * tem_noise
            # print(tem_noise, np.clip(action, -1, 1))
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---------------------------- update noise ---------------------------- #
        if self.epsilon - EPSILON_DECAY > EPSILON_MIN:
            self.epsilon -= EPSILON_DECAY
        else:
            self.epsilon = EPSILON_MIN
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)
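# Every agent above constructs ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) and
# relies on add() / sample() / len(), but the buffer class itself does not appear in this
# section. A minimal uniform-replay sketch matching that interface; the deque storage, the
# namedtuple field names, and the float/uint8 conversions are assumptions:
import random
from collections import deque, namedtuple
import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append a new experience tuple to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw a random batch and return it as float tensors on the training device."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)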
num_warmup = 1000
num_train = 200000
num_eval = 0
buffer_length = 600000

# env = NormalizedEnv(gym.make('Pendulum-v0'))
GODOT_BIN_PATH = "InvPendulum/InvPendulum.x86_64"
env_abs_path = "InvPendulum/InvPendulum.pck"
env = NormalizedEnv(
    InvPendulumEnv(exec_path=GODOT_BIN_PATH, env_path=env_abs_path, render=True))

num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

policy = Actor(num_states, num_actions)
policy.load_state_dict(torch.load('./policy.pkl'))

state = env.reset()
state = state.to(dtype=torch.float32)

traced_policy = torch.jit.trace(policy, state)
print(traced_policy.graph)
print(traced_policy.code)
traced_policy.save('ddpg_policy.jit')

for step in range(1000):
    action = policy(state)
    # torch.tensor([1.0 for i in range(num_actions)])).sample().to(device='cuda')
    time.sleep(0.02)
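# The traced policy saved above ('ddpg_policy.jit') can later be reloaded without the Actor
# class definition, e.g. for deployment. A short usage sketch; the dummy input shape simply
# mirrors num_states from this script and is an assumption about the Actor's expected input:
import torch

deployed_policy = torch.jit.load('ddpg_policy.jit')
deployed_policy.eval()
with torch.no_grad():
    dummy_state = torch.zeros(num_states, dtype=torch.float32)
    print(deployed_policy(dummy_state))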