for epoch in range(args.num_epochs):
    start_time = timer()

    # Variables for recording statistics.
    average_critic_real_performance = 0.0       # C(x) - the critic wants this to be as big as possible for real images.
    average_critic_generated_performance = 0.0  # C(G(x)) - the critic wants this to be as small as possible for generated images.
    average_critic_loss = 0.0
    average_generator_loss = 0.0

    # Train: perform 'args.epoch_length' mini-batch updates per "epoch".
    for i in range(args.epoch_length):
        total_training_steps += 1

        # Train the critic (inner counter renamed to 'j' so it does not shadow the mini-batch counter 'i'):
        for j in range(args.num_critic_training_steps):
            critic_model.zero_grad()

            # Evaluate a mini-batch of real images.
            random_indexes = np.random.choice(len(images), args.mini_batch_size)
            real_images = torch.tensor(images[random_indexes], device=DEVICE)
            real_scores = critic_model(real_images)

            # Evaluate a mini-batch of generated images.
            random_latent_space_vectors = torch.randn(args.mini_batch_size, 512, device=DEVICE)
            generated_images = generator_model(random_latent_space_vectors)
            generated_scores = critic_model(generated_images.detach())
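# --- Sketch, not part of the original listing. The fragment above stops right after the
# critic has scored a generated mini-batch. The helper below shows one common way a
# Wasserstein critic loss with gradient penalty could be assembled from those scores;
# 'wgan_gp_critic_loss', 'penalty_weight', and the assumption of NCHW image batches are
# illustrative choices, not names or details taken from the original code (which might,
# for instance, use weight clipping instead of a gradient penalty).
import torch


def wgan_gp_critic_loss(critic_model, real_images, generated_images, penalty_weight=10.0):
    """Wasserstein critic loss with gradient penalty (assumed variant, for illustration)."""
    real_scores = critic_model(real_images)
    generated_scores = critic_model(generated_images.detach())

    # Interpolate between real and generated samples for the gradient-penalty term
    # (alpha is broadcast over NCHW image batches).
    alpha = torch.rand(real_images.size(0), 1, 1, 1, device=real_images.device)
    interpolated = (alpha * real_images + (1.0 - alpha) * generated_images).detach().requires_grad_(True)
    interpolated_scores = critic_model(interpolated)

    gradients = torch.autograd.grad(outputs=interpolated_scores.sum(),
                                    inputs=interpolated,
                                    create_graph=True)[0]
    gradient_penalty = ((gradients.flatten(1).norm(2, dim=1) - 1.0) ** 2).mean()

    # The critic maximizes C(x) - C(G(z)), i.e. minimizes the expression below.
    return generated_scores.mean() - real_scores.mean() + penalty_weight * gradient_penalty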
class Preyer:
    def __init__(self, s_dim, a_dim, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.device = 'cuda' if self.config.use_cuda else 'cpu'

        self.actor = Actor(s_dim, a_dim)
        self.actor_target = Actor(s_dim, a_dim)
        self.critic = Critic(s_dim, a_dim, 1)
        self.critic_target = Critic(s_dim, a_dim, 1)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.c_lr)
        self.c_loss = 0
        self.a_loss = 0

        if self.config.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor, self.actor_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(size=self.a_dim,
                                                       theta=self.config.ou_theta,
                                                       mu=self.config.ou_mu,
                                                       sigma=self.config.ou_sigma)
        self.replay_buffer = list()
        self.epsilon = 1.
        self.depsilon = self.epsilon / self.config.epsilon_decay

    def memory(self, s, a, r, s_, done):
        self.replay_buffer.append((s, a, r, s_, done))
        if len(self.replay_buffer) >= self.config.memory_length:
            self.replay_buffer.pop(0)

    def get_batches(self):
        experiences = random.sample(self.replay_buffer, self.config.batch_size)

        state_batches = np.array([_[0] for _ in experiences])
        action_batches = np.array([_[1] for _ in experiences])
        reward_batches = np.array([_[2] for _ in experiences])
        next_state_batches = np.array([_[3] for _ in experiences])
        done_batches = np.array([_[4] for _ in experiences])

        return state_batches, action_batches, reward_batches, next_state_batches, done_batches

    def choose_action(self, s, noisy=True):
        if self.config.use_cuda:
            s = Variable(torch.cuda.FloatTensor(s))
        else:
            s = Variable(torch.FloatTensor(s))

        a = self.actor.forward(s).cpu().detach().numpy()
        if noisy:
            a += max(self.epsilon, 0.001) * self.random_process.sample()
            self.epsilon -= self.depsilon
        a = np.clip(a, -1., 1.)

        return np.array([a])

    def random_action(self):
        action = np.random.uniform(low=-1., high=1., size=(1, self.a_dim))
        return action

    def reset(self):
        self.random_process.reset_states()

    def train(self):
        state_batches, action_batches, reward_batches, next_state_batches, done_batches = self.get_batches()

        state_batches = Variable(torch.Tensor(state_batches).to(self.device))
        action_batches = Variable(torch.Tensor(action_batches).reshape(-1, 1).to(self.device))
        reward_batches = Variable(torch.Tensor(reward_batches).reshape(-1, 1).to(self.device))
        next_state_batches = Variable(torch.Tensor(next_state_batches).to(self.device))
        done_batches = Variable(torch.Tensor((done_batches == False) * 1).reshape(-1, 1).to(self.device))

        target_next_actions = self.actor_target.forward(next_state_batches).detach()
        target_next_q = self.critic_target.forward(next_state_batches, target_next_actions).detach()
        main_q = self.critic(state_batches, action_batches)

        # Critic Loss
        self.critic.zero_grad()
        baselines = reward_batches + done_batches * self.config.gamma * target_next_q
        loss_critic = torch.nn.MSELoss()(main_q, baselines)
        loss_critic.backward()
        self.critic_optimizer.step()

        # Actor Loss
        self.actor.zero_grad()
        clear_action_batches = self.actor.forward(state_batches)
        loss_actor = (-self.critic.forward(state_batches, clear_action_batches)).mean()
        loss_actor.backward()
        self.actor_optimizer.step()

        # This is for logging
        self.c_loss = loss_critic.item()
        self.a_loss = loss_actor.item()

        soft_update(self.actor, self.actor_target, self.config.tau)
        soft_update(self.critic, self.critic_target, self.config.tau)

    def getLoss(self):
        return self.c_loss, self.a_loss
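# --- Sketch, not part of the original listing. Both Preyer above and BiCNet below call
# 'hard_update' and 'soft_update', which are not defined in this section. A minimal
# version, assuming the call order hard_update(online, target) / soft_update(online,
# target, tau) used in those classes and torch.nn.Module arguments, could look like this:
def hard_update(source, target):
    # Copy the online network's parameters straight into the target network.
    for source_param, target_param in zip(source.parameters(), target.parameters()):
        target_param.data.copy_(source_param.data)


def soft_update(source, target, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    for source_param, target_param in zip(source.parameters(), target.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)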
class BiCNet():
    def __init__(self, s_dim, a_dim, n_agents, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.n_agents = n_agents
        self.device = 'cuda' if self.config.use_cuda else 'cpu'

        # Networks
        self.policy = Actor(s_dim, a_dim, n_agents)
        self.policy_target = Actor(s_dim, a_dim, n_agents)
        self.critic = Critic(s_dim, a_dim, n_agents)
        self.critic_target = Critic(s_dim, a_dim, n_agents)

        if self.config.use_cuda:
            self.policy.cuda()
            self.policy_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.c_lr)

        hard_update(self.policy, self.policy_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(size=self.a_dim,
                                                       theta=self.config.ou_theta,
                                                       mu=self.config.ou_mu,
                                                       sigma=self.config.ou_sigma)
        self.replay_buffer = list()
        self.epsilon = 1.
        self.depsilon = self.epsilon / self.config.epsilon_decay

        self.c_loss = None
        self.a_loss = None
        self.action_log = list()

    def choose_action(self, obs, noisy=True):
        obs = torch.Tensor([obs]).to(self.device)

        action = self.policy(obs).cpu().detach().numpy()[0]
        self.action_log.append(action)

        if noisy:
            for agent_idx in range(self.n_agents):
                pass
                # action[agent_idx] += self.epsilon * self.random_process.sample()
            self.epsilon -= self.depsilon
            self.epsilon = max(self.epsilon, 0.001)

        # np.clip is not in-place; the result has to be assigned back to 'action'.
        action = np.clip(action, -1., 1.)

        return action

    def reset(self):
        self.random_process.reset_states()
        self.action_log.clear()

    def prep_train(self):
        self.policy.train()
        self.critic.train()
        self.policy_target.train()
        self.critic_target.train()

    def prep_eval(self):
        self.policy.eval()
        self.critic.eval()
        self.policy_target.eval()
        self.critic_target.eval()

    def random_action(self):
        return np.random.uniform(low=-1, high=1, size=(self.n_agents, 2))

    def memory(self, s, a, r, s_, done):
        self.replay_buffer.append((s, a, r, s_, done))
        if len(self.replay_buffer) >= self.config.memory_length:
            self.replay_buffer.pop(0)

    def get_batches(self):
        experiences = random.sample(self.replay_buffer, self.config.batch_size)

        state_batches = np.array([_[0] for _ in experiences])
        action_batches = np.array([_[1] for _ in experiences])
        reward_batches = np.array([_[2] for _ in experiences])
        next_state_batches = np.array([_[3] for _ in experiences])
        done_batches = np.array([_[4] for _ in experiences])

        return state_batches, action_batches, reward_batches, next_state_batches, done_batches

    def train(self):
        state_batches, action_batches, reward_batches, next_state_batches, done_batches = self.get_batches()

        state_batches = torch.Tensor(state_batches).to(self.device)
        action_batches = torch.Tensor(action_batches).to(self.device)
        reward_batches = torch.Tensor(reward_batches).reshape(self.config.batch_size, self.n_agents, 1).to(self.device)
        next_state_batches = torch.Tensor(next_state_batches).to(self.device)
        done_batches = torch.Tensor((done_batches == False) * 1).reshape(self.config.batch_size, self.n_agents, 1).to(self.device)

        target_next_actions = self.policy_target.forward(next_state_batches)
        target_next_q = self.critic_target.forward(next_state_batches, target_next_actions)
        main_q = self.critic(state_batches, action_batches)

        '''
        How to concat each agent's Q value?
        '''
        # target_next_q = target_next_q
        # main_q = main_q.mean(dim=1)

        '''
        Reward Norm
        '''
        # reward_batches = (reward_batches - reward_batches.mean(dim=0)) / reward_batches.std(dim=0) / 1024

        # Critic Loss
        self.critic.zero_grad()
        baselines = reward_batches + done_batches * self.config.gamma * target_next_q
        loss_critic = torch.nn.MSELoss()(main_q, baselines.detach())
        loss_critic.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
        self.critic_optimizer.step()

        # Actor Loss
        self.policy.zero_grad()
        clear_action_batches = self.policy.forward(state_batches)
        loss_actor = -self.critic.forward(state_batches, clear_action_batches).mean()
        loss_actor += (clear_action_batches ** 2).mean() * 1e-3
        loss_actor.backward()
        torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5)
        self.policy_optimizer.step()

        # This is for logging
        self.c_loss = loss_critic.item()
        self.a_loss = loss_actor.item()

        soft_update(self.policy, self.policy_target, self.config.tau)
        soft_update(self.critic, self.critic_target, self.config.tau)

    def get_loss(self):
        return self.c_loss, self.a_loss

    def get_action_std(self):
        return np.array(self.action_log).std(axis=-1).mean()
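# --- Sketch, not part of the original listing. Preyer and BiCNet both rely on an
# 'OrnsteinUhlenbeckProcess' for exploration noise that is not shown in this section.
# Below is a common minimal implementation matching the constructor arguments used above
# (size, theta, mu, sigma); the 'dt' step size and resetting to the mean are assumptions.
import numpy as np


class OrnsteinUhlenbeckProcess:
    # Temporally correlated noise: x_{t+1} = x_t + theta*(mu - x_t)*dt + sigma*sqrt(dt)*N(0, 1).

    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2):
        self.size = size
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.reset_states()

    def reset_states(self):
        # Restart the process from its mean at the start of each episode.
        self.x_prev = np.ones(self.size) * self.mu

    def sample(self):
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size))
        self.x_prev = x
        return x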
class Agent:
    def __init__(self, replay_buffer, noise, state_dim, action_dim, seed,
                 fc1_units=256, fc2_units=128, device="cpu",
                 lr_actor=1e-4, lr_critic=1e-3, batch_size=128,
                 discount=0.99, tau=1e-3):
        torch.manual_seed(seed)

        self.actor_local = Actor(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
        self.critic_local = Critic(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
        self.actor_optimizer = optim.Adam(params=self.actor_local.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(params=self.critic_local.parameters(), lr=lr_critic)

        self.actor_target = Actor(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
        self.critic_target = Critic(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)

        self.buffer = replay_buffer
        self.noise = noise
        self.device = device
        self.batch_size = batch_size
        self.discount = discount
        self.tau = tau

        Agent.hard_update(model_local=self.actor_local, model_target=self.actor_target)
        Agent.hard_update(model_local=self.critic_local, model_target=self.critic_target)

    def step(self, states, actions, rewards, next_states, dones):
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.buffer.add(state=state, action=action, reward=reward, next_state=next_state, done=done)

        if self.buffer.size() >= self.batch_size:
            experiences = self.buffer.sample(self.batch_size)
            self.learn(self.to_tensor(experiences))

    def to_tensor(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        states = torch.from_numpy(states).float().to(self.device)
        actions = torch.from_numpy(actions).float().to(self.device)
        rewards = torch.from_numpy(rewards).float().to(self.device)
        next_states = torch.from_numpy(next_states).float().to(self.device)
        dones = torch.from_numpy(dones.astype(np.uint8)).float().to(self.device)
        return states, actions, rewards, next_states, dones

    def act(self, states, add_noise=True):
        states = torch.from_numpy(states).float().to(device=self.device)
        self.actor_local.eval()
        with torch.no_grad():
            # .cpu() so this also works when the agent runs on a CUDA device.
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # Update critic
        next_actions = self.actor_target(next_states)
        q_target_next = self.critic_target(next_states, next_actions)
        q_target = rewards + self.discount * q_target_next * (1.0 - dones)
        q_local = self.critic_local(states, actions)
        critic_loss = F.mse_loss(input=q_local, target=q_target)
        self.critic_local.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update actor
        actor_objective = self.critic_local(states, self.actor_local(states)).mean()
        self.actor_local.zero_grad()
        (-actor_objective).backward()
        self.actor_optimizer.step()

        Agent.soft_update(model_local=self.critic_local, model_target=self.critic_target, tau=self.tau)
        Agent.soft_update(model_local=self.actor_local, model_target=self.actor_target, tau=self.tau)

    @staticmethod
    def soft_update(model_local, model_target, tau):
        for local_param, target_param in zip(model_local.parameters(), model_target.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    @staticmethod
    def hard_update(model_local, model_target):
        Agent.soft_update(model_local=model_local, model_target=model_target, tau=1.0)

    def reset(self):
        self.noise.reset()
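# --- Sketch, not part of the original listing. The Agent above expects a replay buffer
# exposing add(state=..., action=..., reward=..., next_state=..., done=...), size(), and
# sample(batch_size) returning a tuple of numpy arrays (see to_tensor). A minimal
# deque-backed implementation satisfying that assumed interface could look like this;
# the capacity default is an arbitrary illustrative value.
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, capacity=int(1e6), seed=0):
        self.memory = deque(maxlen=capacity)
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def size(self):
        return len(self.memory)

    def sample(self, batch_size):
        # Uniform sampling; each field is stacked into a (batch_size, ...) numpy array.
        experiences = random.sample(self.memory, k=batch_size)
        states = np.vstack([e[0] for e in experiences])
        actions = np.vstack([e[1] for e in experiences])
        rewards = np.vstack([e[2] for e in experiences])
        next_states = np.vstack([e[3] for e in experiences])
        dones = np.vstack([e[4] for e in experiences])
        return states, actions, rewards, next_states, dones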