class SAC(nn.Module):
    def __init__(self, writer, device, state_dim, action_dim, args):
        super(SAC, self).__init__()
        self.args = args
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim,
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)
        self.q_1 = Critic(self.args.layer_num, state_dim + action_dim, 1, self.args.hidden_dim,
                          self.args.activation_function, self.args.last_activation)
        self.q_2 = Critic(self.args.layer_num, state_dim + action_dim, 1, self.args.hidden_dim,
                          self.args.activation_function, self.args.last_activation)
        self.target_q_1 = Critic(self.args.layer_num, state_dim + action_dim, 1, self.args.hidden_dim,
                                 self.args.activation_function, self.args.last_activation)
        self.target_q_2 = Critic(self.args.layer_num, state_dim + action_dim, 1, self.args.hidden_dim,
                                 self.args.activation_function, self.args.last_activation)
        # Hard-copy the online critics into the targets before training starts.
        self.soft_update(self.q_1, self.target_q_1, 1.)
        self.soft_update(self.q_2, self.target_q_2, 1.)
        self.alpha = nn.Parameter(torch.tensor(self.args.alpha_init))
        self.data = ReplayBuffer(action_prob_exist=False, max_size=int(self.args.memory_size),
                                 state_dim=state_dim, num_action=action_dim)
        self.target_entropy = -torch.tensor(action_dim)
        self.q_1_optimizer = optim.Adam(self.q_1.parameters(), lr=self.args.q_lr)
        self.q_2_optimizer = optim.Adam(self.q_2.parameters(), lr=self.args.q_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.args.actor_lr)
        self.alpha_optimizer = optim.Adam([self.alpha], lr=self.args.alpha_lr)
        self.device = device
        self.writer = writer

    def put_data(self, transition):
        self.data.put_data(transition)

    def soft_update(self, network, target_network, rate):
        # target <- (1 - rate) * target + rate * network
        for network_params, target_network_params in zip(network.parameters(), target_network.parameters()):
            target_network_params.data.copy_(target_network_params.data * (1.0 - rate) + network_params.data * rate)

    def get_action(self, state):
        # Sample a squashed-Gaussian action and its log-probability (tanh change-of-variables correction).
        mu, std = self.actor(state)
        dist = Normal(mu, std)
        u = dist.rsample()
        u_log_prob = dist.log_prob(u)
        a = torch.tanh(u)
        a_log_prob = u_log_prob - torch.log(1 - torch.square(a) + 1e-3)
        return a, a_log_prob.sum(-1, keepdim=True)

    def q_update(self, Q, q_optimizer, states, actions, rewards, next_states, dones):
        # Target: r + gamma * (min_i target_Q_i(s', a') - alpha * log pi(a'|s'))
        with torch.no_grad():
            next_actions, next_action_log_prob = self.get_action(next_states)
            q_1 = self.target_q_1(next_states, next_actions)
            q_2 = self.target_q_2(next_states, next_actions)
            q = torch.min(q_1, q_2)
            v = (1 - dones) * (q - self.alpha * next_action_log_prob)
            targets = rewards + self.args.gamma * v
        q = Q(states, actions)
        loss = F.smooth_l1_loss(q, targets)
        q_optimizer.zero_grad()
        loss.backward()
        q_optimizer.step()
        return loss

    def actor_update(self, states):
        now_actions, now_action_log_prob = self.get_action(states)
        q_1 = self.q_1(states, now_actions)
        q_2 = self.q_2(states, now_actions)
        q = torch.min(q_1, q_2)
        loss = (self.alpha.detach() * now_action_log_prob - q).mean()
        self.actor_optimizer.zero_grad()
        loss.backward()
        self.actor_optimizer.step()
        return loss, now_action_log_prob

    def alpha_update(self, now_action_log_prob):
        loss = (-self.alpha * (now_action_log_prob + self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        loss.backward()
        self.alpha_optimizer.step()
        return loss

    def train_net(self, batch_size, n_epi):
        data = self.data.sample(shuffle=True, batch_size=batch_size)
        states, actions, rewards, next_states, dones = convert_to_tensor(
            self.device, data['state'], data['action'], data['reward'], data['next_state'], data['done'])

        # Q update
        q_1_loss = self.q_update(self.q_1, self.q_1_optimizer, states, actions, rewards, next_states, dones)
        q_2_loss = self.q_update(self.q_2, self.q_2_optimizer, states, actions, rewards, next_states, dones)

        # Actor update
        actor_loss, prob = self.actor_update(states)

        # Alpha (temperature) update
        alpha_loss = self.alpha_update(prob)

        self.soft_update(self.q_1, self.target_q_1, self.args.soft_update_rate)
        self.soft_update(self.q_2, self.target_q_2, self.args.soft_update_rate)

        if self.writer is not None:
            self.writer.add_scalar("loss/q_1", q_1_loss, n_epi)
            self.writer.add_scalar("loss/q_2", q_2_loss, n_epi)
            self.writer.add_scalar("loss/actor", actor_loss, n_epi)
            self.writer.add_scalar("loss/alpha", alpha_loss, n_epi)
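# ---------------------------------------------------------------------------
# Illustration (added for reference, not part of the original source).
# SAC.get_action() above samples u ~ N(mu, std), returns a = tanh(u), and corrects the
# log-density with log pi(a) = log N(u) - log(1 - tanh(u)^2); the 1e-3 term is a numerical
# stabilizer. The standalone sketch below reproduces that change-of-variables correction on
# toy values so it can be inspected in isolation; it does not touch the SAC class.
def _tanh_logprob_demo():
    import torch
    from torch.distributions import Normal

    mu, std = torch.zeros(3), torch.ones(3)
    dist = Normal(mu, std)
    u = dist.rsample()                       # pre-squash Gaussian sample
    a = torch.tanh(u)                        # squashed action in (-1, 1)
    log_prob = dist.log_prob(u) - torch.log(1 - a.pow(2) + 1e-3)
    return a, log_prob.sum(-1, keepdim=True)  # same shape convention as SAC.get_action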
class DDPG(nn.Module):
    def __init__(self, writer, device, state_dim, action_dim, args, noise):
        super(DDPG, self).__init__()
        self.device = device
        self.writer = writer
        self.args = args
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim,
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)
        self.target_actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim,
                                  self.args.activation_function, self.args.last_activation, self.args.trainable_std)
        self.q = Critic(self.args.layer_num, state_dim + action_dim, 1, self.args.hidden_dim,
                        self.args.activation_function, None)
        self.target_q = Critic(self.args.layer_num, state_dim + action_dim, 1, self.args.hidden_dim,
                               self.args.activation_function, None)
        # Hard-copy the online networks into the targets before training starts.
        self.soft_update(self.q, self.target_q, 1.)
        self.soft_update(self.actor, self.target_actor, 1.)
        self.q_optimizer = optim.Adam(self.q.parameters(), lr=self.args.q_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.args.actor_lr)
        self.data = ReplayBuffer(action_prob_exist=False, max_size=int(self.args.memory_size),
                                 state_dim=state_dim, num_action=action_dim)
        self.noise = noise

    def soft_update(self, network, target_network, rate):
        # target <- (1 - rate) * target + rate * network
        for network_params, target_network_params in zip(network.parameters(), target_network.parameters()):
            target_network_params.data.copy_(target_network_params.data * (1.0 - rate) + network_params.data * rate)

    def get_action(self, x):
        # Deterministic action plus exploration noise; the second return value is the actor's std head.
        mu, std = self.actor(x)
        return mu + torch.tensor(self.noise.sample()).to(self.device), std

    def put_data(self, transition):
        self.data.put_data(transition)

    def train_net(self, batch_size, n_epi):
        data = self.data.sample(shuffle=True, batch_size=batch_size)
        states, actions, rewards, next_states, dones = convert_to_tensor(
            self.device, data['state'], data['action'], data['reward'], data['next_state'], data['done'])

        # Critic update: target is r + gamma * (1 - done) * Q'(s', pi'(s'))
        targets = rewards + self.args.gamma * (1 - dones) * self.target_q(next_states, self.target_actor(next_states)[0])
        q_loss = F.smooth_l1_loss(self.q(states, actions), targets.detach())
        self.q_optimizer.zero_grad()
        q_loss.backward()
        self.q_optimizer.step()

        # Actor update: maximize Q(s, pi(s))
        actor_loss = -self.q(states, self.actor(states)[0]).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.q, self.target_q, self.args.soft_update_rate)
        self.soft_update(self.actor, self.target_actor, self.args.soft_update_rate)

        if self.writer is not None:
            self.writer.add_scalar("loss/q", q_loss, n_epi)
            self.writer.add_scalar("loss/actor", actor_loss, n_epi)
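# ---------------------------------------------------------------------------
# Sketch (added for reference, not part of the original source).
# DDPG.get_action() only assumes that `noise.sample()` returns an array-like perturbation of
# shape (action_dim,). The repository's own noise class is not shown here; a common choice is
# Ornstein-Uhlenbeck noise, and the minimal, assumed implementation below is compatible with
# how `self.noise` is used above.
import numpy as np


class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu = mu * np.ones(action_dim)
        self.theta, self.sigma, self.dt = theta, sigma, dt
        self.x = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I); returns the updated state x.
        dx = self.theta * (self.mu - self.x) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.x.shape)
        self.x = self.x + dx
        return self.x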
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epoch', type=int, default=0, help='starting epoch')
    parser.add_argument('--n_epochs', type=int, default=400, help='number of epochs of training')
    parser.add_argument('--batchSize', type=int, default=10, help='size of the batches')
    parser.add_argument('--dataroot', type=str, default='datasets/genderchange/', help='root directory of the dataset')
    parser.add_argument('--lr', type=float, default=0.0002, help='initial learning rate')
    parser.add_argument('--decay_epoch', type=int, default=100,
                        help='epoch to start linearly decaying the learning rate to 0')
    parser.add_argument('--size', type=int, default=256, help='size of the data crop (squared assumed)')
    parser.add_argument('--input_nc', type=int, default=3, help='number of channels of input data')
    parser.add_argument('--output_nc', type=int, default=3, help='number of channels of output data')
    parser.add_argument('--cuda', action='store_true', help='use GPU computation')
    parser.add_argument('--n_cpu', type=int, default=8, help='number of cpu threads to use during batch generation')
    opt = parser.parse_args()
    print(opt)

    if torch.cuda.is_available() and not opt.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

    ###### Definition of variables ######
    # Networks
    netG_A2B = Generator(opt.input_nc, opt.output_nc)
    netG_B2A = Generator(opt.output_nc, opt.input_nc)
    netD_A = Discriminator(opt.input_nc)
    netD_B = Discriminator(opt.output_nc)

    if opt.cuda:
        netG_A2B.cuda()
        netG_B2A.cuda()
        netD_A.cuda()
        netD_B.cuda()

    netG_A2B.apply(weights_init_normal)
    netG_B2A.apply(weights_init_normal)
    netD_A.apply(weights_init_normal)
    netD_B.apply(weights_init_normal)

    # Losses
    criterion_GAN = torch.nn.MSELoss()
    criterion_cycle = torch.nn.L1Loss()
    criterion_identity = torch.nn.L1Loss()

    # Optimizers & LR schedulers
    optimizer_G = torch.optim.Adam(itertools.chain(netG_A2B.parameters(), netG_B2A.parameters()),
                                   lr=opt.lr, betas=(0.5, 0.999))
    optimizer_D_A = torch.optim.Adam(netD_A.parameters(), lr=opt.lr, betas=(0.5, 0.999))
    optimizer_D_B = torch.optim.Adam(netD_B.parameters(), lr=opt.lr, betas=(0.5, 0.999))

    lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR(
        optimizer_G, lr_lambda=LambdaLR(opt.n_epochs, opt.epoch, opt.decay_epoch).step)
    lr_scheduler_D_A = torch.optim.lr_scheduler.LambdaLR(
        optimizer_D_A, lr_lambda=LambdaLR(opt.n_epochs, opt.epoch, opt.decay_epoch).step)
    lr_scheduler_D_B = torch.optim.lr_scheduler.LambdaLR(
        optimizer_D_B, lr_lambda=LambdaLR(opt.n_epochs, opt.epoch, opt.decay_epoch).step)

    # Inputs & targets memory allocation
    Tensor = torch.cuda.FloatTensor if opt.cuda else torch.Tensor
    input_A = Tensor(opt.batchSize, opt.input_nc, opt.size, opt.size)
    input_B = Tensor(opt.batchSize, opt.output_nc, opt.size, opt.size)
    target_real = Variable(Tensor(opt.batchSize).fill_(1.0), requires_grad=False)
    target_fake = Variable(Tensor(opt.batchSize).fill_(0.0), requires_grad=False)

    fake_A_buffer = ReplayBuffer()
    fake_B_buffer = ReplayBuffer()

    # Dataset loader
    transforms_ = [transforms.Resize(int(opt.size * 1.2), Image.BICUBIC),
                   transforms.CenterCrop(opt.size),
                   transforms.ToTensor(),
                   transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    dataloader = DataLoader(ImageDataset(opt.dataroot, transforms_=transforms_, unaligned=True),
                            batch_size=opt.batchSize, shuffle=True, num_workers=opt.n_cpu, drop_last=True)

    # Plot Loss and Images in Tensorboard
    experiment_dir = 'logs/{}@{}'.format(opt.dataroot.split('/')[1],
                                         datetime.now().strftime("%d.%m.%Y-%H:%M:%S"))
    os.makedirs(experiment_dir, exist_ok=True)
    writer = SummaryWriter(os.path.join(experiment_dir, "tb"))
    metric_dict = defaultdict(list)
    n_iters_total = 0
    ###################################

    ###### Training ######
    for epoch in range(opt.epoch, opt.n_epochs):
        for i, batch in enumerate(dataloader):
            # Set model input
            real_A = Variable(input_A.copy_(batch['A']))
            real_B = Variable(input_B.copy_(batch['B']))

            ###### Generators A2B and B2A ######
            optimizer_G.zero_grad()

            # Identity loss
            # G_A2B(B) should equal B if real B is fed
            same_B = netG_A2B(real_B)
            loss_identity_B = criterion_identity(same_B, real_B) * 5.0  # [batchSize, 3, ImgSize, ImgSize]
            # G_B2A(A) should equal A if real A is fed
            same_A = netG_B2A(real_A)
            loss_identity_A = criterion_identity(same_A, real_A) * 5.0  # [batchSize, 3, ImgSize, ImgSize]

            # GAN loss
            fake_B = netG_A2B(real_A)
            pred_fake = netD_B(fake_B).view(-1)
            loss_GAN_A2B = criterion_GAN(pred_fake, target_real)  # [batchSize]

            fake_A = netG_B2A(real_B)
            pred_fake = netD_A(fake_A).view(-1)
            loss_GAN_B2A = criterion_GAN(pred_fake, target_real)  # [batchSize]

            # Cycle loss
            recovered_A = netG_B2A(fake_B)
            loss_cycle_ABA = criterion_cycle(recovered_A, real_A) * 10.0  # [batchSize, 3, ImgSize, ImgSize]

            recovered_B = netG_A2B(fake_A)
            loss_cycle_BAB = criterion_cycle(recovered_B, real_B) * 10.0  # [batchSize, 3, ImgSize, ImgSize]

            # Total loss
            loss_G = loss_identity_A + loss_identity_B + loss_GAN_A2B + loss_GAN_B2A + loss_cycle_ABA + loss_cycle_BAB
            loss_G.backward()
            optimizer_G.step()
            ###################################

            ###### Discriminator A ######
            optimizer_D_A.zero_grad()

            # Real loss
            pred_real = netD_A(real_A).view(-1)
            loss_D_real = criterion_GAN(pred_real, target_real)  # [batchSize]

            # Fake loss
            fake_A = fake_A_buffer.push_and_pop(fake_A)
            pred_fake = netD_A(fake_A.detach()).view(-1)
            loss_D_fake = criterion_GAN(pred_fake, target_fake)  # [batchSize]

            # Total loss
            loss_D_A = (loss_D_real + loss_D_fake) * 0.5
            loss_D_A.backward()
            optimizer_D_A.step()
            ###################################

            ###### Discriminator B ######
            optimizer_D_B.zero_grad()

            # Real loss
            pred_real = netD_B(real_B).view(-1)
            loss_D_real = criterion_GAN(pred_real, target_real)  # [batchSize]

            # Fake loss
            fake_B = fake_B_buffer.push_and_pop(fake_B)
            pred_fake = netD_B(fake_B.detach()).view(-1)
            loss_D_fake = criterion_GAN(pred_fake, target_fake)  # [batchSize]

            # Total loss
            loss_D_B = (loss_D_real + loss_D_fake) * 0.5
            loss_D_B.backward()
            optimizer_D_B.step()
            ###################################

            metric_dict['loss_G'].append(loss_G.item())
            metric_dict['loss_G_identity'].append(loss_identity_A.item() + loss_identity_B.item())
            metric_dict['loss_G_GAN'].append(loss_GAN_A2B.item() + loss_GAN_B2A.item())
            metric_dict['loss_G_cycle'].append(loss_cycle_ABA.item() + loss_cycle_BAB.item())
            metric_dict['loss_D'].append(loss_D_A.item() + loss_D_B.item())

            for title, value in metric_dict.items():
                writer.add_scalar('train/{}'.format(title), value[-1], n_iters_total)

            n_iters_total += 1

        print("""
        -----------------------------------------------------------
        Epoch : {} Finished
        Loss_G : {}
        Loss_G_identity : {}
        Loss_G_GAN : {}
        Loss_G_cycle : {}
        Loss_D : {}
        -----------------------------------------------------------
        """.format(epoch, loss_G, loss_identity_A + loss_identity_B, loss_GAN_A2B + loss_GAN_B2A,
                   loss_cycle_ABA + loss_cycle_BAB, loss_D_A + loss_D_B))

        # Update learning rates
        lr_scheduler_G.step()
        lr_scheduler_D_A.step()
        lr_scheduler_D_B.step()

        # Save models checkpoints
        if loss_G.item() < 2.5:
            os.makedirs(os.path.join(experiment_dir, str(epoch)), exist_ok=True)
            torch.save(netG_A2B.state_dict(), '{}/{}/netG_A2B.pth'.format(experiment_dir, epoch))
            torch.save(netG_B2A.state_dict(), '{}/{}/netG_B2A.pth'.format(experiment_dir, epoch))
            torch.save(netD_A.state_dict(), '{}/{}/netD_A.pth'.format(experiment_dir, epoch))
            torch.save(netD_B.state_dict(), '{}/{}/netD_B.pth'.format(experiment_dir, epoch))
        elif epoch > 100 and epoch % 40 == 0:
            os.makedirs(os.path.join(experiment_dir, str(epoch)), exist_ok=True)
            torch.save(netG_A2B.state_dict(), '{}/{}/netG_A2B.pth'.format(experiment_dir, epoch))
            torch.save(netG_B2A.state_dict(), '{}/{}/netG_B2A.pth'.format(experiment_dir, epoch))
            torch.save(netD_A.state_dict(), '{}/{}/netD_A.pth'.format(experiment_dir, epoch))
            torch.save(netD_B.state_dict(), '{}/{}/netD_B.pth'.format(experiment_dir, epoch))

        for title, value in metric_dict.items():
            writer.add_scalar("train/{}_epoch".format(title), np.mean(value), epoch)
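# ---------------------------------------------------------------------------
# Sketch (added for reference, not part of the original source).
# The schedulers above only assume that LambdaLR(n_epochs, offset, decay_start_epoch).step(epoch)
# returns a multiplicative learning-rate factor. The project's own helper is not shown here; the
# assumed version below keeps the factor at 1.0 until decay_start_epoch and then decays it
# linearly to 0.0 at n_epochs, which is consistent with how it is called.
class LambdaLR:
    def __init__(self, n_epochs, offset, decay_start_epoch):
        assert (n_epochs - decay_start_epoch) > 0, "decay must start before training ends"
        self.n_epochs = n_epochs
        self.offset = offset
        self.decay_start_epoch = decay_start_epoch

    def step(self, epoch):
        # Factor is 1.0 before decay_start_epoch, then falls linearly to 0.0 at n_epochs.
        return 1.0 - max(0, epoch + self.offset - self.decay_start_epoch) / (self.n_epochs - self.decay_start_epoch)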
class Agent(nn.Module):
    def __init__(self, algorithm, writer, device, state_dim, action_dim, args, demonstrations_location_args):
        super(Agent, self).__init__()
        self.writer = writer
        self.device = device
        self.args = args
        if self.args.on_policy:
            self.data = ReplayBuffer(action_prob_exist=True, max_size=self.args.traj_length,
                                     state_dim=state_dim, num_action=action_dim)
        else:
            self.data = ReplayBuffer(action_prob_exist=False, max_size=int(self.args.memory_size),
                                     state_dim=state_dim, num_action=action_dim)

        # Each demonstration file stores `file_size` arrays back to back; load and concatenate them.
        file_size = 120
        with open(demonstrations_location_args.expert_state_location, 'rb') as f:
            self.expert_states = torch.tensor(np.concatenate([np.load(f) for _ in range(file_size)])).float()
        with open(demonstrations_location_args.expert_action_location, 'rb') as f:
            self.expert_actions = torch.tensor(np.concatenate([np.load(f) for _ in range(file_size)]))
        with open(demonstrations_location_args.expert_next_state_location, 'rb') as f:
            self.expert_next_states = torch.tensor(np.concatenate([np.load(f) for _ in range(file_size)])).float()
        with open(demonstrations_location_args.expert_done_location, 'rb') as f:
            self.expert_dones = torch.tensor(
                np.concatenate([np.load(f) for _ in range(file_size)])).float().unsqueeze(-1)

        self.brain = algorithm

    def get_action(self, x):
        action, log_prob = self.brain.get_action(x)
        return action, log_prob

    def put_data(self, transition):
        self.data.put_data(transition)

    def train(self, discriminator, discriminator_batch_size, state_rms, n_epi, batch_size=64):
        if self.args.on_policy:
            data = self.data.sample(shuffle=False)
            states, actions, rewards, next_states, done_masks, old_log_probs = convert_to_tensor(
                self.device, data['state'], data['action'], data['reward'],
                data['next_state'], data['done'], data['log_prob'])
        else:
            data = self.data.sample(shuffle=True, batch_size=discriminator_batch_size)
            states, actions, rewards, next_states, done_masks = convert_to_tensor(
                self.device, data['state'], data['action'], data['reward'], data['next_state'], data['done'])

        if discriminator.name() == 'sqil':
            agent_s, agent_a, agent_next_s, agent_done_mask = make_one_mini_batch(
                batch_size, states, actions, next_states, done_masks)
            expert_s, expert_a, expert_next_s, expert_done = make_one_mini_batch(
                batch_size, self.expert_states, self.expert_actions, self.expert_next_states, self.expert_dones)
            expert_done_mask = (1 - expert_done.float())
            discriminator.train_network(self.brain, n_epi, agent_s, agent_a, agent_next_s, agent_done_mask,
                                        expert_s, expert_a, expert_next_s, expert_done_mask)
            return

        if not discriminator.args.is_airl:
            agent_s, agent_a = make_one_mini_batch(discriminator_batch_size, states, actions)
            expert_s, expert_a = make_one_mini_batch(discriminator_batch_size,
                                                     self.expert_states, self.expert_actions)
            if self.args.on_policy:
                # Normalize expert states with the same running statistics used for the agent's observations.
                expert_s = np.clip((expert_s - state_rms.mean) / (state_rms.var**0.5 + 1e-8), -5, 5)
            discriminator.train_network(self.writer, n_epi, agent_s, agent_a, expert_s, expert_a)
        else:
            agent_s, agent_a, agent_next_s, agent_done_mask = make_one_mini_batch(
                discriminator_batch_size, states, actions, next_states, done_masks)
            expert_s, expert_a, expert_next_s, expert_done = make_one_mini_batch(
                discriminator_batch_size, self.expert_states, self.expert_actions,
                self.expert_next_states, self.expert_dones)
            expert_done_mask = (1 - expert_done.float())
            if self.args.on_policy:
                expert_s = np.clip((expert_s - state_rms.mean) / (state_rms.var**0.5 + 1e-8), -5, 5).float()
                expert_next_s = np.clip((expert_next_s - state_rms.mean) / (state_rms.var**0.5 + 1e-8), -5, 5).float()

            # AIRL needs log pi(a|s) for both agent and expert transitions.
            mu, sigma = self.brain.get_dist(agent_s.float().to(self.device))
            dist = torch.distributions.Normal(mu, sigma)
            agent_log_prob = dist.log_prob(agent_a).sum(-1, keepdim=True).detach()

            mu, sigma = self.brain.get_dist(expert_s.float().to(self.device))
            dist = torch.distributions.Normal(mu, sigma)
            expert_log_prob = dist.log_prob(expert_a).sum(-1, keepdim=True).detach()

            discriminator.train_network(self.writer, n_epi, agent_s, agent_a, agent_next_s, agent_log_prob,
                                        agent_done_mask, expert_s, expert_a, expert_next_s,
                                        expert_log_prob, expert_done_mask)

        if self.args.on_policy:
            self.brain.train_network(self.writer, n_epi, states, actions, rewards, next_states,
                                     done_masks, old_log_probs)
        else:
            data = self.data.sample(shuffle=True, batch_size=batch_size)
            states, actions, rewards, next_states, done_masks = convert_to_tensor(
                self.device, data['state'], data['action'], data['reward'], data['next_state'], data['done'])
            self.brain.train_network(self.writer, n_epi, states, actions, rewards, next_states, done_masks)
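# ---------------------------------------------------------------------------
# Sketch (added for reference, not part of the original source).
# Agent.train() only reads state_rms.mean and state_rms.var to normalize expert states the same
# way the agent's on-policy observations were normalized. The repository's own tracker is not
# shown here; a minimal, assumed running mean/variance class with that interface could look like
# the sketch below (parallel-variance batch update).
class RunningMeanStd:
    def __init__(self, shape):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = 1e-4

    def update(self, batch):
        # Combine the running statistics with a new batch of observations (shape [N, *shape]).
        batch_mean, batch_var, batch_count = batch.mean(axis=0), batch.var(axis=0), batch.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + delta**2 * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total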
def train(opt, train_loader, netG, netD):
    epoch = 0
    n_epochs = opt.epochs
    decay_epoch = opt.decay_epoch
    batchSize = opt.b
    size = 128
    input_nc = opt.input_channel
    output_nc = 3
    lr = opt.lr
    if opt.stage != "Refine":
        nRow = 3
    else:
        nRow = 4

    criterion_GAN = torch.nn.MSELoss()
    criterion_identity = torch.nn.L1Loss()

    optimizer_G = torch.optim.Adam(netG.parameters(), lr=lr, betas=(0.5, 0.999))
    optimizer_D = torch.optim.Adam(netD.parameters(), lr=lr, betas=(0.5, 0.999))
    lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR(optimizer_G,
                                                       lr_lambda=LambdaLR(n_epochs, epoch, decay_epoch).step)
    lr_scheduler_D = torch.optim.lr_scheduler.LambdaLR(optimizer_D,
                                                       lr_lambda=LambdaLR(n_epochs, epoch, decay_epoch).step)

    # Inputs & targets memory allocation
    Tensor = torch.cuda.FloatTensor
    input_A = Tensor(batchSize, input_nc, size, size)
    target_real = Variable(Tensor(batchSize).fill_(1.0), requires_grad=False)
    target_fake = Variable(Tensor(batchSize).fill_(0.0), requires_grad=False)
    fake_buffer = ReplayBuffer()

    for epoch in range(0, n_epochs):
        gc.collect()
        Source = iter(train_loader)
        avg_loss_g = 0
        avg_loss_d = 0
        for i in range(0, len(train_loader)):
            netG.train()
            target_real = Variable(torch.ones(1, 1), requires_grad=False).cuda()
            target_fake = Variable(torch.zeros(1, 1), requires_grad=False).cuda()
            optimizer_G.zero_grad()

            if opt.stage != "Refine":
                src, mask, style_img, target, gt_cloth, skel, cloth = next(Source)
                src, mask, style_img, target, gt_cloth, skel, cloth = \
                    Variable(src.cuda()), Variable(mask.cuda()), Variable(style_img.cuda()), \
                    Variable(target.cuda()), Variable(gt_cloth.cuda()), Variable(skel.cuda()), \
                    Variable(cloth.cuda())
            else:
                src, mask, style_img, target, gt_cloth, wrap, diff, cloth = next(Source)
                src, mask, style_img, target, gt_cloth, wrap, diff, cloth = \
                    Variable(src.cuda()), Variable(mask.cuda()), Variable(style_img.cuda()), \
                    Variable(target.cuda()), Variable(gt_cloth.cuda()), Variable(wrap.cuda()), \
                    Variable(diff.cuda()), Variable(cloth.cuda())

            # Inverse identity
            if opt.stage == "Shape":
                gen_targ, _, _, _, _, _, _ = netG(skel, cloth)  # src, conditions
            elif opt.stage == "Stitch":
                gen_targ, _, _, _, _, _, _ = netG(src, style_img, skel)
            elif opt.stage == "Refine":
                gen_targ, _, _, _, _, _, _ = netG(diff, wrap)

            pred_fake = netD(gen_targ)
            if opt.stage == "Shape":
                loss_GAN = 10 * criterion_GAN(pred_fake, target_real) + 10 * criterion_identity(gen_targ, gt_cloth)
            elif opt.stage == "Stitch" or opt.stage == "Refine":
                loss_GAN = 10 * criterion_GAN(pred_fake, target_real) + 10 * criterion_identity(gen_targ, target)

            loss_G = loss_GAN
            loss_G.backward()
            optimizer_G.step()

            #############################################
            optimizer_D.zero_grad()
            if opt.stage == "Shape":
                pred_real = netD(gt_cloth)
            elif opt.stage == "Stitch" or opt.stage == "Refine":
                pred_real = netD(target)
            loss_D_real = criterion_GAN(pred_real, target_real)

            # Fake loss
            gen_targ = fake_buffer.push_and_pop(gen_targ)
            pred_fake = netD(gen_targ.detach())
            loss_D_fake = criterion_GAN(pred_fake, target_fake)

            # Total loss
            loss_D = (loss_D_real + loss_D_fake) * 0.5
            loss_D.backward()
            if (i + 1) % opt.critic == 0:
                optimizer_D.step()

            avg_loss_g = (avg_loss_g + loss_G) / (i + 1)
            avg_loss_d = (avg_loss_d + loss_D) / (i + 1)
            if (i + 1) % 100 == 0:
                print("Epoch: (%3d) (%5d/%5d) Loss: (%.3f) (%.3f)"
                      % (epoch, i + 1, len(train_loader), avg_loss_g * 1000, avg_loss_d * 1000))

            if (i + 1) % opt.display_count == 0:
                if opt.stage == "Shape":
                    pic = (torch.cat([style_img, gen_targ, cloth, skel, target, gt_cloth], dim=0).data + 1) / 2.0
                elif opt.stage == "Stitch":
                    pic = (torch.cat([src, gen_targ, cloth, skel, target, gt_cloth], dim=0).data + 1) / 2.0
                elif opt.stage == "Refine":
                    pic = (torch.cat([wrap, diff, gen_targ, target], dim=0).data + 1) / 2.0
                save_dir = "{}/{}".format(os.getcwd(), opt.results)
                # os.mkdir(save_dir)
                save_image(pic, '%s/Epoch_(%d)_(%dof%d).jpg' % (save_dir, epoch, i + 1, len(train_loader)),
                           nrow=nRow)

        if (epoch + 1) % opt.save_model == 0:
            save_dir = "{}/{}".format(os.getcwd(), opt.results)
            torch.save(netG.state_dict(), '{}/Gan_{}.pth'.format(save_dir, epoch))

        # Update learning rates
        lr_scheduler_G.step()
        lr_scheduler_D.step()
class PPO(nn.Module):
    def __init__(self, writer, device, state_dim, action_dim, args):
        super(PPO, self).__init__()
        self.args = args
        self.data = ReplayBuffer(action_prob_exist=True, max_size=self.args.traj_length,
                                 state_dim=state_dim, num_action=action_dim)
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim,
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)
        self.critic = Critic(self.args.layer_num, state_dim, 1, self.args.hidden_dim,
                             self.args.activation_function, self.args.last_activation)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.args.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.args.critic_lr)
        self.writer = writer
        self.device = device

    def get_action(self, x):
        mu, sigma = self.actor(x)
        return mu, sigma

    def v(self, x):
        return self.critic(x)

    def put_data(self, transition):
        self.data.put_data(transition)

    def get_gae(self, states, rewards, next_states, dones):
        # Generalized Advantage Estimation: A_t = delta_t + gamma * lambda * A_{t+1}, reset at episode ends.
        values = self.v(states).detach()
        td_target = rewards + self.args.gamma * self.v(next_states) * (1 - dones)
        delta = td_target - values
        delta = delta.detach().cpu().numpy()
        advantage_lst = []
        advantage = 0.0
        for idx in reversed(range(len(delta))):
            if dones[idx] == 1:
                advantage = 0.0
            advantage = self.args.gamma * self.args.lambda_ * advantage + delta[idx][0]
            advantage_lst.append([advantage])
        advantage_lst.reverse()
        advantages = torch.tensor(advantage_lst, dtype=torch.float).to(self.device)
        return values, advantages

    def train_net(self, n_epi):
        data = self.data.sample(shuffle=False)
        states, actions, rewards, next_states, dones, old_log_probs = convert_to_tensor(
            self.device, data['state'], data['action'], data['reward'],
            data['next_state'], data['done'], data['log_prob'])

        old_values, advantages = self.get_gae(states, rewards, next_states, dones)
        returns = advantages + old_values
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-3)

        for i in range(self.args.train_epoch):
            for state, action, old_log_prob, advantage, return_, old_value \
                    in make_mini_batch(self.args.batch_size, states, actions,
                                       old_log_probs, advantages, returns, old_values):
                curr_mu, curr_sigma = self.get_action(state)
                value = self.v(state).float()
                curr_dist = torch.distributions.Normal(curr_mu, curr_sigma)
                entropy = curr_dist.entropy() * self.args.entropy_coef
                curr_log_prob = curr_dist.log_prob(action).sum(1, keepdim=True)

                # Policy clipping
                ratio = torch.exp(curr_log_prob - old_log_prob.detach())
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1 - self.args.max_clip, 1 + self.args.max_clip) * advantage
                actor_loss = (-torch.min(surr1, surr2) - entropy).mean()

                # Value clipping (PPO2 technique)
                old_value_clipped = old_value + (value - old_value).clamp(-self.args.max_clip, self.args.max_clip)
                value_loss = (value - return_.detach().float()).pow(2)
                value_loss_clipped = (old_value_clipped - return_.detach().float()).pow(2)
                critic_loss = 0.5 * self.args.critic_coef * torch.max(value_loss, value_loss_clipped).mean()

                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                nn.utils.clip_grad_norm_(self.actor.parameters(), self.args.max_grad_norm)
                self.actor_optimizer.step()

                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                nn.utils.clip_grad_norm_(self.critic.parameters(), self.args.max_grad_norm)
                self.critic_optimizer.step()

        if self.writer is not None:
            self.writer.add_scalar("loss/actor_loss", actor_loss.item(), n_epi)
            self.writer.add_scalar("loss/critic_loss", critic_loss.item(), n_epi)
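# ---------------------------------------------------------------------------
# Illustration (added for reference, not part of the original source).
# PPO.get_gae() above walks the trajectory backwards with A_t = delta_t + gamma * lambda * A_{t+1},
# resetting the accumulator at episode boundaries (dones == 1). The standalone sketch below
# reproduces that recursion on toy numbers so the shape of the computation is easy to inspect;
# the delta and done values are made up for the example.
def _gae_reference_example(gamma=0.99, lam=0.95):
    import torch

    deltas = torch.tensor([0.5, -0.2, 1.0, 0.3])  # TD residuals delta_t = r_t + gamma*V(s_{t+1}) - V(s_t)
    dones = torch.tensor([0., 0., 1., 0.])        # the third step ends an episode

    advantage, advantages = 0.0, []
    for t in reversed(range(len(deltas))):
        if dones[t] == 1:
            advantage = 0.0                        # do not bootstrap across episode boundaries
        advantage = gamma * lam * advantage + deltas[t].item()
        advantages.append(advantage)
    advantages.reverse()
    return advantages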
def train(**kwargs):
    opt = Config()
    opt._parse(kwargs)

    transform = tf.Compose([
        tf.Resize(int(1.12 * opt.image_size), Image.BICUBIC),
        tf.RandomCrop(opt.image_size),
        tf.RandomHorizontalFlip(),
        tf.ToTensor(),
        tf.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    '''
    Image.NEAREST  : low quality
    Image.BILINEAR : bilinear interpolation
    Image.BICUBIC  : cubic spline interpolation
    Image.ANTIALIAS: high quality
    '''
    # Load the data
    train_data = ImageDataset(opt.dataroot, transforms=transform, istrain=True)
    train_loader = DataLoader(train_data, batch_size=opt.batch_size, shuffle=True, num_workers=opt.num_workers)

    # Instantiate the networks
    G_A2B = CycleGan.generator()
    G_B2A = CycleGan.generator()
    D_A = CycleGan.discriminator()
    D_B = CycleGan.discriminator()
    if t.cuda.is_available():
        G_A2B.cuda()
        G_B2A.cuda()
        D_A.cuda()
        D_B.cuda()

    # Initialize the network weights
    G_A2B.weight_init()
    G_B2A.weight_init()
    D_A.weight_init()
    D_B.weight_init()

    # Define the losses
    criterion_GAN = t.nn.MSELoss()
    criterion_Cycle = t.nn.L1Loss()
    criterion_identity = t.nn.L1Loss()

    # Define the optimizers
    optimizer_G = t.optim.Adam(itertools.chain(G_A2B.parameters(), G_B2A.parameters()),
                               lr=opt.lr, betas=(opt.betas, 0.999))
    optimizer_D = t.optim.Adam(itertools.chain(D_A.parameters(), D_B.parameters()),
                               lr=opt.lr, betas=(opt.betas, 0.999))

    # Learning-rate schedulers
    lr_schedule_G = t.optim.lr_scheduler.LambdaLR(optimizer_G,
                                                  lr_lambda=LambdaLR(opt.max_epoch, 0, opt.decay_epoch).step)
    lr_schedule_D = t.optim.lr_scheduler.LambdaLR(optimizer_D,
                                                  lr_lambda=LambdaLR(opt.max_epoch, 0, opt.decay_epoch).step)

    # Inputs, outputs and labels
    Tensor = t.cuda.FloatTensor if t.cuda.is_available() else t.Tensor
    input_A = Tensor(opt.batch_size, 3, opt.image_size, opt.image_size)
    input_B = Tensor(opt.batch_size, 3, opt.image_size, opt.image_size)
    target_real = t.ones(opt.batch_size, 1).cuda()
    target_fake = t.zeros(opt.batch_size, 1).cuda()

    fake_A_buffer = ReplayBuffer()
    fake_B_buffer = ReplayBuffer()

    # Visdom visualizer
    vis = Visualizer(env=opt.env, port=15024)

    # Average meters for logging
    lossG_A2B_meter = meter.AverageValueMeter()
    lossG_B2A_meter = meter.AverageValueMeter()
    lossG_identity_meter = meter.AverageValueMeter()
    lossG_cycle_meter = meter.AverageValueMeter()
    lossD_B_meter = meter.AverageValueMeter()
    lossD_A_meter = meter.AverageValueMeter()

    # Start training
    lam = 10
    for epoch in range(opt.max_epoch):
        lossD_A_meter.reset()
        lossD_B_meter.reset()
        lossG_cycle_meter.reset()
        lossG_identity_meter.reset()
        lossG_B2A_meter.reset()
        lossG_A2B_meter.reset()

        for i, batch in tqdm.tqdm(enumerate(train_loader)):
            real_A = input_A.copy_(batch['A']).cuda()
            real_B = input_B.copy_(batch['B']).cuda()
            # print(real_A.requires_grad)

            # Train the generators G_A2B and G_B2A
            optimizer_G.zero_grad()

            # Identity loss: G_A2B(B) = B if B is real
            same_B = G_A2B(real_B)
            loss_identity_B = criterion_identity(same_B, real_B) * 0.5 * lam
            # the same as above
            same_A = G_B2A(real_A)
            loss_identity_A = criterion_identity(same_A, real_A) * 0.5 * lam
            lossG_identity_meter.add(loss_identity_A.item() + loss_identity_B.item())

            # GAN loss
            fake_B = G_A2B(real_A)
            prob_fakeB = D_B(fake_B)
            loss_GAN_A2B = criterion_GAN(prob_fakeB, target_real)
            lossG_A2B_meter.add(loss_GAN_A2B.item())

            fake_A = G_B2A(real_B)
            prob_fakeA = D_A(fake_A)
            loss_GAN_B2A = criterion_GAN(prob_fakeA, target_real)
            lossG_B2A_meter.add(loss_GAN_B2A.item())

            # Cycle loss
            recoverA = G_B2A(fake_B)
            loss_cycle_ABA = criterion_Cycle(recoverA, real_A) * lam
            recoverB = G_A2B(fake_A)
            loss_cycle_BAB = criterion_Cycle(recoverB, real_B) * lam
            lossG_cycle_meter.add(loss_cycle_BAB.item() + loss_cycle_ABA.item())

            # Total generator loss
            loss_G = loss_identity_A + loss_identity_B + loss_GAN_A2B + loss_GAN_B2A + loss_cycle_ABA + loss_cycle_BAB
            loss_G.backward()
            optimizer_G.step()

            # Train the discriminators
            optimizer_D.zero_grad()

            # Real loss
            pred_real_B = D_B(real_B)
            loss_D_real_B = criterion_GAN(pred_real_B, target_real)
            # Fake loss, with fakes drawn from the buffer
            fake_B_new = fake_B_buffer.push_and_pop(fake_B)
            pred_fake_B = D_B(fake_B_new)
            loss_D_fake_B = criterion_GAN(pred_fake_B, target_fake)
            loss_total_B = (loss_D_real_B + loss_D_fake_B) * 0.5
            lossD_B_meter.add(loss_total_B.item())
            loss_total_B.backward()

            # Real loss
            pred_real_A = D_A(real_A)
            loss_D_real_A = criterion_GAN(pred_real_A, target_real)
            # Fake loss, with fakes drawn from the buffer
            fake_A_new = fake_A_buffer.push_and_pop(fake_A)
            pred_fake_A = D_A(fake_A_new)
            loss_D_fake_A = criterion_GAN(pred_fake_A, target_fake)
            loss_total_A = (loss_D_fake_A + loss_D_real_A) * 0.5
            lossD_A_meter.add(loss_total_A.item())
            loss_total_A.backward()

            optimizer_D.step()

            # Plot losses and sample images
            if (i + 1) % opt.plot_every == 0:
                vis.plot('lossG_A2B', lossG_A2B_meter.value()[0])
                vis.plot('lossG_B2A', lossG_B2A_meter.value()[0])
                vis.plot('lossG_identity', lossG_identity_meter.value()[0])
                vis.plot('lossG_cycle', lossG_cycle_meter.value()[0])
                vis.plot('lossD_B', lossD_B_meter.value()[0])
                vis.plot('lossD_A', lossD_A_meter.value()[0])
                vis.img('real_A', real_A.data.cpu()[0] * 0.5 + 0.5)
                vis.img('fake_B', fake_B.data.cpu()[0] * 0.5 + 0.5)
                vis.img('real_B', real_B.data.cpu()[0] * 0.5 + 0.5)
                vis.img('fake_A', fake_A.data.cpu()[0] * 0.5 + 0.5)

        # Update learning rates
        lr_schedule_G.step()
        lr_schedule_D.step()

        # Save the models
        if (epoch + 1) % opt.savemode_every == 0:
            t.save(G_A2B.state_dict(), 'checkpoints/%s_%s_G_A2B.pth' % (epoch, time.strftime('%m%d_%H:%M%S')))
            t.save(G_B2A.state_dict(), 'checkpoints/%s_%s_G_B2A.pth' % (epoch, time.strftime('%m%d_%H:%M%S')))
            t.save(D_A.state_dict(), 'checkpoints/%s_%s_D_A.pth' % (epoch, time.strftime('%m%d_%H:%M%S')))
            t.save(D_B.state_dict(), 'checkpoints/%s_%s_D_B.pth' % (epoch, time.strftime('%m%d_%H:%M%S')))