Example #1
    def __init__(self, algorithm, writer, device, state_dim, action_dim, args,
                 demonstrations_location_args):
        super(Agent, self).__init__()
        self.writer = writer
        self.device = device
        self.args = args
        if self.args.on_policy:
            self.data = ReplayBuffer(action_prob_exist=True,
                                     max_size=self.args.traj_length,
                                     state_dim=state_dim,
                                     num_action=action_dim)
        else:
            self.data = ReplayBuffer(action_prob_exist=False,
                                     max_size=int(self.args.memory_size),
                                     state_dim=state_dim,
                                     num_action=action_dim)
        file_size = 120

        with open(demonstrations_location_args.expert_state_location, 'rb') as f:
            self.expert_states = torch.tensor(
                np.concatenate([np.load(f) for _ in range(file_size)])).float()
        with open(demonstrations_location_args.expert_action_location, 'rb') as f:
            self.expert_actions = torch.tensor(
                np.concatenate([np.load(f) for _ in range(file_size)]))
        with open(demonstrations_location_args.expert_next_state_location, 'rb') as f:
            self.expert_next_states = torch.tensor(
                np.concatenate([np.load(f) for _ in range(file_size)])).float()
        with open(demonstrations_location_args.expert_done_location, 'rb') as f:
            self.expert_dones = torch.tensor(
                np.concatenate([np.load(f)
                                for _ in range(file_size)])).float().unsqueeze(-1)

        self.brain = algorithm
Example #2
    def __init__(self, writer, device, state_dim, action_dim, args, noise):
        super(DDPG, self).__init__()
        self.device = device
        self.writer = writer

        self.args = args
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.target_actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                        self.args.hidden_dim, self.args.activation_function,
                        None)

        self.target_q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                               self.args.hidden_dim,
                               self.args.activation_function, None)

        self.soft_update(self.q, self.target_q, 1.)
        self.soft_update(self.actor, self.target_actor, 1.)

        self.q_optimizer = optim.Adam(self.q.parameters(), lr=self.args.q_lr)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.args.actor_lr)
        self.data = ReplayBuffer(action_prob_exist=False,
                                 max_size=int(self.args.memory_size),
                                 state_dim=state_dim,
                                 num_action=action_dim)

        self.noise = noise
Example #3
    def __init__(self, writer, device, state_dim, action_dim, args):
        super(PPO,self).__init__()
        self.args = args
        
        self.data = ReplayBuffer(action_prob_exist = True, max_size = self.args.traj_length, state_dim = state_dim, num_action = action_dim)
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function,self.args.last_activation,self.args.trainable_std)
        self.critic = Critic(self.args.layer_num, state_dim, 1, \
                             self.args.hidden_dim, self.args.activation_function,self.args.last_activation)
        
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.args.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.args.critic_lr)

        self.writer = writer
        self.device = device
Example #4
    def __init__(self, writer, device, state_dim, action_dim, args):
        super(SAC, self).__init__()
        self.args = args
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.q_1 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                          self.args.hidden_dim, self.args.activation_function,
                          self.args.last_activation)
        self.q_2 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                          self.args.hidden_dim, self.args.activation_function,
                          self.args.last_activation)

        self.target_q_1 = Critic(self.args.layer_num, state_dim + action_dim,
                                 1, self.args.hidden_dim,
                                 self.args.activation_function,
                                 self.args.last_activation)
        self.target_q_2 = Critic(self.args.layer_num, state_dim + action_dim,
                                 1, self.args.hidden_dim,
                                 self.args.activation_function,
                                 self.args.last_activation)

        self.soft_update(self.q_1, self.target_q_1, 1.)
        self.soft_update(self.q_2, self.target_q_2, 1.)

        self.alpha = nn.Parameter(torch.tensor(self.args.alpha_init))

        self.data = ReplayBuffer(action_prob_exist=False,
                                 max_size=int(self.args.memory_size),
                                 state_dim=state_dim,
                                 num_action=action_dim)
        self.target_entropy = -torch.tensor(action_dim)

        self.q_1_optimizer = optim.Adam(self.q_1.parameters(),
                                        lr=self.args.q_lr)
        self.q_2_optimizer = optim.Adam(self.q_2.parameters(),
                                        lr=self.args.q_lr)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.args.actor_lr)
        self.alpha_optimizer = optim.Adam([self.alpha], lr=self.args.alpha_lr)

        self.device = device
        self.writer = writer
Example #5
class SAC(nn.Module):
    def __init__(self, writer, device, state_dim, action_dim, args):
        super(SAC, self).__init__()
        self.args = args
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.q_1 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                          self.args.hidden_dim, self.args.activation_function,
                          self.args.last_activation)
        self.q_2 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                          self.args.hidden_dim, self.args.activation_function,
                          self.args.last_activation)

        self.target_q_1 = Critic(self.args.layer_num, state_dim + action_dim,
                                 1, self.args.hidden_dim,
                                 self.args.activation_function,
                                 self.args.last_activation)
        self.target_q_2 = Critic(self.args.layer_num, state_dim + action_dim,
                                 1, self.args.hidden_dim,
                                 self.args.activation_function,
                                 self.args.last_activation)

        self.soft_update(self.q_1, self.target_q_1, 1.)
        self.soft_update(self.q_2, self.target_q_2, 1.)

        self.alpha = nn.Parameter(torch.tensor(self.args.alpha_init))

        self.data = ReplayBuffer(action_prob_exist=False,
                                 max_size=int(self.args.memory_size),
                                 state_dim=state_dim,
                                 num_action=action_dim)
        self.target_entropy = -torch.tensor(action_dim)

        self.q_1_optimizer = optim.Adam(self.q_1.parameters(),
                                        lr=self.args.q_lr)
        self.q_2_optimizer = optim.Adam(self.q_2.parameters(),
                                        lr=self.args.q_lr)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.args.actor_lr)
        self.alpha_optimizer = optim.Adam([self.alpha], lr=self.args.alpha_lr)

        self.device = device
        self.writer = writer

    def put_data(self, transition):
        self.data.put_data(transition)

    def soft_update(self, network, target_network, rate):
        for network_params, target_network_params in zip(
                network.parameters(), target_network.parameters()):
            target_network_params.data.copy_(target_network_params.data *
                                             (1.0 - rate) +
                                             network_params.data * rate)

    def get_action(self, state):
        mu, std = self.actor(state)
        dist = Normal(mu, std)
        u = dist.rsample()
        u_log_prob = dist.log_prob(u)
        a = torch.tanh(u)
        a_log_prob = u_log_prob - torch.log(1 - torch.square(a) + 1e-3)
        return a, a_log_prob.sum(-1, keepdim=True)

    def q_update(self, Q, q_optimizer, states, actions, rewards, next_states,
                 dones):
        ###target
        with torch.no_grad():
            next_actions, next_action_log_prob = self.get_action(next_states)
            q_1 = self.target_q_1(next_states, next_actions)
            q_2 = self.target_q_2(next_states, next_actions)
            q = torch.min(q_1, q_2)
            v = (1 - dones) * (q - self.alpha * next_action_log_prob)
            targets = rewards + self.args.gamma * v

        q = Q(states, actions)
        loss = F.smooth_l1_loss(q, targets)
        q_optimizer.zero_grad()
        loss.backward()
        q_optimizer.step()
        return loss

    def actor_update(self, states):
        now_actions, now_action_log_prob = self.get_action(states)
        q_1 = self.q_1(states, now_actions)
        q_2 = self.q_2(states, now_actions)
        q = torch.min(q_1, q_2)

        loss = (self.alpha.detach() * now_action_log_prob - q).mean()
        self.actor_optimizer.zero_grad()
        loss.backward()
        self.actor_optimizer.step()
        return loss, now_action_log_prob

    def alpha_update(self, now_action_log_prob):
        loss = (-self.alpha *
                (now_action_log_prob + self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        loss.backward()
        self.alpha_optimizer.step()
        return loss

    def train_net(self, batch_size, n_epi):
        data = self.data.sample(shuffle=True, batch_size=batch_size)
        states, actions, rewards, next_states, dones = convert_to_tensor(
            self.device, data['state'], data['action'], data['reward'],
            data['next_state'], data['done'])

        ###q update
        q_1_loss = self.q_update(self.q_1, self.q_1_optimizer, states, actions,
                                 rewards, next_states, dones)
        q_2_loss = self.q_update(self.q_2, self.q_2_optimizer, states, actions,
                                 rewards, next_states, dones)

        ### actor update
        actor_loss, prob = self.actor_update(states)

        ###alpha update
        alpha_loss = self.alpha_update(prob)

        self.soft_update(self.q_1, self.target_q_1, self.args.soft_update_rate)
        self.soft_update(self.q_2, self.target_q_2, self.args.soft_update_rate)
        if self.writer is not None:
            self.writer.add_scalar("loss/q_1", q_1_loss, n_epi)
            self.writer.add_scalar("loss/q_2", q_2_loss, n_epi)
            self.writer.add_scalar("loss/actor", actor_loss, n_epi)
            self.writer.add_scalar("loss/alpha", alpha_loss, n_epi)
Example #6
class DDPG(nn.Module):
    def __init__(self, writer, device, state_dim, action_dim, args, noise):
        super(DDPG, self).__init__()
        self.device = device
        self.writer = writer

        self.args = args
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.target_actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function, self.args.last_activation, self.args.trainable_std)

        self.q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                        self.args.hidden_dim, self.args.activation_function,
                        None)

        self.target_q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                               self.args.hidden_dim,
                               self.args.activation_function, None)

        self.soft_update(self.q, self.target_q, 1.)
        self.soft_update(self.actor, self.target_actor, 1.)

        self.q_optimizer = optim.Adam(self.q.parameters(), lr=self.args.q_lr)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.args.actor_lr)
        self.data = ReplayBuffer(action_prob_exist=False,
                                 max_size=int(self.args.memory_size),
                                 state_dim=state_dim,
                                 num_action=action_dim)

        self.noise = noise

    def soft_update(self, network, target_network, rate):
        for network_params, target_network_params in zip(
                network.parameters(), target_network.parameters()):
            target_network_params.data.copy_(target_network_params.data *
                                             (1.0 - rate) +
                                             network_params.data * rate)

    def get_action(self, x):
        mu, std = self.actor(x)
        return mu + torch.tensor(self.noise.sample()).to(self.device), std

    def put_data(self, transition):
        self.data.put_data(transition)

    def train_net(self, batch_size, n_epi):
        data = self.data.sample(shuffle=True, batch_size=batch_size)
        states, actions, rewards, next_states, dones = convert_to_tensor(
            self.device, data['state'], data['action'], data['reward'],
            data['next_state'], data['done'])

        targets = rewards + self.args.gamma * (1 - dones) * self.target_q(
            next_states,
            self.target_actor(next_states)[0])
        q_loss = F.smooth_l1_loss(self.q(states, actions), targets.detach())
        self.q_optimizer.zero_grad()
        q_loss.backward()
        self.q_optimizer.step()

        actor_loss = -self.q(states, self.actor(states)[0]).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.q, self.target_q, self.args.soft_update_rate)
        self.soft_update(self.actor, self.target_actor,
                         self.args.soft_update_rate)
        if self.writer is not None:
            self.writer.add_scalar("loss/q", q_loss, n_epi)
            self.writer.add_scalar("loss/actor", actor_loss, n_epi)
Example #7
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--epoch', type=int, default=0, help='starting epoch')
    parser.add_argument('--n_epochs',
                        type=int,
                        default=400,
                        help='number of epochs of training')
    parser.add_argument('--batchSize',
                        type=int,
                        default=10,
                        help='size of the batches')
    parser.add_argument('--dataroot',
                        type=str,
                        default='datasets/genderchange/',
                        help='root directory of the dataset')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0002,
                        help='initial learning rate')
    parser.add_argument(
        '--decay_epoch',
        type=int,
        default=100,
        help='epoch to start linearly decaying the learning rate to 0')
    parser.add_argument('--size',
                        type=int,
                        default=256,
                        help='size of the data crop (squared assumed)')
    parser.add_argument('--input_nc',
                        type=int,
                        default=3,
                        help='number of channels of input data')
    parser.add_argument('--output_nc',
                        type=int,
                        default=3,
                        help='number of channels of output data')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use GPU computation')
    parser.add_argument(
        '--n_cpu',
        type=int,
        default=8,
        help='number of cpu threads to use during batch generation')
    opt = parser.parse_args()
    print(opt)

    if torch.cuda.is_available() and not opt.cuda:
        print(
            "WARNING: You have a CUDA device, so you should probably run with --cuda"
        )

    ###### Definition of variables ######
    # Networks
    netG_A2B = Generator(opt.input_nc, opt.output_nc)
    netG_B2A = Generator(opt.output_nc, opt.input_nc)
    netD_A = Discriminator(opt.input_nc)
    netD_B = Discriminator(opt.output_nc)

    if opt.cuda:
        netG_A2B.cuda()
        netG_B2A.cuda()
        netD_A.cuda()
        netD_B.cuda()

    netG_A2B.apply(weights_init_normal)
    netG_B2A.apply(weights_init_normal)
    netD_A.apply(weights_init_normal)
    netD_B.apply(weights_init_normal)

    # Losses
    criterion_GAN = torch.nn.MSELoss()
    criterion_cycle = torch.nn.L1Loss()
    criterion_identity = torch.nn.L1Loss()

    # Optimizers & LR schedulers
    optimizer_G = torch.optim.Adam(itertools.chain(netG_A2B.parameters(),
                                                   netG_B2A.parameters()),
                                   lr=opt.lr,
                                   betas=(0.5, 0.999))
    optimizer_D_A = torch.optim.Adam(netD_A.parameters(),
                                     lr=opt.lr,
                                     betas=(0.5, 0.999))
    optimizer_D_B = torch.optim.Adam(netD_B.parameters(),
                                     lr=opt.lr,
                                     betas=(0.5, 0.999))

    lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR(
        optimizer_G,
        lr_lambda=LambdaLR(opt.n_epochs, opt.epoch, opt.decay_epoch).step)
    lr_scheduler_D_A = torch.optim.lr_scheduler.LambdaLR(
        optimizer_D_A,
        lr_lambda=LambdaLR(opt.n_epochs, opt.epoch, opt.decay_epoch).step)
    lr_scheduler_D_B = torch.optim.lr_scheduler.LambdaLR(
        optimizer_D_B,
        lr_lambda=LambdaLR(opt.n_epochs, opt.epoch, opt.decay_epoch).step)

    # Inputs & targets memory allocation
    Tensor = torch.cuda.FloatTensor if opt.cuda else torch.Tensor
    input_A = Tensor(opt.batchSize, opt.input_nc, opt.size, opt.size)
    input_B = Tensor(opt.batchSize, opt.output_nc, opt.size, opt.size)
    target_real = Variable(Tensor(opt.batchSize).fill_(1.0),
                           requires_grad=False)
    target_fake = Variable(Tensor(opt.batchSize).fill_(0.0),
                           requires_grad=False)

    fake_A_buffer = ReplayBuffer()
    fake_B_buffer = ReplayBuffer()

    # Dataset loader
    transforms_ = [
        transforms.Resize(int(opt.size * 1.2), Image.BICUBIC),
        transforms.CenterCrop(opt.size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]

    dataloader = DataLoader(ImageDataset(opt.dataroot,
                                         transforms_=transforms_,
                                         unaligned=True),
                            batch_size=opt.batchSize,
                            shuffle=True,
                            num_workers=opt.n_cpu,
                            drop_last=True)

    # Plot Loss and Images in Tensorboard
    experiment_dir = 'logs/{}@{}'.format(
        opt.dataroot.split('/')[1],
        datetime.now().strftime("%d.%m.%Y-%H:%M:%S"))
    os.makedirs(experiment_dir, exist_ok=True)
    writer = SummaryWriter(os.path.join(experiment_dir, "tb"))

    metric_dict = defaultdict(list)
    n_iters_total = 0

    ###################################
    ###### Training ######
    for epoch in range(opt.epoch, opt.n_epochs):
        for i, batch in enumerate(dataloader):

            # Set model input
            real_A = Variable(input_A.copy_(batch['A']))
            real_B = Variable(input_B.copy_(batch['B']))

            ###### Generators A2B and B2A ######
            optimizer_G.zero_grad()

            # Identity loss
            # G_A2B(B) should equal B if real B is fed
            same_B = netG_A2B(real_B)
            loss_identity_B = criterion_identity(
                same_B, real_B) * 5.0  # [batchSize, 3, ImgSize, ImgSize]

            # G_B2A(A) should equal A if real A is fed
            same_A = netG_B2A(real_A)
            loss_identity_A = criterion_identity(
                same_A, real_A) * 5.0  # [batchSize, 3, ImgSize, ImgSize]

            # GAN loss
            fake_B = netG_A2B(real_A)
            pred_fake = netD_B(fake_B).view(-1)
            loss_GAN_A2B = criterion_GAN(pred_fake, target_real)  # [batchSize]

            fake_A = netG_B2A(real_B)
            pred_fake = netD_A(fake_A).view(-1)
            loss_GAN_B2A = criterion_GAN(pred_fake, target_real)  # [batchSize]

            # Cycle loss
            recovered_A = netG_B2A(fake_B)
            loss_cycle_ABA = criterion_cycle(
                recovered_A, real_A) * 10.0  # [batchSize, 3, ImgSize, ImgSize]

            recovered_B = netG_A2B(fake_A)
            loss_cycle_BAB = criterion_cycle(
                recovered_B, real_B) * 10.0  # [batchSize, 3, ImgSize, ImgSize]

            # Total loss
            loss_G = loss_identity_A + loss_identity_B + loss_GAN_A2B + loss_GAN_B2A + loss_cycle_ABA + loss_cycle_BAB

            loss_G.backward()
            optimizer_G.step()
            ###################################

            ###### Discriminator A ######
            optimizer_D_A.zero_grad()

            # Real loss
            pred_real = netD_A(real_A).view(-1)
            loss_D_real = criterion_GAN(pred_real, target_real)  # [batchSize]

            # Fake loss
            fake_A = fake_A_buffer.push_and_pop(fake_A)
            pred_fake = netD_A(fake_A.detach()).view(-1)
            loss_D_fake = criterion_GAN(pred_fake, target_fake)  # [batchSize]

            # Total loss
            loss_D_A = (loss_D_real + loss_D_fake) * 0.5
            loss_D_A.backward()

            optimizer_D_A.step()
            ###################################

            ###### Discriminator B ######
            optimizer_D_B.zero_grad()

            # Real loss
            pred_real = netD_B(real_B).view(-1)
            loss_D_real = criterion_GAN(pred_real, target_real)  # [batchSize]

            # Fake loss
            fake_B = fake_B_buffer.push_and_pop(fake_B)
            pred_fake = netD_B(fake_B.detach()).view(-1)
            loss_D_fake = criterion_GAN(pred_fake, target_fake)  # [batchSize]

            # Total loss
            loss_D_B = (loss_D_real + loss_D_fake) * 0.5
            loss_D_B.backward()

            optimizer_D_B.step()
            ###################################

            metric_dict['loss_G'].append(loss_G.item())
            metric_dict['loss_G_identity'].append(loss_identity_A.item() +
                                                  loss_identity_B.item())
            metric_dict['loss_G_GAN'].append(loss_GAN_A2B.item() +
                                             loss_GAN_B2A.item())
            metric_dict['loss_G_cycle'].append(loss_cycle_ABA.item() +
                                               loss_cycle_BAB.item())
            metric_dict['loss_D'].append(loss_D_A.item() + loss_D_B.item())

            for title, value in metric_dict.items():
                writer.add_scalar('train/{}'.format(title), value[-1],
                                  n_iters_total)

            n_iters_total += 1

        print("""
        -----------------------------------------------------------
        Epoch : {} Finished
        Loss_G : {}
        Loss_G_identity : {}
        Loss_G_GAN : {}
        Loss_G_cycle : {}
        Loss_D : {}
        -----------------------------------------------------------
        """.format(epoch, loss_G, loss_identity_A + loss_identity_B,
                   loss_GAN_A2B + loss_GAN_B2A,
                   loss_cycle_ABA + loss_cycle_BAB, loss_D_A + loss_D_B))

        # Update learning rates
        lr_scheduler_G.step()
        lr_scheduler_D_A.step()
        lr_scheduler_D_B.step()

        # Save models checkpoints

        if loss_G.item() < 2.5:
            os.makedirs(os.path.join(experiment_dir, str(epoch)),
                        exist_ok=True)
            torch.save(netG_A2B.state_dict(),
                       '{}/{}/netG_A2B.pth'.format(experiment_dir, epoch))
            torch.save(netG_B2A.state_dict(),
                       '{}/{}/netG_B2A.pth'.format(experiment_dir, epoch))
            torch.save(netD_A.state_dict(),
                       '{}/{}/netD_A.pth'.format(experiment_dir, epoch))
            torch.save(netD_B.state_dict(),
                       '{}/{}/netD_B.pth'.format(experiment_dir, epoch))
        elif epoch > 100 and epoch % 40 == 0:
            os.makedirs(os.path.join(experiment_dir, str(epoch)),
                        exist_ok=True)
            torch.save(netG_A2B.state_dict(),
                       '{}/{}/netG_A2B.pth'.format(experiment_dir, epoch))
            torch.save(netG_B2A.state_dict(),
                       '{}/{}/netG_B2A.pth'.format(experiment_dir, epoch))
            torch.save(netD_A.state_dict(),
                       '{}/{}/netD_A.pth'.format(experiment_dir, epoch))
            torch.save(netD_B.state_dict(),
                       '{}/{}/netD_B.pth'.format(experiment_dir, epoch))

        for title, value in metric_dict.items():
            writer.add_scalar("train/{}_epoch".format(title), np.mean(value),
                              epoch)
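
The LambdaLR(opt.n_epochs, opt.epoch, opt.decay_epoch).step callable used by the schedulers above is not defined in this snippet. A sketch of a linear-decay helper with that signature, assuming the conventional CycleGAN schedule (constant learning rate, then linear decay to zero starting at decay_epoch):

class LambdaLR:
    # Sketch under the assumption that the helper keeps the learning rate
    # constant until decay_start_epoch and then decays it linearly to zero.
    def __init__(self, n_epochs, offset, decay_start_epoch):
        assert (n_epochs - decay_start_epoch) > 0, "decay must start before training ends"
        self.n_epochs = n_epochs
        self.offset = offset
        self.decay_start_epoch = decay_start_epoch

    def step(self, epoch):
        return 1.0 - max(0, epoch + self.offset - self.decay_start_epoch) / (
            self.n_epochs - self.decay_start_epoch)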
Example #8
class Agent(nn.Module):
    def __init__(self, algorithm, writer, device, state_dim, action_dim, args,
                 demonstrations_location_args):
        super(Agent, self).__init__()
        self.writer = writer
        self.device = device
        self.args = args
        if self.args.on_policy:
            self.data = ReplayBuffer(action_prob_exist=True,
                                     max_size=self.args.traj_length,
                                     state_dim=state_dim,
                                     num_action=action_dim)
        else:
            self.data = ReplayBuffer(action_prob_exist=False,
                                     max_size=int(self.args.memory_size),
                                     state_dim=state_dim,
                                     num_action=action_dim)
        file_size = 120

        with open(demonstrations_location_args.expert_state_location, 'rb') as f:
            self.expert_states = torch.tensor(
                np.concatenate([np.load(f) for _ in range(file_size)])).float()
        with open(demonstrations_location_args.expert_action_location, 'rb') as f:
            self.expert_actions = torch.tensor(
                np.concatenate([np.load(f) for _ in range(file_size)]))
        with open(demonstrations_location_args.expert_next_state_location, 'rb') as f:
            self.expert_next_states = torch.tensor(
                np.concatenate([np.load(f) for _ in range(file_size)])).float()
        with open(demonstrations_location_args.expert_done_location, 'rb') as f:
            self.expert_dones = torch.tensor(
                np.concatenate([np.load(f)
                                for _ in range(file_size)])).float().unsqueeze(-1)

        self.brain = algorithm

    def get_action(self, x):
        action, log_prob = self.brain.get_action(x)
        return action, log_prob

    def put_data(self, transition):
        self.data.put_data(transition)

    def train(self,
              discriminator,
              discriminator_batch_size,
              state_rms,
              n_epi,
              batch_size=64):
        if self.args.on_policy:
            data = self.data.sample(shuffle=False)
            states, actions, rewards, next_states, done_masks, old_log_probs = convert_to_tensor(
                self.device, data['state'], data['action'], data['reward'],
                data['next_state'], data['done'], data['log_prob'])
        else:
            data = self.data.sample(shuffle=True,
                                    batch_size=discriminator_batch_size)
            states, actions, rewards, next_states, done_masks = convert_to_tensor(
                self.device, data['state'], data['action'], data['reward'],
                data['next_state'], data['done'])
        if discriminator.name() == 'sqil':
            agent_s, agent_a, agent_next_s, agent_done_mask = make_one_mini_batch(
                batch_size, states, actions, next_states, done_masks)
            expert_s, expert_a, expert_next_s, expert_done = make_one_mini_batch(
                batch_size, self.expert_states, self.expert_actions,
                self.expert_next_states, self.expert_dones)
            expert_done_mask = (1 - expert_done.float())

            discriminator.train_network(self.brain, n_epi, agent_s, agent_a,
                                        agent_next_s, agent_done_mask,
                                        expert_s, expert_a, expert_next_s,
                                        expert_done_mask)
            return
        if discriminator.args.is_airl == False:
            agent_s, agent_a = make_one_mini_batch(discriminator_batch_size,
                                                   states, actions)
            expert_s, expert_a = make_one_mini_batch(discriminator_batch_size,
                                                     self.expert_states,
                                                     self.expert_actions)
            if self.args.on_policy:
                expert_s = np.clip(
                    (expert_s - state_rms.mean) / (state_rms.var**0.5 + 1e-8),
                    -5, 5)
            discriminator.train_network(self.writer, n_epi, agent_s, agent_a,
                                        expert_s, expert_a)
        else:
            agent_s, agent_a, agent_next_s, agent_done_mask = make_one_mini_batch(
                discriminator_batch_size, states, actions, next_states,
                done_masks)
            expert_s, expert_a, expert_next_s, expert_done = make_one_mini_batch(
                discriminator_batch_size, self.expert_states,
                self.expert_actions, self.expert_next_states,
                self.expert_dones)

            expert_done_mask = (1 - expert_done.float())
            if self.args.on_policy:
                expert_s = np.clip(
                    (expert_s - state_rms.mean) / (state_rms.var**0.5 + 1e-8),
                    -5, 5).float()
                expert_next_s = np.clip((expert_next_s - state_rms.mean) /
                                        (state_rms.var**0.5 + 1e-8), -5,
                                        5).float()

            mu, sigma = self.brain.get_dist(agent_s.float().to(self.device))
            dist = torch.distributions.Normal(mu, sigma)
            agent_log_prob = dist.log_prob(agent_a).sum(-1,
                                                        keepdim=True).detach()
            mu, sigma = self.brain.get_dist(expert_s.float().to(self.device))
            dist = torch.distributions.Normal(mu, sigma)
            expert_log_prob = dist.log_prob(expert_a).sum(
                -1, keepdim=True).detach()
            discriminator.train_network(self.writer, n_epi, agent_s, agent_a, agent_next_s,\
                                          agent_log_prob, agent_done_mask, expert_s, expert_a, expert_next_s, expert_log_prob, expert_done_mask)
        if self.args.on_policy:
            self.brain.train_network(self.writer, n_epi, states, actions,
                                     rewards, next_states, done_masks,
                                     old_log_probs)
        else:
            data = self.data.sample(shuffle=True, batch_size=batch_size)
            states, actions, rewards, next_states, done_masks = convert_to_tensor(
                self.device, data['state'], data['action'], data['reward'],
                data['next_state'], data['done'])
            self.brain.train_network(self.writer, n_epi, states, actions,
                                     rewards, next_states, done_masks)
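
make_one_mini_batch and convert_to_tensor come from the repository's utility module and are not shown here. A hypothetical sketch of what make_one_mini_batch appears to do in this example, namely drawing one random mini-batch with the same row indices applied to every tensor passed in:

import torch

def make_one_mini_batch(batch_size, *tensors):
    # Hypothetical sketch: sample one mini-batch of rows, using identical
    # random indices for every tensor (states, actions, next states, ...).
    n = tensors[0].shape[0]
    idx = torch.randperm(n)[:batch_size]
    return tuple(t[idx] for t in tensors)

# e.g. agent_s, agent_a = make_one_mini_batch(64, states, actions)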
Example #9
def train(opt, train_loader, netG, netD):
    epoch = 0
    n_epochs = opt.epochs
    decay_epoch = opt.decay_epoch
    batchSize = opt.b
    size = 128
    input_nc = opt.input_channel
    output_nc = 3
    lr = opt.lr
    if opt.stage != "Refine":
        nRow = 3
    else:
        nRow = 4

    criterion_GAN = torch.nn.MSELoss()
    criterion_identity = torch.nn.L1Loss()

    optimizer_G = torch.optim.Adam(netG.parameters(),
                                   lr=lr,
                                   betas=(0.5, 0.999))
    optimizer_D = torch.optim.Adam(netD.parameters(),
                                   lr=lr,
                                   betas=(0.5, 0.999))

    lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR(optimizer_G,
                                                       lr_lambda=LambdaLR(
                                                           n_epochs, epoch,
                                                           decay_epoch).step)
    lr_scheduler_D = torch.optim.lr_scheduler.LambdaLR(optimizer_D,
                                                       lr_lambda=LambdaLR(
                                                           n_epochs, epoch,
                                                           decay_epoch).step)

    # Inputs & targets memory allocation
    Tensor = torch.cuda.FloatTensor
    input_A = Tensor(batchSize, input_nc, size, size)
    target_real = Variable(Tensor(batchSize).fill_(1.0), requires_grad=False)
    target_fake = Variable(Tensor(batchSize).fill_(0.0), requires_grad=False)

    fake_buffer = ReplayBuffer()

    for epoch in range(0, n_epochs):
        gc.collect()
        Source = iter(train_loader)
        avg_loss_g = 0
        avg_loss_d = 0
        for i in range(0, len(train_loader)):
            netG.train()
            target_real = Variable(torch.ones(1, 1),
                                   requires_grad=False).cuda()
            target_fake = Variable(torch.zeros(1, 1),
                                   requires_grad=False).cuda()
            optimizer_G.zero_grad()

            if opt.stage != "Refine":
                src, mask, style_img, target, gt_cloth, skel, cloth = next(Source)
                src, mask, style_img, target, gt_cloth, skel, cloth = Variable(
                    src.cuda()), Variable(mask.cuda()), Variable(
                        style_img.cuda()), Variable(target.cuda()), Variable(
                            gt_cloth.cuda()), Variable(skel.cuda()), Variable(
                                cloth.cuda())
            else:
                src, mask, style_img, target, gt_cloth, wrap, diff, cloth = next(Source)
                src, mask, style_img, target, gt_cloth, wrap, diff, cloth = Variable(
                    src.cuda()), Variable(mask.cuda()), Variable(
                        style_img.cuda()), Variable(target.cuda()), Variable(
                            gt_cloth.cuda()), Variable(wrap.cuda()), Variable(
                                diff.cuda()), Variable(cloth.cuda())

            #Inverse identity
            if opt.stage == "Shape":
                gen_targ, _, _, _, _, _, _ = netG(skel,
                                                  cloth)  # src,conditions
            elif opt.stage == "Stitch":
                gen_targ, _, _, _, _, _, _ = netG(src, style_img, skel)
            elif opt.stage == "Refine":
                gen_targ, _, _, _, _, _, _ = netG(diff, wrap)

            pred_fake = netD(gen_targ)

            if opt.stage == "Shape":
                loss_GAN = 10 * criterion_GAN(
                    pred_fake, target_real) + 10 * criterion_identity(
                        gen_targ, gt_cloth)
            elif opt.stage == "Stitch" or opt.stage == "Refine":
                loss_GAN = 10 * criterion_GAN(
                    pred_fake, target_real) + 10 * criterion_identity(
                        gen_targ, target)

            loss_G = loss_GAN
            loss_G.backward()

            optimizer_G.step()
            #############################################

            optimizer_D.zero_grad()

            if opt.stage == "Shape":
                pred_real = netD(gt_cloth)
            elif opt.stage == "Stitch" or opt.stage == "Refine":
                pred_real = netD(target)

            loss_D_real = criterion_GAN(pred_real, target_real)

            # Fake loss
            gen_targ = fake_buffer.push_and_pop(gen_targ)
            pred_fake = netD(gen_targ.detach())
            loss_D_fake = criterion_GAN(pred_fake, target_fake)

            # Total loss
            loss_D = (loss_D_real + loss_D_fake) * 0.5
            loss_D.backward()
            if (i + 1) % opt.critic == 0:
                optimizer_D.step()

            avg_loss_g = (avg_loss_g + loss_G.item()) / (i + 1)
            avg_loss_d = (avg_loss_d + loss_D.item()) / (i + 1)

            if (i + 1) % 100 == 0:
                print("Epoch: (%3d) (%5d/%5d) Loss: (%0.0003f) (%0.0003f)" %
                      (epoch, i + 1, len(train_loader), avg_loss_g * 1000,
                       avg_loss_d * 1000))

            if (i + 1) % opt.display_count == 0:
                if opt.stage == "Shape":
                    pic = (torch.cat(
                        [style_img, gen_targ, cloth, skel, target, gt_cloth],
                        dim=0).data + 1) / 2.0
                elif opt.stage == "Stitch":
                    pic = (torch.cat(
                        [src, gen_targ, cloth, skel, target, gt_cloth],
                        dim=0).data + 1) / 2.0
                elif opt.stage == "Refine":
                    pic = (torch.cat([wrap, diff, gen_targ, target],
                                     dim=0).data + 1) / 2.0

                save_dir = "{}/{}".format(os.getcwd(), opt.results)
                #             os.mkdir(save_dir)
                save_image(pic,
                           '%s/Epoch_(%d)_(%dof%d).jpg' %
                           (save_dir, epoch, i + 1, len(train_loader)),
                           nrow=nRow)
        if (epoch + 1) % opt.save_model == 0:
            save_dir = "{}/{}".format(os.getcwd(), opt.results)
            torch.save(netG.state_dict(),
                       '{}/Gan_{}.pth'.format(save_dir, epoch))
        # Update learning rates
        lr_scheduler_G.step()
        lr_scheduler_D.step()
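
The ReplayBuffer used in the GAN examples (here and in Example #7) is an image history buffer in the CycleGAN style: push_and_pop stores generated images and sometimes returns an older one so the discriminator also sees past fakes. A sketch of that behaviour, assuming the usual 50-image capacity (class and attribute names below are illustrative, not from the source):

import random
import torch

class ImageBuffer:
    # Sketch of a push_and_pop-style history buffer for generated images.
    def __init__(self, max_size=50):
        self.max_size = max_size
        self.data = []

    def push_and_pop(self, batch):
        out = []
        for img in batch.data:
            img = img.unsqueeze(0)
            if len(self.data) < self.max_size:
                self.data.append(img)
                out.append(img)
            elif random.random() > 0.5:
                # return an older fake and replace it with the new one
                i = random.randint(0, self.max_size - 1)
                out.append(self.data[i].clone())
                self.data[i] = img
            else:
                out.append(img)
        return torch.cat(out)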
Example #10
class PPO(nn.Module):
    def __init__(self, writer, device, state_dim, action_dim, args):
        super(PPO,self).__init__()
        self.args = args
        
        self.data = ReplayBuffer(action_prob_exist = True, max_size = self.args.traj_length, state_dim = state_dim, num_action = action_dim)
        self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \
                           self.args.activation_function,self.args.last_activation,self.args.trainable_std)
        self.critic = Critic(self.args.layer_num, state_dim, 1, \
                             self.args.hidden_dim, self.args.activation_function,self.args.last_activation)
        
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.args.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.args.critic_lr)

        self.writer = writer
        self.device = device
        
    def get_action(self,x):
        mu,sigma = self.actor(x)
        return mu,sigma
    
    def v(self,x):
        return self.critic(x)
    
    def put_data(self,transition):
        self.data.put_data(transition)
        
    def get_gae(self, states, rewards, next_states, dones):
        values = self.v(states).detach()
        td_target = rewards + self.args.gamma * self.v(next_states) * (1 - dones)
        delta = td_target - values
        delta = delta.detach().cpu().numpy()
        advantage_lst = []
        advantage = 0.0
        for idx in reversed(range(len(delta))):
            if dones[idx] == 1:
                advantage = 0.0
            advantage = self.args.gamma * self.args.lambda_ * advantage + delta[idx][0]
            advantage_lst.append([advantage])
        advantage_lst.reverse()
        advantages = torch.tensor(advantage_lst, dtype=torch.float).to(self.device)
        return values, advantages
    
    def train_net(self,n_epi):
        data = self.data.sample(shuffle = False)
        states, actions, rewards, next_states, dones, old_log_probs = convert_to_tensor(self.device, data['state'], data['action'], data['reward'], data['next_state'], data['done'], data['log_prob'])
        
        old_values, advantages = self.get_gae(states, rewards, next_states, dones)
        returns = advantages + old_values
        advantages = (advantages - advantages.mean())/(advantages.std()+1e-3)
        
        for i in range(self.args.train_epoch):
            for state,action,old_log_prob,advantage,return_,old_value \
            in make_mini_batch(self.args.batch_size, states, actions, \
                                           old_log_probs,advantages,returns,old_values): 
                curr_mu,curr_sigma = self.get_action(state)
                value = self.v(state).float()
                curr_dist = torch.distributions.Normal(curr_mu,curr_sigma)
                entropy = curr_dist.entropy() * self.args.entropy_coef
                curr_log_prob = curr_dist.log_prob(action).sum(1,keepdim = True)

                #policy clipping
                ratio = torch.exp(curr_log_prob - old_log_prob.detach())
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1-self.args.max_clip, 1+self.args.max_clip) * advantage
                actor_loss = (-torch.min(surr1, surr2) - entropy).mean() 
                
                #value clipping (PPO2 technic)
                old_value_clipped = old_value + (value - old_value).clamp(-self.args.max_clip,self.args.max_clip)
                value_loss = (value - return_.detach().float()).pow(2)
                value_loss_clipped = (old_value_clipped - return_.detach().float()).pow(2)
                critic_loss = 0.5 * self.args.critic_coef * torch.max(value_loss,value_loss_clipped).mean()
                
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                nn.utils.clip_grad_norm_(self.actor.parameters(), self.args.max_grad_norm)
                self.actor_optimizer.step()
                
                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                nn.utils.clip_grad_norm_(self.critic.parameters(), self.args.max_grad_norm)
                self.critic_optimizer.step()
                
                if self.writer is not None:
                    self.writer.add_scalar("loss/actor_loss", actor_loss.item(), n_epi)
                    self.writer.add_scalar("loss/critic_loss", critic_loss.item(), n_epi)
Example #11
def train(**kwargs):
    opt = Config()
    opt._parse(kwargs)

    transform = tf.Compose([
        tf.Resize(int(1.12 * opt.image_size), Image.BICUBIC),
        tf.RandomCrop(opt.image_size),
        tf.RandomHorizontalFlip(),
        tf.ToTensor(),
        tf.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    '''
    Image.NEAREST  : low quality
    Image.BILINEAR : bilinear
    Image.BICUBIC  : cubic spline interpolation
    Image.ANTIALIAS: high quality
    '''
    # Load the dataset
    train_data = ImageDataset(opt.dataroot, transforms=transform, istrain=True)
    train_loader = DataLoader(train_data,
                              batch_size=opt.batch_size,
                              shuffle=True,
                              num_workers=opt.num_workers)
    # Instantiate the networks
    G_A2B = CycleGan.generator()
    G_B2A = CycleGan.generator()

    D_A = CycleGan.discriminator()
    D_B = CycleGan.discriminator()

    if t.cuda.is_available():
        G_A2B.cuda()
        G_B2A.cuda()
        D_A.cuda()
        D_B.cuda()

    # Initialize the network weights
    G_A2B.weight_init()
    G_B2A.weight_init()
    D_A.weight_init()
    D_B.weight_init()

    # Define the losses
    criterion_GAN = t.nn.MSELoss()
    criterion_Cycle = t.nn.L1Loss()
    criterion_identity = t.nn.L1Loss()

    # Define the optimizers
    optimizer_G = t.optim.Adam(itertools.chain(G_A2B.parameters(),
                                               G_B2A.parameters()),
                               lr=opt.lr,
                               betas=(opt.betas, 0.999))
    optimizer_D = t.optim.Adam(itertools.chain(D_A.parameters(),
                                               D_B.parameters()),
                               lr=opt.lr,
                               betas=(opt.betas, 0.999))

    # Define the learning-rate schedulers
    lr_schedule_G = t.optim.lr_scheduler.LambdaLR(optimizer_G,
                                                  lr_lambda=LambdaLR(
                                                      opt.max_epoch, 0,
                                                      opt.decay_epoch).step)
    lr_schedule_D = t.optim.lr_scheduler.LambdaLR(optimizer_D,
                                                  lr_lambda=LambdaLR(
                                                      opt.max_epoch, 0,
                                                      opt.decay_epoch).step)

    # Inputs, outputs and labels
    Tensor = t.cuda.FloatTensor if t.cuda.is_available() else t.Tensor
    input_A = Tensor(opt.batch_size, 3, opt.image_size, opt.image_size)
    input_B = Tensor(opt.batch_size, 3, opt.image_size, opt.image_size)
    target_real = t.ones(opt.batch_size, 1).cuda()
    target_fake = t.zeros(opt.batch_size, 1).cuda()

    fake_A_buffer = ReplayBuffer()
    fake_B_buffer = ReplayBuffer()

    # Set up Visdom visualization
    vis = Visualizer(env=opt.env, port=15024)

    # Set up the average meters
    lossG_A2B_meter = meter.AverageValueMeter()
    lossG_B2A_meter = meter.AverageValueMeter()
    lossG_identity_meter = meter.AverageValueMeter()
    lossG_cycle_meter = meter.AverageValueMeter()
    lossD_B_meter = meter.AverageValueMeter()
    lossD_A_meter = meter.AverageValueMeter()

    # Start training
    lam = 10
    for epoch in range(opt.max_epoch):
        lossD_A_meter.reset()
        lossD_B_meter.reset()
        lossG_cycle_meter.reset()
        lossG_identity_meter.reset()
        lossG_B2A_meter.reset()
        lossG_A2B_meter.reset()
        for i, batch in tqdm.tqdm(enumerate(train_loader)):

            real_A = input_A.copy_(batch['A']).cuda()
            real_B = input_B.copy_(batch['B']).cuda()
            # print(real_A.requires_grad)
            # Train the generators
            # Generators A2B and B2A
            optimizer_G.zero_grad()

            # identity loss
            # G_A2B(B)=B if B is real
            same_B = G_A2B(real_B)
            loss_identity_B = criterion_identity(same_B, real_B) * 0.5 * lam
            # the same as above
            same_A = G_B2A(real_A)
            loss_identity_A = criterion_identity(same_A, real_A) * 0.5 * lam
            lossG_identity_meter.add(loss_identity_A.item() +
                                     loss_identity_B.item())

            # GAN loss
            fake_B = G_A2B(real_A)
            prob_fakeB = D_B(fake_B)
            loss_GAN_A2B = criterion_GAN(prob_fakeB, target_real)
            lossG_A2B_meter.add(loss_GAN_A2B.item())

            fake_A = G_B2A(real_B)
            prob_fakeA = D_A(fake_A)
            loss_GAN_B2A = criterion_GAN(prob_fakeA, target_real)
            lossG_B2A_meter.add(loss_GAN_B2A.item())
            # Cycle loss
            recoverA = G_B2A(fake_B)
            loss_cycle_ABA = criterion_Cycle(recoverA, real_A) * lam

            recoverB = G_A2B(fake_A)
            loss_cycle_BAB = criterion_Cycle(recoverB, real_B) * lam
            lossG_cycle_meter.add(loss_cycle_BAB.item() +
                                  loss_cycle_ABA.item())
            # total loss
            loss_G = loss_identity_A + loss_identity_B + loss_GAN_A2B + loss_GAN_B2A + loss_cycle_ABA + loss_cycle_BAB
            loss_G.backward()
            optimizer_G.step()

            # Train the discriminators
            optimizer_D.zero_grad()

            # real loss
            pred_real_B = D_B(real_B)
            loss_D_real_B = criterion_GAN(pred_real_B, target_real)

            # fake loss, using a fake sampled from the buffer
            fake_B_new = fake_B_buffer.push_and_pop(fake_B)
            pred_fake_B = D_B(fake_B_new)
            loss_D_fake_B = criterion_GAN(pred_fake_B, target_fake)
            loss_total_B = (loss_D_real_B + loss_D_fake_B) * 0.5
            lossD_B_meter.add(loss_total_B.item())
            loss_total_B.backward()

            # real loss
            pred_real_A = D_A(real_A)
            loss_D_real_A = criterion_GAN(pred_real_A, target_real)

            # fake loss, using a fake sampled from the buffer
            fake_A_new = fake_A_buffer.push_and_pop(fake_A)
            pred_fake_A = D_A(fake_A_new)
            loss_D_fake_A = criterion_GAN(pred_fake_A, target_fake)
            loss_total_A = (loss_D_fake_A + loss_D_real_A) * 0.5
            lossD_A_meter.add(loss_total_A.item())
            loss_total_A.backward()

            optimizer_D.step()
            ### Log and visualize
            if (i + 1) % opt.plot_every == 0:
                vis.plot('lossG_A2B', lossG_A2B_meter.value()[0])
                vis.plot('lossG_B2A', lossG_B2A_meter.value()[0])
                vis.plot('lossG_identity', lossG_identity_meter.value()[0])
                vis.plot('lossG_cycle', lossG_cycle_meter.value()[0])
                vis.plot('lossD_B', lossD_B_meter.value()[0])
                vis.plot('lossD_A', lossD_A_meter.value()[0])
                vis.img('real_A', real_A.data.cpu()[0] * 0.5 + 0.5)
                vis.img('fake_B', fake_B.data.cpu()[0] * 0.5 + 0.5)
                vis.img('real_B', real_B.data.cpu()[0] * 0.5 + 0.5)
                vis.img('fake_A', fake_A.data.cpu()[0] * 0.5 + 0.5)
        # Update learning rates
        lr_schedule_G.step()
        lr_schedule_D.step()

        # Save the models
        if (epoch + 1) % opt.savemode_every == 0:
            t.save(
                G_A2B.state_dict(), 'checkpoints/%s_%s_G_A2B.pth' %
                (epoch, time.strftime('%m%d_%H:%M%S')))
            t.save(
                G_B2A.state_dict(), 'checkpoints/%s_%s_G_B2A.pth' %
                (epoch, time.strftime('%m%d_%H:%M%S')))
            t.save(
                D_A.state_dict(), 'checkpoints/%s_%s_D_A.pth' %
                (epoch, time.strftime('%m%d_%H:%M%S')))
            t.save(
                D_B.state_dict(), 'checkpoints/%s_%s_D_B.pth' %
                (epoch, time.strftime('%m%d_%H:%M%S')))