def generate_inverted_image_specific_layer(self, input_image, img_size, target_layer=3):
        """Optimise a random image so its layer output matches that of ``input_image``.

        A randomly initialised image is updated with SGD for 201 iterations to
        minimise the sum of a scaled euclidean loss between the two layer
        outputs, an alpha-norm term and a total-variation term.  Every 5
        iterations the loss is printed and the current image is written to
        ``../generated/``.

        Args:
            input_image: Real image passed to ``get_output_from_specific_layer``
                to obtain the target layer output.
            img_size: Height and width of the square image that is optimised.
            target_layer: Layer index forwarded to
                ``get_output_from_specific_layer``; also used in the output
                file names.

        Returns:
            None.  The intermediate images are written to disk.
        """
        # Generate a random image which we will optimize
        opt_img = Variable(1e-1 * torch.randn(1, 3, img_size, img_size), requires_grad=True)
        # Define optimizer for previously created image
        optimizer = SGD([opt_img], lr=1e4, momentum=0.9)
        # Layer output for the real image (NOT the randomly generated one);
        # this is the fixed target of the optimisation
        input_image_layer_output = \
            self.get_output_from_specific_layer(input_image, target_layer)

        # Alpha regularization parameters: exponent and multiplier (lambda alpha)
        alpha_reg_alpha = 6
        alpha_reg_lambda = 1e-7

        # Total variation regularization parameters: exponent and multiplier
        # (lambda beta)
        tv_reg_beta = 2
        tv_reg_lambda = 1e-8

        for i in range(201):
            optimizer.zero_grad()
            # Layer output for the generated image (NOT the real image)
            output = self.get_output_from_specific_layer(opt_img, target_layer)
            # Calculate euclidian loss; the target is detached so no gradient
            # flows into the real image's forward pass
            euc_loss = 1e-1 * self.euclidian_loss(input_image_layer_output.detach(), output)
            # Calculate alpha regularization
            reg_alpha = alpha_reg_lambda * self.alpha_norm(opt_img, alpha_reg_alpha)
            # Calculate total variation regularization
            reg_total_variation = tv_reg_lambda * self.total_variation_norm(opt_img,
                                                                            tv_reg_beta)
            # Sum all to optimize
            loss = euc_loss + reg_alpha + reg_total_variation
            # Step
            loss.backward()
            optimizer.step()
            # Generate image every 5 iterations
            if i % 5 == 0:
                # .item() works for 0-d and 1-element tensors alike, whereas
                # indexing ``.numpy()[0]`` fails on a 0-d loss.
                print('Iteration:', str(i), 'Loss:', loss.item())
                x = recreate_image(opt_img)
                cv2.imwrite('../generated/Inv_Image_Layer_' + str(target_layer) +
                            '_Iteration_' + str(i) + '.jpg', x)
            # Reduce learning rate every 40 iterations (i == 0 included, so the
            # first reduction happens right after the first step)
            if i % 40 == 0:
                for param_group in optimizer.param_groups:
                    param_group['lr'] *= 1/10
Beispiel #2
0
def test_mask_same_after_update(generate_batch):
    """Check that one SGD step changes the transitions but not the constraint mask."""
    from torch.optim import SGD

    unary, tags, lengths = generate_batch
    n_tags = unary.size(2)
    mask = torch.rand(n_tags, n_tags) < 0.5
    crf = CRF(n_tags, constraint=mask)
    opt = SGD(crf.parameters(), lr=10)

    # State before the update; the transitions are cloned so the optimizer
    # step cannot modify the saved copy.
    mask_before = crf.constraint.numpy()
    transitions_before = crf.transitions_p.detach().clone().numpy()

    loss = torch.mean(crf.neg_log_loss(unary, tags, lengths))
    loss.backward()
    opt.step()

    # State after the update.
    mask_after = crf.constraint.numpy()
    transitions_after = crf.transitions_p.detach().numpy()

    np.testing.assert_allclose(mask_before, mask_after)
    # The transitions are expected to differ, so the closeness check must fail.
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(transitions_before, transitions_after)
 def generate(self):
     """Iteratively update ``self.created_image`` to raise the target class score.

     For iterations 1..149 the current image is preprocessed, passed through
     ``self.model``, and updated with one SGD step on the negated output for
     ``self.target_class``.  After every step the image is recreated and
     written to ``../generated/``.

     Returns:
         The preprocessed image produced in the last iteration.
     """
     initial_learning_rate = 6
     for i in range(1, 150):
         # Process image and return variable
         self.processed_image = preprocess_image(self.created_image)
         # A new optimizer is needed every iteration because the processed
         # image is a new object each time
         optimizer = SGD([self.processed_image], lr=initial_learning_rate)
         # Forward
         output = self.model(self.processed_image)
         # Target specific class (negated so that minimising raises the score)
         class_loss = -output[0, self.target_class]
         # .item() works for a 0-d tensor, whereas ``.numpy()[0]`` does not
         print('Iteration:', str(i), 'Loss', "{0:.2f}".format(class_loss.item()))
         # Zero grads
         self.model.zero_grad()
         # Backward
         class_loss.backward()
         # Update image
         optimizer.step()
         # Recreate image
         self.created_image = recreate_image(self.processed_image)
         # Save image
         cv2.imwrite('../generated/c_specific_iteration_'+str(i)+'.jpg', self.created_image)
     return self.processed_image
Beispiel #4
0
def train_siamese_distrib_margine(directory,
                                  version,
                                  model,
                                  train_loader,
                                  valid_loader,
                                  resize,
                                  batch_size,
                                  margine,
                                  exp_name='model_1',
                                  decay=None,
                                  lr=0.0001,
                                  epochs=10,
                                  momentum=0.99,
                                  logdir='logs',
                                  modeLoss=None,
                                  dizionario_array=None):
    """Train a siamese embedding network with a contrastive loss.

    Each epoch runs one pass over ``train_loader`` (with gradient updates) and
    one over ``valid_loader`` (without).  A pair is predicted as label 1 when
    the pairwise distance of its two embeddings exceeds ``margine``.  Loss and
    accuracy are logged to TensorBoard, plotted and saved under ``directory``,
    and the model is saved after every epoch.

    Args:
        directory: Output directory for plots and per-epoch model files.
        version: Tag used in output file names and as a sub-directory name.
        model: Embedding network applied to both images of a pair.
        train_loader: Loader yielding 5-element batches; the first three
            elements are used as image 1, image 2 and the pair label.
        valid_loader: Same as ``train_loader``, used for validation.
        resize: Forwarded unchanged to ``saveinFileJson``.
        batch_size: Forwarded unchanged to ``saveinFileJson``.
        margine: Margin of the contrastive loss and decision threshold on the
            embedding distance.
        exp_name: Base name for the TensorBoard run and saved model files.
        decay: Optional SGD weight decay.
        lr: SGD learning rate.
        epochs: Number of epochs to run.
        momentum: SGD momentum.
        logdir: Root directory for TensorBoard logs.
        modeLoss: Loss selector; only ``"single"`` is supported.
        dizionario_array: Optional dict from a previous run.  It supplies the
            optimizer state (``"optimizer"``) and the metric histories
            (``"a_train"``, ``"a_valid"``, ``"l_train"``, ``"l_valid"``,
            ``"g_train"``, ``"g_valid"``) to resume from.

    Returns:
        Tuple ``(model, elapsed, last_loss_train, last_loss_val,
        last_acc_train, last_acc_val)`` where ``elapsed`` is the value of
        ``tempo.stop()`` formatted with 7 decimals.

    Raises:
        ValueError: If ``modeLoss`` is not ``"single"``.
    """
    print("momonetum", momentum)
    print("lr", lr)
    print(margine)

    # Fail early with a clear message instead of hitting an unbound
    # ``criterion`` later on.
    if modeLoss == "single":
        criterion = ContrastiveLoss(margine)
    else:
        raise ValueError(
            "unsupported modeLoss %r: only 'single' is implemented" % (modeLoss,))

    # Move the model to its device BEFORE building the optimizer and loading
    # its state, so that restored optimizer state is placed on the same device
    # as the parameters.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    criterion.to(device)

    if decay is not None:
        print("Weight_Decay", decay)
        optimizer = SGD(model.parameters(),
                        lr,
                        momentum=momentum,
                        weight_decay=decay)
    else:
        optimizer = SGD(model.parameters(), lr, momentum=momentum)

    if dizionario_array is not None:
        optimizer.load_state_dict(dizionario_array["optimizer"])

    # Meters (reset at the start of every train/valid pass)
    loss_meter = AverageValueMeter()
    acc_meter = AverageValueMeter()
    # TensorBoard writer
    writer = SummaryWriter(join(logdir, exp_name))
    # Dictionary holding the training and validation loaders
    loader = {'train': train_loader, 'valid': valid_loader}

    if dizionario_array is not None:
        # Resume the metric histories and counters from the previous run
        array_accuracy_train = dizionario_array["a_train"]
        array_accuracy_valid = dizionario_array["a_valid"]
        array_loss_train = dizionario_array["l_train"]
        array_loss_valid = dizionario_array["l_valid"]
        array_glb_train = dizionario_array["g_train"]
        array_glb_valid = dizionario_array["g_valid"]
        global_step = array_glb_valid[-1]
        last_loss_train = array_loss_train[-1]
        last_loss_val = array_loss_valid[-1]
        last_acc_train = array_accuracy_train[-1]
        last_acc_val = array_accuracy_valid[-1]
    else:
        array_accuracy_train = []
        array_accuracy_valid = []
        array_loss_train = []
        array_loss_valid = []
        array_glb_train = []
        array_glb_valid = []
        global_step = 0
        last_loss_train = 0
        last_loss_val = 0
        last_acc_train = 0
        last_acc_val = 0

    tempo = Timer()
    start = timer()

    for e in range(epochs):
        print("Epoca= ", e)
        # Alternate between the two modes: train and valid
        for mode in ['train', 'valid']:
            loss_meter.reset()
            acc_meter.reset()
            if mode == 'train':
                model.train()
            else:
                model.eval()
            # Enable gradients only while training
            with torch.set_grad_enabled(mode == 'train'):
                for i, batch in enumerate(loader[mode]):
                    print("Num batch =", i)
                    # img1, img2, label12, label1, label2
                    I_i, I_j, l_ij, _, _ = [b.to(device) for b in batch]
                    # Siamese forward pass: run the embedding net on both inputs
                    phi_i = model(I_i)  # img 1
                    phi_j = model(I_j)  # img 2

                    print("Output train img1", phi_i.size())
                    print("Output train img2", phi_j.size())
                    euclidean_distance = F.pairwise_distance(phi_i, phi_j)

                    euclid_tmp = torch.Tensor.numpy(
                        euclidean_distance.detach().cpu())  # distances
                    labs = l_ij.to('cpu').numpy()  # ground-truth labels
                    print(euclid_tmp)
                    # Predict label 1 when the distance exceeds the margin
                    etichette_predette = [euclid_tmp > margine]

                    print(etichette_predette)
                    etichette_predette = np.int8(etichette_predette)
                    etichette_predette = np.reshape(etichette_predette, -1)
                    print(etichette_predette)

                    # Compute the loss
                    l = criterion(phi_i, phi_j, l_ij)

                    # global_step counts the samples processed so far
                    # (incremented in both train and valid passes)
                    n = I_i.shape[0]  # number of elements in the batch
                    global_step += n

                    if mode == 'train':
                        l.backward()
                        optimizer.step()
                        optimizer.zero_grad()

                    acc = accuracy_score(np.array(labs),
                                         np.array(etichette_predette))
                    loss_meter.add(l.item(), n)
                    acc_meter.add(acc, n)
                    # Log per-iteration results only during training
                    if mode == 'train':
                        writer.add_scalar('loss/train',
                                          loss_meter.value(),
                                          global_step=global_step)
                        writer.add_scalar('accuracy/train',
                                          acc_meter.value(),
                                          global_step=global_step)

            # End of the pass (train or valid): record the final estimates
            if mode == 'train':
                global_step_train = global_step
                last_loss_train = loss_meter.value()
                last_acc_train = acc_meter.value()

                array_accuracy_train.append(acc_meter.value())
                array_loss_train.append(loss_meter.value())
                array_glb_train.append(global_step)
            else:
                global_step_val = global_step
                last_loss_val = loss_meter.value()
                last_acc_val = acc_meter.value()

                array_accuracy_valid.append(acc_meter.value())
                array_loss_valid.append(loss_meter.value())
                array_glb_valid.append(global_step)

            writer.add_scalar('loss/' + mode,
                              loss_meter.value(),
                              global_step=global_step)
            writer.add_scalar('accuracy/' + mode,
                              acc_meter.value(),
                              global_step=global_step)

        print("Loss TRAIN", array_loss_train)
        print("Losss VALID", array_loss_valid)
        print("Accuracy TRAIN", array_accuracy_train)
        print("Accuracy VALID", array_accuracy_valid)
        print("dim acc train", len(array_accuracy_train))
        print("dim acc valid", len(array_accuracy_valid))

        # Accuracy curves
        plt.figure(figsize=(12, 8))
        plt.plot(array_glb_train, array_accuracy_train)
        plt.plot(array_glb_valid, array_accuracy_valid)
        plt.xlabel('samples')
        plt.ylabel('accuracy')
        plt.grid()
        plt.legend(['Training', 'Valid'])
        plt.savefig(directory + '//plotAccuracy_' + version + '.png')
        plt.show()

        # Loss curves
        plt.figure(figsize=(12, 8))
        plt.plot(array_glb_train, array_loss_train)
        plt.plot(array_glb_valid, array_loss_valid)
        plt.xlabel('samples')
        plt.ylabel('loss')
        plt.grid()
        plt.legend(['Training', 'Valid'])
        plt.savefig(directory + '//plotLoss_' + version + '.png')
        plt.show()

        saveArray(directory, version, array_loss_train, array_loss_valid,
                  array_accuracy_train, array_accuracy_valid, array_glb_train,
                  array_glb_valid)

        saveinFileJson(start, directory, version, resize, batch_size, e, lr,
                       momentum, len(train_loader), array_accuracy_train[-1],
                       array_accuracy_valid[-1], array_loss_train[-1],
                       array_loss_valid[-1])

        # Save the model at the end of each train + valid cycle
        net_save(epochs, model, optimizer, last_loss_train, last_loss_val,
                 last_acc_train, last_acc_val, global_step_train,
                 global_step_val, '%s.pth' % (exp_name + "_dict"))
        torch.save(model, '%s.pth' % exp_name)
        torch.save(
            model, directory + "//" + version + "//" + '%s.pth' %
            (exp_name + "_" + str(e)))
    f = '{:.7f}'.format(tempo.stop())
    return model, f, last_loss_train, last_loss_val, last_acc_train, last_acc_val
Beispiel #5
0
def train(train_source_iter: ForeverDataIterator,
          train_target_iter: ForeverDataIterator, G: nn.Module,
          F1: ImageClassifierHead, F2: ImageClassifierHead, optimizer_g: SGD,
          optimizer_f: SGD, epoch: int, args: argparse.Namespace):
    """Run one epoch of three-step adversarial training with two classifier heads.

    For each of ``args.iters_per_epoch`` iterations a source and a target batch
    are concatenated and used for:

    * Step A: update ``G``, ``F1`` and ``F2`` on the source cross-entropy plus
      a small entropy term on the target predictions.
    * Step B: update only the classifier heads on the same loss minus the
      classifier discrepancy on the target predictions.
    * Step C: update only ``G`` ``args.num_k`` times on the discrepancy.

    Args:
        train_source_iter: Iterator yielding ``(images, labels)`` source batches.
        train_target_iter: Iterator yielding ``(images, labels)`` target batches.
        G: Feature extractor.
        F1: First classifier head.
        F2: Second classifier head.
        optimizer_g: Optimizer stepped for ``G``.
        optimizer_f: Optimizer stepped for the classifier heads.
        epoch: Epoch number, used only in the progress prefix.
        args: Namespace providing ``iters_per_epoch``, ``trade_off``, ``num_k``
            and ``print_freq``.
    """
    batch_time = AverageMeter('Time', ':3.1f')
    data_time = AverageMeter('Data', ':3.1f')
    losses = AverageMeter('Loss', ':3.2f')
    trans_losses = AverageMeter('Trans Loss', ':3.2f')
    cls_accs = AverageMeter('Cls Acc', ':3.1f')
    tgt_accs = AverageMeter('Tgt Acc', ':3.1f')

    progress = ProgressMeter(
        args.iters_per_epoch,
        [batch_time, data_time, losses, trans_losses, cls_accs, tgt_accs],
        prefix="Epoch: [{}]".format(epoch))

    def _predict(inputs):
        """Return source logits of both heads and target softmax outputs of both heads."""
        features = G(inputs)
        logits_1 = F1(features)
        logits_2 = F2(features)
        src_1, tgt_1 = logits_1.chunk(2, dim=0)
        src_2, tgt_2 = logits_2.chunk(2, dim=0)
        return src_1, src_2, F.softmax(tgt_1, dim=1), F.softmax(tgt_2, dim=1)

    # switch to train mode
    for module in (G, F1, F2):
        module.train()

    end = time.time()
    for i in range(args.iters_per_epoch):
        x_s, labels_s = next(train_source_iter)
        x_t, labels_t = next(train_target_iter)

        x_s = x_s.to(device)
        x_t = x_t.to(device)
        labels_s = labels_s.to(device)
        labels_t = labels_t.to(device)
        # Source samples form the first half of the batch, target the second
        x = torch.cat((x_s, x_t), dim=0)
        assert x.requires_grad is False

        # measure data loading time
        data_time.update(time.time() - end)

        # Step A: train all networks to minimize loss on source domain
        optimizer_g.zero_grad()
        optimizer_f.zero_grad()

        y1_s, y2_s, y1_t, y2_t = _predict(x)
        loss = F.cross_entropy(y1_s, labels_s) + F.cross_entropy(y2_s, labels_s) + \
               0.01 * (entropy(y1_t) + entropy(y2_t))
        loss.backward()
        optimizer_g.step()
        optimizer_f.step()

        # Step B: train classifiers to maximize discrepancy (only optimizer_f steps)
        optimizer_g.zero_grad()
        optimizer_f.zero_grad()

        y1_s, y2_s, y1_t, y2_t = _predict(x)
        loss = F.cross_entropy(y1_s, labels_s) + F.cross_entropy(y2_s, labels_s) + \
               0.01 * (entropy(y1_t) + entropy(y2_t)) - classifier_discrepancy(y1_t, y2_t) * args.trade_off
        loss.backward()
        optimizer_f.step()

        # Step C: train generator to minimize discrepancy (only optimizer_g steps)
        for _ in range(args.num_k):
            optimizer_g.zero_grad()
            y1_s, y2_s, y1_t, y2_t = _predict(x)
            mcd_loss = classifier_discrepancy(y1_t, y2_t) * args.trade_off
            mcd_loss.backward()
            optimizer_g.step()

        # Accuracies use the first head's outputs from the most recent forward pass
        cls_acc = accuracy(y1_s, labels_s)[0]
        tgt_acc = accuracy(y1_t, labels_t)[0]

        n_source = x_s.size(0)
        losses.update(loss.item(), n_source)
        cls_accs.update(cls_acc.item(), n_source)
        tgt_accs.update(tgt_acc.item(), x_t.size(0))
        trans_losses.update(mcd_loss.item(), n_source)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
Beispiel #6
0
def train(opt):
    """Train a shared feature extractor with three pairs of source classifiers.

    Three source domains and one target domain are selected with
    ``taskSelect(opt.target)``.  Every iteration performs three steps:

    1. Update the feature extractor and all six classifiers on the source
       cross-entropy losses plus ``opt.alpha`` times the moment loss.
    2. With detached features, update the classifiers on the source
       cross-entropy minus the discrepancy of each classifier pair on the
       target features.
    3. Update only the feature extractor on the sum of those discrepancies.

    Running statistics are printed and written to TensorBoard every
    ``opt.log_interval`` iterations.  Every ``opt.eval_interval`` iterations
    (unless the target is ``'real'``) the model is evaluated on the target
    test loader and a checkpoint is saved.  At the end the loss and accuracy
    histories are pickled next to ``path_output``.

    Args:
        opt: Options object providing ``target``, ``bs``, ``pretrain``, ``ep``,
            ``alpha``, ``log_interval`` and ``eval_interval``.
    """
    import numpy as np
    from tensorboardX import SummaryWriter
    writer = SummaryWriter(path_output)

    source1, source2, source3, target = taskSelect(opt.target)

    dataset_s1 = dataset.DA(dir=root, name=source1, img_size=(224, 224), train=True)
    dataset_s2 = dataset.DA(dir=root, name=source2, img_size=(224, 224), train=True)
    dataset_s3 = dataset.DA(dir=root, name=source3, img_size=(224, 224), train=True)
    dataset_t = dataset.DA(dir=root, name=target, img_size=(224, 224), train=True)
    dataset_tt = dataset.DA(dir=root, name=target, img_size=(224, 224), train=False, real_val=False)

    dataloader_s1 = DataLoader(dataset_s1, batch_size=opt.bs, shuffle=True, num_workers=2)
    dataloader_s2 = DataLoader(dataset_s2, batch_size=opt.bs, shuffle=True, num_workers=2)
    dataloader_s3 = DataLoader(dataset_s3, batch_size=opt.bs, shuffle=True, num_workers=2)
    dataloader_t = DataLoader(dataset_t, batch_size=opt.bs, shuffle=True, num_workers=2)
    dataloader_tt = DataLoader(dataset_tt, batch_size=opt.bs, shuffle=False, num_workers=2)

    # Sizes of the shortest domain, used only for progress reporting
    len_data = min(len(dataset_s1), len(dataset_s2), len(dataset_s3), len(dataset_t))
    len_bs = min(len(dataloader_s1), len(dataloader_s2), len(dataloader_s3), len(dataloader_t))

    # Define networks
    feature_extractor = models.feature_extractor()
    classifier_1 = models.class_classifier()
    classifier_2 = models.class_classifier()
    classifier_3 = models.class_classifier()
    classifier_1_ = models.class_classifier()
    classifier_2_ = models.class_classifier()
    classifier_3_ = models.class_classifier()

    feature_extractor = feature_extractor.to(device)
    classifier_1 = classifier_1.to(device).apply(weight_init)
    classifier_2 = classifier_2.to(device).apply(weight_init)
    classifier_3 = classifier_3.to(device).apply(weight_init)
    classifier_1_ = classifier_1_.to(device).apply(weight_init)
    classifier_2_ = classifier_2_.to(device).apply(weight_init)
    classifier_3_ = classifier_3_.to(device).apply(weight_init)

    # Define loss
    mom_loss = momentumLoss()
    cl_loss = nn.CrossEntropyLoss()
    disc_loss = discrepancyLoss()

    def _classifier_optimizer(classifiers, lr):
        """Build an SGD optimizer with one parameter group per classifier."""
        return SGD([{'params': clf.parameters()} for clf in classifiers],
                   lr=lr, momentum=0.9, weight_decay=5e-4)

    primary_classifiers = (classifier_1, classifier_2, classifier_3)
    secondary_classifiers = (classifier_1_, classifier_2_, classifier_3_)

    # Optimizers
    optimizer_features = SGD(feature_extractor.parameters(), lr=0.0001, momentum=0.9, weight_decay=5e-4)
    optimizer_classifier = _classifier_optimizer(primary_classifiers, 0.002)
    optimizer_classifier_ = _classifier_optimizer(secondary_classifiers, 0.002)

    # Classifier learning rate applied from the given (1-based) epoch onwards
    classifier_lr_schedule = {5: 0.001, 10: 0.0001}

    if opt.pretrain is not None:
        state = torch.load(opt.pretrain)
        feature_extractor.load_state_dict(state['feature_extractor'])
        classifier_1.load_state_dict(state['{}_classifier'.format(source1)])
        classifier_2.load_state_dict(state['{}_classifier'.format(source2)])
        classifier_3.load_state_dict(state['{}_classifier'.format(source3)])
        classifier_1_.load_state_dict(state['{}_classifier_'.format(source1)])
        classifier_2_.load_state_dict(state['{}_classifier_'.format(source2)])
        classifier_3_.load_state_dict(state['{}_classifier_'.format(source3)])

    # Histories pickled at the end of training
    train_loss = []
    acc_on_target = []

    tot_loss, tot_clf_loss, tot_mom_loss, tot_s2_loss, tot_s3_loss = 0.0, 0.0, 0.0, 0.0, 0.0
    n_samples, iteration = 0, 0
    # Per-classifier count of correct predictions.  This must be an array: with
    # a Python list, ``+=`` would append the new values instead of adding them
    # element-wise, leaving the first six entries at zero forever.
    tot_correct = np.zeros(6)
    saved_time = time.time()
    feature_extractor.train()
    for clf in primary_classifiers + secondary_classifiers:
        clf.train()

    for epoch in range(opt.ep):

        # Recreate the classifier optimizers with a lower learning rate
        new_lr = classifier_lr_schedule.get(epoch + 1)
        if new_lr is not None:
            optimizer_classifier = _classifier_optimizer(primary_classifiers, new_lr)
            optimizer_classifier_ = _classifier_optimizer(secondary_classifiers, new_lr)

        for i, (data_1, data_2, data_3, data_t) in enumerate(zip(dataloader_s1, dataloader_s2, dataloader_s3, dataloader_t)):

            img1, lb1 = data_1
            img2, lb2 = data_2
            img3, lb3 = data_3
            imgt, _ = data_t

            # Prepare data: truncate every batch to the smallest one so that
            # all domains contribute the same number of samples
            cur_batch = min(img1.shape[0], img2.shape[0], img3.shape[0], imgt.shape[0])
            img1, lb1 = img1[:cur_batch].to(device), lb1[:cur_batch].to(device)
            img2, lb2 = img2[:cur_batch].to(device), lb2[:cur_batch].to(device)
            img3, lb3 = img3[:cur_batch].to(device), lb3[:cur_batch].to(device)
            imgt = imgt[:cur_batch].to(device)

            ### STEP 1 ### train G and C pairs
            optimizer_features.zero_grad()
            optimizer_classifier.zero_grad()
            optimizer_classifier_.zero_grad()

            # Extract Features
            ft1 = feature_extractor(img1)
            ft2 = feature_extractor(img2)
            ft3 = feature_extractor(img3)
            ft_t = feature_extractor(imgt)

            # Class Prediction
            cl1, cl1_ = classifier_1(ft1), classifier_1_(ft1)
            cl2, cl2_ = classifier_2(ft2), classifier_2_(ft2)
            cl3, cl3_ = classifier_3(ft3), classifier_3_(ft3)

            # Compute "momentum loss"
            loss_mom = mom_loss(ft1, ft2, ft3, ft_t)

            # Cross entropy loss
            l1, l1_ = cl_loss(cl1, lb1), cl_loss(cl1_, lb1)
            l2, l2_ = cl_loss(cl2, lb2), cl_loss(cl2_, lb2)
            l3, l3_ = cl_loss(cl3, lb3), cl_loss(cl3_, lb3)
            # total loss
            s1loss = l1 + l2 + l3 + l1_ + l2_ + l3_ + opt.alpha * loss_mom

            s1loss.backward()
            optimizer_features.step()
            optimizer_classifier.step()
            optimizer_classifier_.step()

            ### STEP 2 ### fix G, and train C pairs
            optimizer_classifier.zero_grad()
            optimizer_classifier_.zero_grad()

            # Class Prediction on each src domain (features detached so that
            # no gradient reaches the feature extractor)
            cl1, cl1_ = classifier_1(ft1.detach()), classifier_1_(ft1.detach())
            cl2, cl2_ = classifier_2(ft2.detach()), classifier_2_(ft2.detach())
            cl3, cl3_ = classifier_3(ft3.detach()), classifier_3_(ft3.detach())

            # discrepancy on tgt domain
            clt1, clt1_ = classifier_1(ft_t.detach()), classifier_1_(ft_t.detach())
            clt2, clt2_ = classifier_2(ft_t.detach()), classifier_2_(ft_t.detach())
            clt3, clt3_ = classifier_3(ft_t.detach()), classifier_3_(ft_t.detach())

            # classification loss
            l1, l1_ = cl_loss(cl1, lb1), cl_loss(cl1_, lb1)
            l2, l2_ = cl_loss(cl2, lb2), cl_loss(cl2_, lb2)
            l3, l3_ = cl_loss(cl3, lb3), cl_loss(cl3_, lb3)

            dl1 = disc_loss(clt1, clt1_)
            dl2 = disc_loss(clt2, clt2_)
            dl3 = disc_loss(clt3, clt3_)

            # backward: the discrepancy enters with a minus sign
            s2loss = l1 + l2 + l3 + l1_ + l2_ + l3_ - dl1 - dl2 - dl3
            s2loss.backward()
            optimizer_classifier.step()
            optimizer_classifier_.step()

            ### STEP 3 #### fix C pairs, train G
            optimizer_features.zero_grad()

            ft_t = feature_extractor(imgt)
            clt1, clt1_ = classifier_1(ft_t), classifier_1_(ft_t)
            clt2, clt2_ = classifier_2(ft_t), classifier_2_(ft_t)
            clt3, clt3_ = classifier_3(ft_t), classifier_3_(ft_t)

            dl1 = disc_loss(clt1, clt1_)
            dl2 = disc_loss(clt2, clt2_)
            dl3 = disc_loss(clt3, clt3_)

            s3loss = dl1 + dl2 + dl3
            s3loss.backward()
            optimizer_features.step()

            # Per-classifier source accuracy, from the STEP 2 predictions
            pred = torch.stack((cl1, cl2, cl3, cl1_, cl2_, cl3_), 0)  # [6, bs, classes]
            _, pred = torch.max(pred, dim=2)  # [6, bs]
            gt = torch.stack((lb1, lb2, lb3, lb1, lb2, lb3), 0)  # [6, bs]
            correct = pred.eq(gt.data)
            correct = correct.float().mean(dim=1).cpu().numpy()

            tot_loss += s1loss.item() * cur_batch
            tot_clf_loss += (s1loss.item() - opt.alpha * loss_mom.item()) * cur_batch
            tot_s2_loss += s2loss.item() * cur_batch
            tot_s3_loss += s3loss.item() * cur_batch
            tot_mom_loss += loss_mom.item() * cur_batch
            tot_correct += correct * cur_batch
            n_samples += cur_batch

            if iteration % opt.log_interval == 0:
                current_time = time.time()
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tClfLoss: {:.4f}\tMMLoss: {:.4f}\t \
                    S2Loss: {:.4f}\tS3Loss: {:.4f}\t \
                    Accu: {:.4f}\\{:.4f}\\{:.4f}\\{:.4f}\\{:.4f}\\{:.4f}\tTime: {:.3f}'.format(\
                        epoch, i * opt.bs, len_data, 100. * i / len_bs, \
                        tot_clf_loss / n_samples,
                        tot_mom_loss / n_samples,
                        tot_s2_loss / n_samples,
                        tot_s3_loss / n_samples,
                        tot_correct[0] / n_samples,
                        tot_correct[1] / n_samples,
                        tot_correct[2] / n_samples,
                        tot_correct[3] / n_samples,
                        tot_correct[4] / n_samples,
                        tot_correct[5] / n_samples,
                        current_time - saved_time))
                writer.add_scalar('Train/ClfLoss', tot_clf_loss / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/MMLoss', tot_mom_loss / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/s2Loss', tot_s2_loss / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/s3Loss', tot_s3_loss / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/Accu0', tot_correct[0] / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/Accu1', tot_correct[1] / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/Accu2', tot_correct[2] / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/Accu0_', tot_correct[3] / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/Accu1_', tot_correct[4] / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/Accu2_', tot_correct[5] / n_samples, iteration * opt.bs)

                # Weights passed to evaluation: each classifier's share of the
                # correct predictions, or uniform if nothing was correct
                saved_weight = torch.from_numpy(tot_correct).float().to(device)
                if torch.sum(saved_weight) == 0.:
                    saved_weight = torch.full((6,), 1.0 / 6, device=device)
                else:
                    saved_weight = saved_weight / torch.sum(saved_weight)

                # Restart the logging window (tot_loss is deliberately left
                # untouched here, so train_loss records a running total)
                saved_time = time.time()
                tot_clf_loss, tot_mom_loss, n_samples = 0, 0, 0
                tot_correct = np.zeros(6)
                tot_s2_loss, tot_s3_loss = 0, 0
                train_loss.append(tot_loss)

            # evaluation and save
            if iteration % opt.eval_interval == 0 and target != 'real':
                print('weight = ', saved_weight.cpu().numpy())
                # NOTE: ``eval`` here is the project's evaluation function,
                # which shadows the builtin of the same name.
                evalacc = eval(saved_weight, feature_extractor, classifier_1_, classifier_2_, classifier_3_,
                               classifier_1, classifier_2, classifier_3, dataloader_tt)
                writer.add_scalar('Test/Accu', evalacc, iteration * opt.bs)
                acc_on_target.append(evalacc)
                print('Eval Acc = {:.2f}\n'.format(evalacc*100))
                torch.save({
                        'epoch': epoch,
                        'feature_extractor': feature_extractor.state_dict(),
                        '{}_classifier'.format(source1): classifier_1.state_dict(),
                        '{}_classifier'.format(source2): classifier_2.state_dict(),
                        '{}_classifier'.format(source3): classifier_3.state_dict(),
                        '{}_classifier_'.format(source1): classifier_1_.state_dict(),
                        '{}_classifier_'.format(source2): classifier_2_.state_dict(),
                        '{}_classifier_'.format(source3): classifier_3_.state_dict(),
                        'features_optimizer': optimizer_features.state_dict(),
                        'classifier_optimizer': optimizer_classifier.state_dict(),
                        'loss': tot_loss,
                        'saved_weight': saved_weight
               }, os.path.join(path_output, target + '-{}-{:.2f}.pth'.format(epoch, evalacc*100)))

            iteration += 1

    # Use context managers so the pickle files are flushed and closed
    with open('{}train_loss.p'.format(path_output), 'wb') as fh:
        pkl.dump(train_loss, fh)
    if target != 'real':
        with open('{}target_accuracy.p'.format(path_output), 'wb') as fh:
            pkl.dump(acc_on_target, fh)
Beispiel #7
0
def main():
    """Train a ``Predictor`` with SGD on an MSE loss, evaluate it, and save test outputs as CSV.

    The script builds training/validation sets with ``mkDataSet``, runs
    ``epochs_num`` epochs of mini-batch training, reports the MSE of the last
    training batch and of the last validation batch per epoch, then runs the
    model over a test set from ``mkTestSet`` and writes every flattened output
    to a two-column (``x,y``) CSV file at a hard-coded path.
    """
    # Experiment sizes; the value in parentheses is the alternative the
    # original author noted next to each setting.
    training_size = 1  # (10000)
    valid_size = 1  # (1000)
    test_size = 15  # (1000)
    epochs_num = 1  # (1000)
    hidden_size = 60  # (5)
    batch_size = 1  # (100)
    data_length = 60

    train_x, train_t = mkDataSet(training_size)
    valid_x, valid_t = mkDataSet(valid_size)

    model = Predictor(2, hidden_size, 2)
    criterion = nn.MSELoss()
    optimizer = SGD(model.parameters(), lr=0.01)

    train_steps = int(training_size / batch_size)
    valid_steps = int(valid_size / batch_size)

    for epoch in range(epochs_num):
        # ---- training ----
        running_loss = 0.0
        training_accuracy = 0.0
        for _ in range(train_steps):
            optimizer.zero_grad()  # reset gradients

            batch_x, batch_t = mkRandomBatch(train_x, train_t, batch_size)

            prediction = model(batch_x)  # forward pass

            loss = criterion(prediction, batch_t)  # compute the loss
            loss.backward()  # compute gradients
            optimizer.step()  # update parameters

            running_loss += loss.item()
            # Error measured as MSE; only the value of the last batch survives.
            training_accuracy = mean_squared_error(
                np.ravel(prediction.data), np.ravel(batch_t.data))

        # ---- validation ----
        test_accuracy = 0.0
        for step in range(valid_steps):
            lo = step * batch_size
            hi = lo + batch_size
            batch_x = torch.FloatTensor(valid_x[lo:hi])
            batch_t = torch.FloatTensor(valid_t[lo:hi])
            prediction = model(batch_x, None)

            # As above, this keeps the MSE of the last validation batch only.
            test_accuracy = mean_squared_error(
                np.ravel(prediction.data), np.ravel(batch_t.data))

        epoch_report = (epoch + 1, running_loss, training_accuracy,
                        test_accuracy)
        print('%d loss: %.3f, training_accuracy: %.5f, valid_accuracy: %.5f' %
              epoch_report)

    # ---- test ----
    # The test labels are the ground truth, so the model output is compared
    # against label.data.
    test_accuracy = 0.0
    test_x, test_t = mkTestSet(test_size)
    result = []
    test_steps = int(test_size / batch_size)
    for step in range(test_steps):
        lo = step * batch_size
        hi = lo + batch_size
        batch_x = torch.FloatTensor(test_x[lo:hi])
        batch_t = torch.FloatTensor(test_t[lo:hi])
        prediction = model(batch_x, None)
        test_accuracy = mean_squared_error(np.ravel(prediction.data),
                                           np.ravel(batch_t.data))
        result.append(prediction.data.numpy().flatten())
    final_report = (epoch + 1, running_loss, training_accuracy, test_accuracy)
    print('%d loss: %.3f, training_accuracy: %.5f, test_accuracy: %.5f' %
          final_report)

    # Flatten every collected output, then force the (rows, 2) layout that the
    # CSV expects.
    data_np = np.asarray(result).flatten()
    print(len(data_np))
    print(data_np)
    data_np = np.resize(data_np, (test_size * data_length, 2))

    output_csv = "C:\\Users\\010170243\\work\\seq2seq\\dataset\\kusakaGomiToCSV\\all\\kusaka_result.csv"
    np.savetxt(
        output_csv,  # file name
        X=data_np,  # array to save
        delimiter=",",  # column separator
        fmt='%.15f',
        header="x,y",  # header row
    )
Beispiel #8
0
class BaseModel(object):
    """Training and link-prediction evaluation wrapper around a ``KGEModule``.

    The wrapped module is moved to the GPU on construction, and the training
    and evaluation batches are moved there with ``.cuda()`` as well.
    """

    def __init__(self, n_ent, n_rel, args, struct):
        """Build the ``KGEModule`` and remember the problem sizes.

        Args:
            n_ent: number of entities (also used to reshape entity parameters
                in ``prox_operator``).
            n_rel: number of relations.
            args: hyper-parameter namespace; this class reads ``optim``,
                ``lr``, ``decay_rate``, ``n_epoch``, ``n_batch``, ``lamb``,
                ``epoch_per_test``, ``thres`` and ``test_batch_size``.
            struct: forwarded unchanged to ``KGEModule``.
        """
        self.model = KGEModule(n_ent, n_rel, args, struct)
        self.model.cuda()

        self.n_ent = n_ent
        self.n_rel = n_rel
        self.time_tot = 0  # accumulated wall-clock training time in seconds
        self.args = args

    def train(self, train_data, tester_val, tester_tst):
        """Train the model and track the best validation MRR.

        Args:
            train_data: ``(head, tail, rela)`` index tensors of equal length.
            tester_val: zero-argument callable returning three values, used
                here as ``(mrr, mr, hits@10)`` on the validation split.
            tester_tst: same as ``tester_val`` for the test split.

        Returns:
            ``(best_mrr, best_str)`` where ``best_str`` is the formatted
            result line of the best validation round, or an empty string if
            no round improved on the initial ``best_mrr`` of 0 (including the
            case where no evaluation round ran at all).
        """
        head, tail, rela = train_data
        n_train = len(head)

        # Any value other than the adam/adagrad spellings falls back to SGD.
        if self.args.optim in ('adam', 'Adam'):
            self.optimizer = Adam(self.model.parameters(), lr=self.args.lr)
        elif self.args.optim in ('adagrad', 'Adagrad'):
            self.optimizer = Adagrad(self.model.parameters(), lr=self.args.lr)
        else:
            self.optimizer = SGD(self.model.parameters(), lr=self.args.lr)

        scheduler = ExponentialLR(self.optimizer, self.args.decay_rate)

        n_epoch = self.args.n_epoch
        n_batch = self.args.n_batch
        best_mrr = 0
        # Initialised up front so the final return cannot hit an unbound
        # local when no evaluation round ever improves on best_mrr.
        best_str = ''

        for epoch in range(n_epoch):
            start = time.time()

            self.epoch = epoch
            # Reshuffle the triplets every epoch.
            rand_idx = torch.randperm(n_train)
            head = head[rand_idx].cuda()
            tail = tail[rand_idx].cuda()
            rela = rela[rand_idx].cuda()

            epoch_loss = 0

            for h, t, r in batch_by_size(n_batch,
                                         head,
                                         tail,
                                         rela,
                                         n_sample=n_train):
                self.model.zero_grad()

                loss = self.model.forward(h, t, r)
                loss += self.args.lamb * self.model.regul
                loss.backward()
                self.optimizer.step()
                # Project entity embeddings back after every update.
                self.prox_operator()
                epoch_loss += loss.data.cpu().numpy()

            # Only the optimisation time is accounted, not evaluation.
            self.time_tot += time.time() - start
            scheduler.step()

            if (epoch + 1) % self.args.epoch_per_test == 0:
                # output performance
                valid_mrr, valid_mr, valid_10 = tester_val()
                test_mrr, test_mr, test_10 = tester_tst()
                out_str = '%.4f\t%.4f\t\t%.4f\t%.4f\n' % (valid_mrr, valid_10,
                                                          test_mrr, test_10)

                # output the best performance info
                if valid_mrr > best_mrr:
                    best_mrr = valid_mrr
                    best_str = out_str
                # Give up as soon as the best MRR so far is below the
                # configured threshold at an evaluation point.
                if best_mrr < self.args.thres:
                    print(
                        '\tearly stopped in Epoch:{}, best_mrr:{}'.format(
                            epoch + 1, best_mrr), self.model.struct)
                    return best_mrr, best_str
        return best_mrr, best_str

    def prox_operator(self, ):
        """Rescale entity parameter rows so that no row has L2 norm above 1.

        Applies to every parameter whose name contains ``'ent'``. Rows whose
        norm is below 1 are left unchanged (they are divided by 1).
        """
        for n, p in self.model.named_parameters():
            if 'ent' in n:
                X = p.data.clone()
                Z = torch.norm(X, p=2, dim=1, keepdim=True)
                Z[Z < 1] = 1  # do not scale up rows already inside the ball
                X = X / Z
                p.data.copy_(X.view(self.n_ent, -1))

    def test_link(self, test_data, head_filter, tail_filter):
        """Score head and tail prediction for ``test_data`` and aggregate ranks.

        Args:
            test_data: ``(heads, tails, relas)`` index tensors; ``heads`` and
                ``tails`` are converted with ``.numpy()``, so they are
                expected to live on the CPU.
            head_filter: array multiplied element-wise into the head scores.
            tail_filter: array multiplied element-wise into the tail scores.

        Returns:
            ``(mrr, mr, h10)``: each the mean of the head-side and tail-side
            value returned by ``cal_performance``.
        """
        heads, tails, relas = test_data
        batch_size = self.args.test_batch_size
        # Ceiling division: one extra batch for a non-empty remainder.
        num_batch = len(heads) // batch_size + int(len(heads) % batch_size > 0)

        head_probs = []
        tail_probs = []
        for i in range(num_batch):
            start = i * batch_size
            end = min((i + 1) * batch_size, len(heads))
            batch_h = heads[start:end].cuda()
            batch_t = tails[start:end].cuda()
            batch_r = relas[start:end].cuda()

            h_embed = self.model.ent_embed(batch_h)
            r_embed = self.model.rel_embed(batch_r)
            t_embed = self.model.ent_embed(batch_t)

            head_scores = torch.sigmoid(self.model.test_head(r_embed,
                                                             t_embed)).data
            tail_scores = torch.sigmoid(self.model.test_tail(h_embed,
                                                             r_embed)).data

            head_probs.append(head_scores.data.cpu().numpy())
            tail_probs.append(tail_scores.data.cpu().numpy())

        head_probs = np.concatenate(head_probs) * head_filter
        tail_probs = np.concatenate(tail_probs) * tail_filter
        head_ranks = cal_ranks(head_probs, label=heads.data.numpy())
        tail_ranks = cal_ranks(tail_probs, label=tails.data.numpy())
        h_mrr, h_mr, h_h10 = cal_performance(head_ranks)
        t_mrr, t_mr, t_h10 = cal_performance(tail_ranks)
        mrr = (h_mrr + t_mrr) / 2
        mr = (h_mr + t_mr) / 2
        h10 = (h_h10 + t_h10) / 2
        return mrr, mr, h10
Beispiel #9
0
def train(train_source_iter: ForeverDataIterator, train_target_iter: ForeverDataIterator, model: ImageClassifier,
          jmmd_loss: JointMultipleKernelMaximumMeanDiscrepancy, optimizer: SGD,
          lr_sheduler: StepwiseLR, epoch: int, args: argparse.Namespace):
    """Run one epoch of joint classification + JMMD transfer training.

    Each of the ``args.iters_per_epoch`` iterations draws one source and one
    target batch, pushes both through ``model`` in a single forward pass, and
    minimises ``cross_entropy(source) + args.trade_off * jmmd_loss``. Running
    statistics are shown every ``args.print_freq`` iterations.

    Target labels are used only to report target accuracy; they do not enter
    the loss.
    """
    time_meter = AverageMeter('Time', ':4.2f')
    load_meter = AverageMeter('Data', ':3.1f')
    loss_meter = AverageMeter('Loss', ':3.2f')
    transfer_meter = AverageMeter('Trans Loss', ':5.4f')
    src_acc_meter = AverageMeter('Cls Acc', ':3.1f')
    tgt_acc_meter = AverageMeter('Tgt Acc', ':3.1f')

    all_meters = [time_meter, load_meter, loss_meter, transfer_meter,
                  src_acc_meter, tgt_acc_meter]
    progress = ProgressMeter(args.iters_per_epoch, all_meters,
                             prefix="Epoch: [{}]".format(epoch))

    # Put both the network and the adaptation loss module in training mode.
    model.train()
    jmmd_loss.train()

    tick = time.time()
    for step in range(args.iters_per_epoch):
        lr_sheduler.step()

        # Time spent since the previous iteration ended (reported as 'Data').
        load_meter.update(time.time() - tick)

        src_x, src_labels = next(train_source_iter)
        tgt_x, tgt_labels = next(train_target_iter)

        src_x = src_x.to(device)
        tgt_x = tgt_x.to(device)
        src_labels = src_labels.to(device)
        tgt_labels = tgt_labels.to(device)

        # One forward pass over the concatenated batch, then split the
        # predictions and features back into source / target halves.
        logits, features = model(torch.cat((src_x, tgt_x), dim=0))
        src_logits, tgt_logits = logits.chunk(2, dim=0)
        src_feat, tgt_feat = features.chunk(2, dim=0)

        cls_loss = F.cross_entropy(src_logits, src_labels)
        src_pair = (src_feat, F.softmax(src_logits, dim=1))
        tgt_pair = (tgt_feat, F.softmax(tgt_logits, dim=1))
        transfer_loss = jmmd_loss(src_pair, tgt_pair)
        loss = cls_loss + transfer_loss * args.trade_off

        src_acc = accuracy(src_logits, src_labels)[0]
        tgt_acc = accuracy(tgt_logits, tgt_labels)[0]

        n_src = src_x.size(0)
        loss_meter.update(loss.item(), n_src)
        src_acc_meter.update(src_acc.item(), n_src)
        tgt_acc_meter.update(tgt_acc.item(), tgt_x.size(0))
        transfer_meter.update(transfer_loss.item(), n_src)

        # Back-propagate and apply the SGD update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Full iteration time, then restart the clock.
        time_meter.update(time.time() - tick)
        tick = time.time()

        if step % args.print_freq == 0:
            progress.display(step)
Beispiel #10
0
def main():
    """Train and evaluate a bi-directional RNN-CNN sequence tagger on CoNLL-03 NER.

    Parses command-line options, builds the alphabets and tensors for the
    train/dev/test files, constructs a ``BiRecurrentConv`` (or its variational
    dropout variant), and trains it with Nesterov SGD. After every epoch the
    dev set is scored; whenever dev F1 improves, the test set is scored too.
    Every ``--schedule`` epochs the learning rate is decayed and the optimizer
    is rebuilt.
    """
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional RNN-CNN')
    parser.add_argument('--mode',
                        choices=['RNN', 'LSTM', 'GRU'],
                        help='architecture of rnn',
                        required=True)
    parser.add_argument('--cuda', action='store_true', help='using GPU')
    parser.add_argument('--num_epochs',
                        type=int,
                        default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=16,
                        help='Number of sentences in each batch')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=128,
                        help='Number of hidden units in RNN')
    parser.add_argument('--tag_space',
                        type=int,
                        default=0,
                        help='Dimension of tag space')
    parser.add_argument('--num_layers',
                        type=int,
                        default=1,
                        help='Number of layers of RNN')
    parser.add_argument('--num_filters',
                        type=int,
                        default=30,
                        help='Number of filters in CNN')
    parser.add_argument('--char_dim',
                        type=int,
                        default=30,
                        help='Dimension of Character embeddings')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help='Learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.0,
                        help='weight for regularization')
    parser.add_argument('--dropout',
                        choices=['std', 'variational'],
                        help='type of dropout',
                        required=True)
    parser.add_argument('--p_rnn',
                        nargs=2,
                        type=float,
                        required=True,
                        help='dropout rate for RNN')
    parser.add_argument('--p_in',
                        type=float,
                        default=0.33,
                        help='dropout rate for input embeddings')
    parser.add_argument('--p_out',
                        type=float,
                        default=0.33,
                        help='dropout rate for output layer')
    # NOTE: there is no default, but the value is later formatted with %d and
    # used as a modulus, so in practice this option has to be supplied.
    parser.add_argument('--schedule',
                        type=int,
                        help='schedule for learning rate decay')
    parser.add_argument('--unk_replace',
                        type=float,
                        default=0.,
                        help='The rate to replace a singleton word with UNK')
    parser.add_argument('--embedding',
                        choices=['glove', 'senna', 'sskip', 'polyglot'],
                        help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument(
        '--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument(
        '--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument(
        '--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    logger = get_logger("NER")

    mode = args.mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    gamma = args.gamma
    schedule = args.schedule
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_out = args.p_out
    unk_replace = args.unk_replace
    embedding = args.embedding
    embedding_path = args.embedding_dict

    embedd_dict, embedd_dim = utils.load_embedding_dict(
        embedding, embedding_path)

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, pos_alphabet, \
    chunk_alphabet, ner_alphabet = conll03_data.create_alphabets("data/alphabets/ner/", train_path, data_paths=[dev_path, test_path],
                                                                 embedd_dict=embedd_dict, max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    logger.info("Reading Data")
    device = torch.device('cuda') if args.cuda else torch.device('cpu')

    data_train = conll03_data.read_data_to_tensor(train_path,
                                                  word_alphabet,
                                                  char_alphabet,
                                                  pos_alphabet,
                                                  chunk_alphabet,
                                                  ner_alphabet,
                                                  device=device)
    # presumably data_train[1] holds per-bucket sample counts -- verify
    # against conll03_data.read_data_to_tensor
    num_data = sum(data_train[1])
    num_labels = ner_alphabet.size()

    data_dev = conll03_data.read_data_to_tensor(dev_path,
                                                word_alphabet,
                                                char_alphabet,
                                                pos_alphabet,
                                                chunk_alphabet,
                                                ner_alphabet,
                                                device=device)
    data_test = conll03_data.read_data_to_tensor(test_path,
                                                 word_alphabet,
                                                 char_alphabet,
                                                 pos_alphabet,
                                                 chunk_alphabet,
                                                 ner_alphabet,
                                                 device=device)

    writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet,
                           chunk_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        """Build the initial word-embedding matrix as a float32 tensor.

        Words found in ``embedd_dict`` (exact or lower-cased) get their
        pretrained vector; all others get a uniform random vector and are
        counted as out-of-vocabulary.
        """
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conll03_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                embedding = embedd_dict[word]
            elif word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(
                    -scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    char_dim = args.char_dim
    window = 3
    num_layers = args.num_layers
    tag_space = args.tag_space
    initializer = nn.init.xavier_uniform_
    if args.dropout == 'std':
        network = BiRecurrentConv(embedd_dim,
                                  word_alphabet.size(),
                                  char_dim,
                                  char_alphabet.size(),
                                  num_filters,
                                  window,
                                  mode,
                                  hidden_size,
                                  num_layers,
                                  num_labels,
                                  tag_space=tag_space,
                                  embedd_word=word_table,
                                  p_in=p_in,
                                  p_out=p_out,
                                  p_rnn=p_rnn,
                                  initializer=initializer)
    else:
        network = BiVarRecurrentConv(embedd_dim,
                                     word_alphabet.size(),
                                     char_dim,
                                     char_alphabet.size(),
                                     num_filters,
                                     window,
                                     mode,
                                     hidden_size,
                                     num_layers,
                                     num_labels,
                                     tag_space=tag_space,
                                     embedd_word=word_table,
                                     p_in=p_in,
                                     p_out=p_out,
                                     p_rnn=p_rnn,
                                     initializer=initializer)

    network = network.to(device)

    lr = learning_rate
    # optim = Adam(network.parameters(), lr=lr, betas=(0.9, 0.9), weight_decay=gamma)
    optim = SGD(network.parameters(),
                lr=lr,
                momentum=momentum,
                weight_decay=gamma,
                nesterov=True)
    logger.info(
        "Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d" %
        (mode, num_layers, hidden_size, num_filters, tag_space))
    logger.info(
        "training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)"
        % (gamma, num_data, batch_size, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" %
                (p_in, p_out, p_rnn))

    # Floor division keeps num_batches an int; true division would produce a
    # float under Python 3 and make the range() call below raise TypeError.
    num_batches = num_data // batch_size + 1
    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): '
            % (epoch, mode, args.dropout, lr, decay_rate, schedule))
        train_err = 0.
        train_corr = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0  # length of the last progress line, for erasing it
        network.train()
        for batch in range(1, num_batches + 1):
            word, char, _, _, labels, masks, lengths = conll03_data.get_batch_tensor(
                data_train, batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss, corr, _ = network.loss(
                word,
                char,
                labels,
                mask=masks,
                length=lengths,
                leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
            loss.backward()
            optim.step()

            # Token-weighted running statistics, kept out of the autograd graph.
            with torch.no_grad():
                num_tokens = masks.sum()
                train_err += loss * num_tokens
                train_corr += corr
                train_total += num_tokens

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            if batch % 100 == 0:
                # Erase the previous progress line in place before rewriting it.
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                    batch, num_batches, train_err / train_total,
                    train_corr * 100 / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, acc: %.2f%%, time: %.2fs' %
              (num_batches, train_err / train_total,
               train_corr * 100 / train_total, time.time() - start_time))

        # evaluate performance on dev data
        with torch.no_grad():
            network.eval()
            # NOTE(review): `uid` is not defined inside this function;
            # presumably a module-level identifier -- confirm.
            tmp_filename = 'tmp/%s_dev%d' % (str(uid), epoch)
            writer.start(tmp_filename)

            for batch in conll03_data.iterate_batch_tensor(
                    data_dev, batch_size):
                word, char, pos, chunk, labels, masks, lengths = batch
                _, _, preds = network.loss(
                    word,
                    char,
                    labels,
                    mask=masks,
                    length=lengths,
                    leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
                writer.write(word.cpu().numpy(),
                             pos.cpu().numpy(),
                             chunk.cpu().numpy(),
                             preds.cpu().numpy(),
                             labels.cpu().numpy(),
                             lengths.cpu().numpy())
            writer.close()
            acc, precision, recall, f1 = evaluate(tmp_filename)
            print(
                'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                % (acc, precision, recall, f1))

            if dev_f1 < f1:
                dev_f1 = f1
                dev_acc = acc
                dev_precision = precision
                dev_recall = recall
                best_epoch = epoch

                # evaluate on test data when better performance detected
                tmp_filename = 'tmp/%s_test%d' % (str(uid), epoch)
                writer.start(tmp_filename)

                for batch in conll03_data.iterate_batch_tensor(
                        data_test, batch_size):
                    word, char, pos, chunk, labels, masks, lengths = batch
                    _, _, preds = network.loss(
                        word,
                        char,
                        labels,
                        mask=masks,
                        length=lengths,
                        leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
                    writer.write(word.cpu().numpy(),
                                 pos.cpu().numpy(),
                                 chunk.cpu().numpy(),
                                 preds.cpu().numpy(),
                                 labels.cpu().numpy(),
                                 lengths.cpu().numpy())
                writer.close()
                test_acc, test_precision, test_recall, test_f1 = evaluate(
                    tmp_filename)

            print(
                "best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
                % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
            print(
                "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
                % (test_acc, test_precision, test_recall, test_f1, best_epoch))

        # Decay the learning rate on schedule. Rebuilding the optimizer also
        # discards its accumulated momentum buffers.
        if epoch % schedule == 0:
            lr = learning_rate / (1.0 + epoch * decay_rate)
            optim = SGD(network.parameters(),
                        lr=lr,
                        momentum=momentum,
                        weight_decay=gamma,
                        nesterov=True)
Beispiel #11
0
def train(
        cfg,
        img_size=416,
        resume=False,
        epochs=273,  # 500200 batches at bs 64, dataset length 117263
        batch_size=16,
        accumulate=1,
        multi_scale=False,
        freeze_backbone=False,
        num_workers=4,
        transfer=False  # Transfer learning (train only YOLO layers)
):
    """Train a Darknet/YOLO detector on the DFSign dataset.

    Args:
        cfg: model configuration passed to ``Darknet``; a name containing
            ``'-tiny.cfg'`` selects the tiny backbone weights.
        img_size: training image size (overridden to 608 when ``multi_scale``).
        resume: load a checkpoint instead of initialising from backbone weights.
        epochs: index of the epoch at which training stops (exclusive).
        batch_size: dataloader batch size.
        accumulate: number of batches whose gradients are summed before each
            optimizer step.
        multi_scale: pick a new random image size (a multiple of 32 in
            320..608) every 10 batches.
        freeze_backbone: freeze layers below ``cutoff`` during epoch 0 and
            unfreeze them at epoch 1.
        num_workers: dataloader worker count.
        transfer: only consulted when ``resume`` is true; loads
            ``yolov3-spp.pt`` without its YOLO output layers and trains only
            parameters whose first dimension equals the YOLO layer size.

    A checkpoint is written to ``weights/epoch_tt100k_XXX.pt`` every 5 epochs.
    """
    weights = 'weights' + os.sep
    latest = weights + 'latest.pt'
    best = weights + 'best.pt'
    device = torch_utils.select_device()

    if multi_scale:
        img_size = 608  # initiate with maximum multi_scale size
    else:
        torch.backends.cudnn.benchmark = True  # unsuitable for multiscale

    # Initialize model
    model = Darknet(cfg, img_size).to(device)

    # Optimizer
    lr0 = 0.001  # initial learning rate
    optimizer = SGD(model.parameters(),
                    lr=lr0,
                    momentum=0.9,
                    weight_decay=0.0005)

    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0
    best_loss = float('inf')
    yl = get_yolo_layers(model)  # yolo layers
    nf = int(model.module_defs[yl[0] -
                               1]['filters'])  # yolo layer size (i.e. 255)

    if resume:  # Load previously saved model
        if transfer:  # Transfer learning
            chkpt = torch.load(weights + 'yolov3-spp.pt', map_location=device)
            # Skip scalar tensors and the 255-wide output layers.
            model.load_state_dict(
                {
                    k: v
                    for k, v in chkpt['model'].items()
                    if v.numel() > 1 and v.shape[0] != 255
                },
                strict=False)
            for p in model.parameters():
                p.requires_grad = True if p.shape[0] == nf else False
        else:  # resume from latest.pt
            chkpt = torch.load(latest, map_location=device)  # load checkpoint
            model.load_state_dict(chkpt['model'])

        start_epoch = chkpt['epoch'] + 1
        if chkpt['optimizer'] is not None:
            optimizer.load_state_dict(chkpt['optimizer'])
            best_loss = chkpt['best_loss']
        del chkpt
    else:  # Initialize model with backbone (optional)
        if '-tiny.cfg' in cfg:
            cutoff = load_darknet_weights(model,
                                          weights + 'yolov3-tiny.conv.15')
        else:
            cutoff = load_darknet_weights(model, weights + 'darknet53.conv.74')

    # multi gpus
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model,
                                      device_ids=list(
                                          range(torch.cuda.device_count())))

    # Set scheduler
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[20, 40],
                                         gamma=0.1,
                                         last_epoch=start_epoch - 1)

    # Dataset
    # train_dataset = VOCDetection(root=os.path.join('~', 'data', 'VOCdevkit'), img_size=img_size, mode='train')
    train_dataset = DFSignDetection(root=os.path.join('~', 'data', 'dfsign',
                                                      'dfsign_chip_voc'),
                                    img_size=img_size,
                                    mode='train')

    # Dataloader
    dataloader = DataLoader(train_dataset,
                            batch_size=batch_size,
                            num_workers=num_workers,
                            shuffle=True,
                            pin_memory=True,
                            collate_fn=train_dataset.collate_fn)

    # Start training
    t = time.time()
    # model_info(model)
    nB = len(dataloader)
    n_burnin = nB  # burn-in batches
    for epoch in range(start_epoch, epochs):
        model.train()
        print(
            ('\n%8s%12s' + '%10s' * 7) % ('Epoch', 'Batch', 'xy', 'wh', 'conf',
                                          'cls', 'total', 'nTargets', 'time'))

        # Update scheduler
        scheduler.step()

        # Freeze backbone at epoch 0, unfreeze at epoch 1
        if freeze_backbone and epoch < 2:
            for name, p in model.named_parameters():
                if int(name.split('.')[1]) < cutoff:  # if layer < 75
                    p.requires_grad = False if epoch == 0 else True

        mloss = defaultdict(float)  # mean loss

        # Gradients are cleared here and after every optimizer step (not on
        # every batch), so that `accumulate` batches really sum up.
        optimizer.zero_grad()
        for i, (imgs, targets, _, paths) in enumerate(dataloader):
            imgs = imgs.to(device)
            targets = targets.to(device)

            nt = len(targets)
            if nt == 0:  # if no targets continue
                continue

            # SGD burn-in: ramp lr from 0 to lr0 with a quartic curve during
            # the first epoch.
            if epoch == 0 and i <= n_burnin:
                lr = lr0 * (i / n_burnin)**4
                for x in optimizer.param_groups:
                    x['lr'] = lr

            # Run model
            pred = model(imgs)
            # Build targets
            target_list = build_targets(model, targets)
            # Compute loss
            loss, loss_dict = compute_loss(pred, target_list)
            loss.backward()

            # Accumulate gradient for x batches before optimizing
            if (i + 1) % accumulate == 0 or (i + 1) == nB:
                optimizer.step()
                optimizer.zero_grad()

            # Running epoch-means of tracked metrics
            for key, val in loss_dict.items():
                mloss[key] = (mloss[key] * i + val) / (i + 1)

            s = ('%8s%12s' + '%10.3g' * 7) % (
                '%g/%g' % (epoch, epochs - 1), '%g/%g' %
                (i, nB - 1), mloss['xy'], mloss['wh'], mloss['conf'],
                mloss['cls'], mloss['total'], nt, time.time() - t)
            t = time.time()
            if i % 30 == 0:
                print(s)

            # Multi-Scale training (320 - 608 pixels) every 10 batches
            if multi_scale and (i + 1) % 10 == 0:
                # Resize through the dataset object the dataloader reads from.
                train_dataset.img_size = random.choice(range(10, 20)) * 32
                print('multi_scale img_size = %g' % train_dataset.img_size)

        # Update best loss
        if mloss['total'] < best_loss:
            best_loss = mloss['total']

        # Save latest checkpoint
        checkpoint = {
            'epoch':
            epoch,
            'best_loss':
            best_loss,
            'model':
            model.module.state_dict()
            if type(model) is nn.parallel.DataParallel else model.state_dict(),
            'optimizer':
            optimizer.state_dict()
        }
        if epoch % 5 == 0:
            torch.save(checkpoint, 'weights/epoch_tt100k_%03d.pt' % epoch)

        # Periodic mAP evaluation is currently disabled.
        # if epoch > 9 and epoch % 10 == 0:
        if False:
            with torch.no_grad():
                APs, mAP = test.test(cfg,
                                     weights=None,
                                     batch_size=32,
                                     img_size=img_size,
                                     model=model)
                pprint(APs)
                print(mAP)

        del checkpoint
class GoogLeNet(nn.Module):
    @staticmethod
    def init_weights(m):
        if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
            truncated_normal_(m.weight)
            # nn.init.kaiming_normal_(
            #     tensor=m.weight,
            #     mode='fan_out',
            #     nonlinearity='relu'
            # )
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)

    def __init__(self, num_classes: int, enable_aux=False, conv_type=None, verbose=False):
        """Build the GoogLeNet (Inception v1) layer stack.

        The shape comments below assume a 224x224x3 input (RGB, zero mean) and
        use ``floor((n - k + 2*p) / s) + 1`` for the spatial output size.

        Args:
            num_classes: Number of outputs of the final linear layer; also
                forwarded to the auxiliary classifiers.
            enable_aux: When true, two ``AuxInception`` heads are created (fed
                with 512 and 528 channels respectively); otherwise ``aux1``
                and ``aux2`` are ``None``.
            conv_type: Forwarded unchanged to every ``Inception`` block.
            verbose: When true, ``train`` prints progress messages.
        """
        super(GoogLeNet, self).__init__()
        # Training helpers; populated later by ``initialize``.
        self.criterion = None
        self.optimizer = None
        self.scheduler = None
        self.enable_aux = enable_aux
        self.verbose = verbose

        # 224x224x3 -> 112x112x64
        # kernel 7x7, padding 3, stride 2: floor((224 - 7 + 2*3)/2) + 1 = 112
        self.conv1 = nn.Conv2d(
            in_channels=3, out_channels=64,
            kernel_size=(7, 7), stride=2, padding=(3, 3)
        )

        # 112x112x64 -> 56x56x64
        # kernel 3x3, padding 1, stride 2: floor((112 - 3 + 2*1)/2) + 1 = 56
        self.maxPooling1 = nn.MaxPool2d(
            kernel_size=(3, 3), stride=2, padding=(1, 1)
        )

        # 56x56x64 -> 56x56x64
        # kernel 1x1, padding 0, stride 1: floor((56 - 1 + 0)/1) + 1 = 56
        self.conv2_1 = nn.Conv2d(
            in_channels=64, out_channels=64,
            kernel_size=(1, 1), stride=1, padding=0
        )
        # 56x56x64 -> 56x56x192
        # kernel 3x3, padding 1, stride 1: floor((56 - 3 + 2*1)/1) + 1 = 56
        self.conv2_2 = nn.Conv2d(
            in_channels=64, out_channels=192,
            kernel_size=(3, 3), stride=1, padding=(1, 1)
        )

        # 56x56x192 -> 28x28x192
        # kernel 3x3, padding 1, stride 2: floor((56 - 3 + 2*1)/2) + 1 = 28
        self.maxPooling2 = nn.MaxPool2d(
            kernel_size=(3, 3), stride=2, padding=(1, 1)
        )

        # Each Inception block outputs ch_1x1 + ch_3x3 + ch_5x5 + pool_proj
        # channels, which is the in_channels of the block that follows it.
        # 28x28x192 -> 28x28x256
        self.inception_3a = Inception(
            in_channels=192,
            ch_1x1=64, ch_3x3_reduce=96, ch_3x3=128, ch_5x5_reduce=16, ch_5x5=32, pool_proj=32,
            conv_type=conv_type
        )
        # 28x28x256 -> 28x28x480
        self.inception_3b = Inception(
            in_channels=256,
            ch_1x1=128, ch_3x3_reduce=128, ch_3x3=192, ch_5x5_reduce=32, ch_5x5=96, pool_proj=64,
            conv_type=conv_type
        )

        # 28x28x480 -> 14x14x480
        # kernel 3x3, padding 1, stride 2: floor((28 - 3 + 2*1)/2) + 1 = 14
        self.maxPooling3 = nn.MaxPool2d(
            kernel_size=(3, 3), stride=2, padding=(1, 1)
        )

        # 14x14x480 -> 14x14x512
        self.inception_4a = Inception(
            in_channels=480,
            ch_1x1=192, ch_3x3_reduce=96, ch_3x3=208, ch_5x5_reduce=16, ch_5x5=48, pool_proj=64,
            conv_type=conv_type
        )
        # 14x14x512 -> 14x14x512
        self.inception_4b = Inception(
            in_channels=512,
            ch_1x1=160, ch_3x3_reduce=112, ch_3x3=224, ch_5x5_reduce=24, ch_5x5=64, pool_proj=64,
            conv_type=conv_type
        )
        # 14x14x512 -> 14x14x512
        self.inception_4c = Inception(
            in_channels=512,
            ch_1x1=128, ch_3x3_reduce=128, ch_3x3=256, ch_5x5_reduce=24, ch_5x5=64, pool_proj=64,
            conv_type=conv_type
        )
        # 14x14x512 -> 14x14x528
        self.inception_4d = Inception(
            in_channels=512,
            ch_1x1=112, ch_3x3_reduce=144, ch_3x3=288, ch_5x5_reduce=32, ch_5x5=64, pool_proj=64,
            conv_type=conv_type
        )
        # 14x14x528 -> 14x14x832
        self.inception_4e = Inception(
            in_channels=528,
            ch_1x1=256, ch_3x3_reduce=160, ch_3x3=320, ch_5x5_reduce=32, ch_5x5=128, pool_proj=128,
            conv_type=conv_type
        )

        # 14x14x832 -> 7x7x832
        # kernel 3x3, padding 1, stride 2: floor((14 - 3 + 2*1)/2) + 1 = 7
        self.maxPooling4 = nn.MaxPool2d(
            kernel_size=(3, 3), stride=2, padding=(1, 1)
        )

        # 7x7x832 -> 7x7x832
        self.inception_5a = Inception(
            in_channels=832,
            ch_1x1=256, ch_3x3_reduce=160, ch_3x3=320, ch_5x5_reduce=32, ch_5x5=128, pool_proj=128,
            conv_type=conv_type
        )
        # 7x7x832 -> 7x7x1024
        self.inception_5b = Inception(
            in_channels=832,
            ch_1x1=384, ch_3x3_reduce=192, ch_3x3=384, ch_5x5_reduce=48, ch_5x5=128, pool_proj=128,
            conv_type=conv_type
        )

        # 7x7x1024 -> 1x1x1024
        # kernel 7x7, padding 0, stride 1: floor((7 - 7 + 0)/1) + 1 = 1
        # The pool must cover the whole 7x7 map: the flattened result feeds a
        # linear layer with exactly 1024 input features. (A 3x3/stride-2 pool
        # would leave a 3x3x1024 map, i.e. 9216 features.)
        self.avgPooling1 = nn.AvgPool2d(
            kernel_size=(7, 7), stride=1, padding=0
        )

        self.dropout = nn.Dropout(p=0.4, inplace=True)
        self.fc = nn.Linear(in_features=1024, out_features=num_classes, bias=True)

        # Auxiliary classifiers: aux1 consumes the 512-channel output of
        # inception_4a, aux2 the 528-channel output of inception_4d.
        if enable_aux:
            self.aux1 = AuxInception(in_channels=512, num_classes=num_classes)
            self.aux2 = AuxInception(in_channels=528, num_classes=num_classes)
        else:
            self.aux1 = None
            self.aux2 = None

    def initialize(self, criterion=None, optimizer=None, scheduler=None, weight_init=None, learning_rate=1e-2) -> None:
        if criterion is None:
            self.criterion = nn.CrossEntropyLoss()
        else:
            self.criterion = criterion

        if optimizer is None:
            self.optimizer = SGD(self.parameters(), lr=learning_rate, momentum=0.9, weight_decay=5e-4)
        else:
            self.optimizer = optimizer

        if scheduler is None:
            self.scheduler = torch.optim.lr_scheduler.StepLR(
                optimizer=self.optimizer,
                step_size=4, gamma=0.04
            )
        else:
            self.scheduler = scheduler

        if weight_init is None:
            for m in self.modules():
                m.apply(self.init_weights)
        else:
            for m in self.modules():
                m.apply(weight_init)

    def _forward(self, X: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
        # input: 224x224x3
        X = self.conv1(X)
        # input: 112x112x64
        X = self.maxPooling1(X)
        # input 56x56x64
        X = self.conv2_1(X)
        X = self.conv2_2(X)
        # input: 56x56x192
        X = self.maxPooling2(X)

        # input: 28x28x192
        X = self.inception_3a(X)
        # input: 28x28x256
        X = self.inception_3b(X)
        # input: 28x28x480
        X = self.maxPooling3(X)

        # input: 14x14x480
        X = self.inception_4a(X)

        aux1 = self.aux1(X) if (self.aux1 is not None) else None

        # input: 14x14x512
        X = self.inception_4b(X)
        # input: 14x14x512
        X = self.inception_4c(X)
        # input: 14x14x512
        X = self.inception_4d(X)

        aux2 = self.aux2(X) if (self.aux1 is not None) else None

        # input: 14x14x528
        X = self.inception_4e(X)
        # input: 14x14x528
        X = self.maxPooling4(X)

        # input: 7x7x832
        X = self.inception_5a(X)
        # input: 7x7x832
        X = self.inception_5b(X)
        # input: 7x7x1024
        X = self.avgPooling1(X)
        X = torch.flatten(X, 1)

        # input: 1x1x1024
        X = self.dropout(X)
        # input: 1x1x1000
        X = self.fc(X)

        # output: 1 x 1 x num_classes
        return X, aux1, aux2

    def forward(self, X:torch.Tensor) -> Union[torch.Tensor, Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]]:
        X, aux1, aux2 = self._forward(X)
        if self.training and self.enable_aux:
            return X, aux2, aux1
        else:
            return X

    def train(self, mode=True, data=None, epochs=10) -> 'GoogLeNet':
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.to(device)

        if (data is None) and (mode):
            raise FileNotFoundError("\"data\" has to be a valid Dataloader object!")

        self.training = mode
        for m in self.modules():
            m.train(mode)

        if mode:
            running_loss = 0.0
            for epoch in range(0, epochs):
                for i, datum in enumerate(data, 0):
                    features, labels = datum[0].to(device), datum[1].to(device)
                    loss = self.criterion(self(features), labels)
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                    running_loss += loss.item()
                    batch_split = int(len(data.dataset) / data.batch_size / 5)
                    batch_split = 1 if batch_split < 1 else batch_split
                    if i % batch_split == batch_split - 1:
                        if self.verbose:
                            print(f"[epoch {epoch + 1}, batch {i + 1}] loss: {running_loss / batch_split}")
                        self.scheduler.step(epoch)
                        running_loss = 0.0

            if self.verbose:
                print('Finished Training')

        return self
Beispiel #13
0
        valid_acc = 0.0
        model.train()
        if epoch in [30, 80]:
            lr /= 3.0
            for pg in opt.param_groups:
                pg['lr'] = lr
        for X, y in dataset:
            X, y = V(X), V(y)
            outputs = model(X)
            epoch_acc += accuracy(outputs, y)
            y = y.max(1)[1].view(-1).long()
            error = loss(outputs, y)
            epoch_error += error.data[0]
            opt.zero_grad()
            error.backward()
            opt.step()
        print('Train loss', epoch_error / len(dataset))
        print('Train accuracy', epoch_acc / (bsz * len(dataset)))
        model.eval()
        for X, y in validation:
            X, y = V(X, volatile=True), V(y, volatile=True)
            outputs = model(X)
            valid_acc += accuracy(outputs, y)
            y = y.max(1)[1].view(-1).long()
            error = loss(outputs, y)
            valid_error += error.data[0]
        print('Valid loss', valid_error / len(validation))
        print('Valid accuracy', valid_acc / (bsz * len(validation)))
        print('')

    # Save model weights
Beispiel #14
0
class MaskTrainer(BaseTrainer):
    def __init__(self, config):
        """Initialize the base trainer with ``config``, then build the mask."""
        super().__init__(config)
        self.init_mask()

    def init_mask(self):
        """Create ``self.mask`` according to ``config.mask_type``.

        Supported types:
            * ``'custom'``: the mask is taken verbatim from ``config.mask``.
            * ``'icon'``: an image file is read, optionally resized,
              binarized and then made ternary.
            * ``'square'``, ``'randomly_filled_square'``, ``'square_grid'``:
              a generated mask that is then made ternary.

        Raises:
            NotImplementedError: for any other mask type.
        """
        kind = self.config.mask_type

        if kind == 'custom':
            # The only variant that skips the ternary conversion.
            self.mask = np.array(self.config.mask)
            return

        if kind == 'icon':
            hp = self.config.hp
            data_dir = os.path.join(self.config.firelab.paths.project_path, self.config.data_dir)
            raw_mask = imread(os.path.join(data_dir, hp.icon_file_path))
            if hp.get('should_resize_icon', False):
                raw_mask = resize(raw_mask, hp.target_icon_size, mode='constant', anti_aliasing=True)
            raw_mask = convert_img_to_binary(raw_mask)
        elif kind == 'square':
            raw_mask = generate_square_mask(self.config.hp.square_size)
        elif kind == 'randomly_filled_square':
            raw_mask = randomly_fill_square(
                generate_square_mask(self.config.hp.square_size),
                self.config.hp.fill_prob)
        elif kind == 'square_grid':
            raw_mask = generate_square_grid_mask(self.config.hp.n_good_cells)
        else:
            raise NotImplementedError('Mask type %s is not supported' % kind)

        self.mask = make_mask_ternary(raw_mask)

    def init_dataloaders(self):
        """Build the train/val loaders and two small visualization loaders.

        The dataset is selected by ``config.hp.dataset`` (default
        ``'FashionMNIST'``); ``'MNIST'`` and ``'CIFAR10'`` are also supported.
        The visualization loaders iterate over random subsets of
        ``config.n_points_for_vis`` (default 1000) samples of the train and
        test sets respectively.

        Raises:
            NotImplementedError: for an unknown dataset name.
        """
        dataset_name = self.config.hp.get('dataset', 'FashionMNIST')
        batch_size = self.config.hp.batch_size
        data_dir = os.path.join(self.config.firelab.paths.project_path, self.config.data_dir)

        if dataset_name == 'FashionMNIST':
            data_train = FashionMNIST(data_dir, train=True, transform=transforms.ToTensor())
            data_test = FashionMNIST(data_dir, train=False, transform=transforms.ToTensor())
        elif dataset_name == 'MNIST':
            data_train = MNIST(data_dir, train=True, transform=transforms.ToTensor())
            data_test = MNIST(data_dir, train=False, transform=transforms.ToTensor())
        elif dataset_name == 'CIFAR10':
            channel_mean = (0.4914, 0.4822, 0.4465)
            channel_std = (0.2023, 0.1994, 0.2010)

            # Training-time augmentation: pad + random crop, horizontal flip
            # and random erasing of one square patch.
            # NOTE(review): scale=(0.25, 0.25) is a fraction of the image
            # *area*, i.e. a 16x16 patch on a 32x32 image; an 8x8 cut-out
            # would need scale=1/16. Confirm which one is intended.
            augmentations = [
                transforms.Pad(padding=4),
                transforms.RandomCrop(size=(32, 32)),
                transforms.RandomHorizontalFlip(p=0.5),
                transforms.ToTensor(),
                transforms.RandomErasing(p=0.5, scale=(0.25, 0.25), ratio=(1., 1.)),
                transforms.Normalize(channel_mean, channel_std),
            ]
            plain = [
                transforms.ToTensor(),
                transforms.Normalize(channel_mean, channel_std),
            ]
            data_train = CIFAR10(data_dir, train=True, transform=transforms.Compose(augmentations))
            data_test = CIFAR10(data_dir, train=False, transform=transforms.Compose(plain))
        else:
            raise NotImplementedError(f"Unknown dataset: {dataset_name}")

        # Random subsets used only for visualization; the train subset is
        # sampled first, then the test subset.
        n_vis = self.config.get('n_points_for_vis', 1000)
        data_vis_train, data_vis_test = [
            Subset(full_set, random.sample(range(len(full_set)), n_vis))
            for full_set in (data_train, data_test)
        ]

        # Only the training loader shuffles.
        self.train_dataloader = DataLoader(data_train, batch_size=batch_size, shuffle=True)
        self.val_dataloader = DataLoader(data_test, batch_size=batch_size, shuffle=False)
        self.vis_train_dataloader = DataLoader(data_vis_train, batch_size=batch_size, shuffle=False)
        self.vis_test_dataloader = DataLoader(data_vis_test, batch_size=batch_size, shuffle=False)

    def init_models(self):
        """Create the ``MaskModel`` from the mask and move it to the device.

        The torch model builder is (re)initialized first because the mask
        model receives it as a constructor argument.
        """
        self.init_torch_model_builder()
        hp = self.config.hp
        mask_model = MaskModel(
            self.mask,
            self.torch_model_builder,
            should_center_origin=hp.should_center_origin,
            parametrization_type=hp.parametrization_type,
        )
        self.model = mask_model.to(self.device_name)

        # self.logger.info(f'Model initial orthogonality: {self.model.compute_ort_reg()}')
        # self.logger.info(f'Model params: {self.config.hp.conv_model_config.to_dict()}. Parametrization: {self.config.hp.parametrization_type}')

    def init_torch_model_builder(self):
        """Set ``self.torch_model_builder`` to a zero-argument model factory.

        The factory is chosen by ``config.hp.model_name`` (``'fast_resnet'``,
        ``'resnet18'``, ``'vgg'``, ``'simple'`` or ``'conv'``). Config values
        are read when the factory is called, not when it is created.

        Raises:
            NotImplementedError: for any other model name.
        """
        def n_input_channels():
            return self.config.hp.get('n_input_channels', 1)

        def build_fast_resnet():
            return FastResNet(n_classes=10, n_input_channels=n_input_channels()).nn

        def build_resnet18():
            return ResNet18(n_classes=10, n_input_channels=n_input_channels()).nn

        def build_vgg():
            return VGG11(
                n_input_channels=n_input_channels(),
                use_bn=self.config.hp.get('use_bn', True)).model

        def build_simple():
            return SimpleModel().nn

        def build_conv():
            return ConvModel(self.config.hp.conv_model_config).nn

        factories = {
            'fast_resnet': build_fast_resnet,
            'resnet18': build_resnet18,
            'vgg': build_vgg,
            'simple': build_simple,
            'conv': build_conv,
        }

        model_name = self.config.hp.model_name
        if model_name not in factories:
            raise NotImplementedError(f"Model {model_name} is not supported")
        self.torch_model_builder = factories[model_name]

    def init_criterions(self):
        """Use cross-entropy without reduction, i.e. one loss value per sample."""
        per_sample_cross_entropy = nn.CrossEntropyLoss(reduction='none')
        self.criterion = per_sample_cross_entropy

    def init_optimizers(self):
        """Create ``self.optim`` and ``self.scheduler`` from ``config.hp.optim``.

        ``optim.type`` (case-insensitive, default ``'adam'``) selects Adam or
        SGD; ``optim.kwargs`` is passed to its constructor. When a
        ``scheduler`` section exists, only ``scheduler.type == 'triangle_lr'``
        is supported; without that section the scheduler is ``None``.

        Raises:
            NotImplementedError: for an unknown optimizer or scheduler type.
        """
        optim_type = self.config.hp.get('optim.type', 'adam').lower()

        if optim_type == 'adam':
            optimizer_cls = Adam
        elif optim_type == 'sgd':
            optimizer_cls = SGD
        else:
            raise NotImplementedError(f'Unknown optimizer: {optim_type}')
        self.optim = optimizer_cls(self.model.parameters(), **self.config.hp.optim.kwargs.to_dict())

        optim_config = self.config.hp.optim
        if not optim_config.has('scheduler'):
            self.scheduler = None
            return

        scheduler_type = optim_config.get('scheduler.type')
        if scheduler_type != 'triangle_lr':
            raise NotImplementedError(f"Unknown scheduler.type: {scheduler_type}")

        # The triangle schedule is told how many batches make up one epoch.
        epoch_size = len(self.train_dataloader)
        self.scheduler = TriangleLR(self.optim, epoch_size, **optim_config.scheduler.kwargs.to_dict())

    def train_on_batch(self, batch):
        """Run one optimization step on a single batch.

        A random subset of "good" cells (class index 1) and "bad" cells
        (class index -1) is drawn from the model. For every drawn cell the
        weights at that cell are computed, the batch is evaluated with them and
        the resulting loss is back-propagated immediately, so gradients
        accumulate across cells. Good-cell losses are minimized; bad-cell
        losses are clipped, scaled and negated so that they are maximized.
        Unless the parametrization is ``"up_orthogonal"``, an
        orthogonality/norm regularizer is added. Gradients are then clipped,
        the optimizer (and the scheduler, if any) is stepped, and statistics
        are written to the summary writer.

        Args:
            batch: Indexable whose element 0 holds the inputs and element 1
                the targets; both are moved to ``self.device_name``.
        """
        self.optim.zero_grad()

        x = batch[0].to(self.device_name)
        y = batch[1].to(self.device_name)

        # Per-cell statistics, kept only for logging.
        good_losses = []
        good_accs = []
        bad_losses = []
        bad_accs = []

        # Cells of each class; each entry is unpacked below as an (i, j) pair.
        good_idx = self.model.get_class_idx(1).tolist()
        bad_idx = self.model.get_class_idx(-1).tolist()

        # Never ask random.sample for more cells than exist.
        num_good_points_to_use = min(len(good_idx), self.config.hp.num_good_cells_per_update)
        num_bad_points_to_use = min(len(bad_idx), self.config.hp.num_bad_cells_per_update)

        for i, j in random.sample(good_idx, num_good_points_to_use):
            preds = self.model.run_from_weights(self.model.compute_point(i,j), x)

            good_loss = self.criterion(preds, y).mean()
            good_losses.append(good_loss.item())
            # Average over the sampled cells so the accumulated gradient does
            # not grow with the number of cells used.
            good_loss /= num_good_points_to_use
            good_loss.backward() # Backward per cell so each graph can be freed right away

            good_accs.append((preds.argmax(dim=1) == y).float().mean().item())

        for i, j in random.sample(bad_idx, num_bad_points_to_use):
            preds = self.model.run_from_weights(self.model.compute_point(i,j), x)

            bad_loss = self.criterion(preds, y).mean()
            # The logged value is the raw loss, before clipping/scaling.
            bad_losses.append(bad_loss.item())
            # Clip from above so the (negated) loss cannot be pushed
            # up without bound.
            bad_loss = bad_loss.clamp(0, self.config.hp.neg_loss_clip_threshold)
            bad_loss /= num_bad_points_to_use
            bad_loss *= self.config.hp.get('negative_loss_coef', 1.)
            bad_loss *= -1 # Negate: minimizing this makes the bad-cell loss grow
            bad_loss.backward() # Backward per cell so each graph can be freed right away

            bad_accs.append((preds.argmax(dim=1) == y).float().mean().item())

        # NOTE(review): if no cell of a class was sampled, the corresponding
        # array is empty and its .mean() below is presumably NaN - confirm
        # that this is acceptable for logging.
        good_losses = np.array(good_losses)
        good_accs = np.array(good_accs)
        bad_losses = np.array(bad_losses)
        bad_accs = np.array(bad_accs)

        # Adding regularization
        if self.config.hp.parametrization_type != "up_orthogonal":
            ort = self.model.compute_ort_reg()
            norm_diff = self.model.compute_norm_reg()
            # Squared penalties, weighted by their respective coefficients.
            reg_loss = self.config.hp.ort_l2_coef * ort.pow(2) + self.config.hp.norm_l2_coef * norm_diff.pow(2)
            reg_loss.backward()

            self.writer.add_scalar('Reg/ort', ort.item(), self.num_iters_done)
            self.writer.add_scalar('Reg/norm_diff', norm_diff.item(), self.num_iters_done)

        # Clip the accumulated gradient, then take the optimization step.
        clip_grad_norm_(self.model.parameters(), self.config.hp.grad_clip_threshold)
        self.optim.step()

        if not self.scheduler is None:
            self.scheduler.step()

        # Mean loss/accuracy over the sampled cells, and good-minus-bad gaps.
        self.writer.add_scalar('good/train/loss', good_losses.mean().item(), self.num_iters_done)
        self.writer.add_scalar('good/train/acc', good_accs.mean().item(), self.num_iters_done)
        self.writer.add_scalar('bad/train/loss', bad_losses.mean().item(), self.num_iters_done)
        self.writer.add_scalar('bad/train/acc', bad_accs.mean().item(), self.num_iters_done)
        self.writer.add_scalar('diff/train/loss', good_losses.mean().item() - bad_losses.mean().item(), self.num_iters_done)
        self.writer.add_scalar('diff/train/acc', good_accs.mean().item() - bad_accs.mean().item(), self.num_iters_done)

        # Norms of the model's vectors and of the gradients of its parameters.
        self.writer.add_scalar('Stats/lengths/right', self.model.right.norm(), self.num_iters_done)
        self.writer.add_scalar('Stats/lengths/up', self.model.up.norm(), self.num_iters_done)
        self.writer.add_scalar('Stats/grad_norms/origin', self.model.origin.grad.norm().item(), self.num_iters_done)
        self.writer.add_scalar('Stats/grad_norms/right_param', self.model.right_param.grad.norm().item(), self.num_iters_done)
        self.writer.add_scalar('Stats/grad_norms/up_param', self.model.up_param.grad.norm().item(), self.num_iters_done)
        self.writer.add_scalar('Stats/grad_norms/scaling', self.model.scaling_param.grad.norm().item(), self.num_iters_done)
        self.writer.add_scalar('Stats/scaling', self.model.scaling_param.item(), self.num_iters_done)

    def before_training_hook(self):
        """Log the initial state: mask plot, mask file, histograms and config."""
        startup_steps = (
            self.plot_mask,
            self.save_mask,
            self.plot_all_weights_histograms,
            self.write_config,
        )
        for step in startup_steps:
            step()

    def after_training_hook(self):
        """Clean up after an explicit stop, otherwise visualize the result.

        When the run was stopped explicitly its logs are deleted; otherwise
        the minimum is visualized on the train and test visualization subsets.
        """
        if self.is_explicitly_stopped:
            self.delete_logs() # So tensorboard does not lag
            return

        self.visualize_minimum(self.vis_train_dataloader, 'train')
        self.visualize_minimum(self.vis_test_dataloader, 'test')

    def delete_logs(self):
        """Close the summary writer and remove the run's log directory.

        The writer is closed before the directory is removed so that its
        files are no longer held open while they are deleted (deleting open
        files fails on some platforms) and so that closing cannot write into
        an already-removed directory.
        """
        self.writer.close()
        shutil.rmtree(self.config.firelab.paths.logs_path)

    def compute_mask_scores(self, dataloader):
        """Score the model on a grid of points covering the (padded) mask.

        The grid spans ``[-pad, mask.shape[k] + pad]`` along each axis, with
        ``solution_vis.padding`` (default 0) as ``pad``. The number of grid
        points per axis defaults to ``mask.shape[k] + 2 * pad`` and can be
        overridden via ``solution_vis.granularity.x`` / ``.y``.

        Args:
            dataloader: Data on which every grid point is evaluated.

        Returns:
            ``(xs, ys, scores)`` where ``scores[a][b]`` is the result of
            ``compute_mask_score`` for ``(xs[a], ys[b])``.
        """
        pad = self.config.get('solution_vis.padding', 0)
        x_num_points = self.config.get('solution_vis.granularity.x', self.mask.shape[0] + 2 * pad)
        y_num_points = self.config.get('solution_vis.granularity.y', self.mask.shape[1] + 2 * pad)

        xs = np.linspace(-pad, self.mask.shape[0] + pad, x_num_points)
        ys = np.linspace(-pad, self.mask.shape[1] + pad, y_num_points)

        # A single scratch network is reused for every grid point.
        dummy_model = self.torch_model_builder().to(self.device_name)

        scores = []
        for x in xs:
            row = []
            for y in ys:
                row.append(self.compute_mask_score(x, y, dummy_model, dataloader))
            scores.append(row)

        return xs, ys, scores

    def compute_mask_score(self, x, y, dummy_model, dataloader):
        """Evaluate the weights at point ``(x, y)`` on ``dataloader``.

        The weights are computed with ``should_orthogonalize=True`` and
        validated through ``dummy_model``; the result of ``validate_weights``
        is returned unchanged.
        """
        return validate_weights(
            self.model.compute_point(x, y, should_orthogonalize=True),
            dataloader,
            dummy_model,
        )

    def visualize_minimum(self, dataloader:DataLoader, subtitle:str):
        """Score the mask grid on ``dataloader``, save it and log a figure.

        Args:
            dataloader: Data used to score every grid point.
            subtitle: Label (e.g. the split name) used in the saved file name,
                the plot titles and the figure tag.
        """
        grid_xs, grid_ys, grid_scores = self.compute_mask_scores(dataloader)
        self.save_minima_grid(grid_scores, subtitle)
        self.writer.add_figure(
            f'Minimum_{subtitle}',
            self.build_minimum_figure(grid_xs, grid_ys, grid_scores, subtitle),
            self.num_iters_done,
        )

    def build_minimum_figure(self, xs, ys, scores, subtitle:str):
        """Draw four filled-contour panels of the score grid.

        Panels 1 and 2 show channel 0 ("Loss") and channel 1 ("Accuracy") of
        ``scores`` with fixed, narrow level ranges; panels 3 and 4 show the
        same channels with 100 automatic levels and with the full ``[0, 1]``
        range respectively.

        Args:
            xs: Grid coordinates along the first axis of ``scores``.
            ys: Grid coordinates along the second axis of ``scores``.
            scores: Nested sequence convertible to an array indexed as
                ``[x, y, channel]``.
            subtitle: Text shown in brackets in every panel title.

        Returns:
            The created matplotlib figure.
        """
        grid_x, grid_y = np.meshgrid(xs, ys)
        scores = np.array(scores)

        fig = plt.figure(figsize=(20, 4))

        # (subplot position, score channel, title label, contour levels)
        panels = (
            (141, 0, 'Loss', np.linspace(0.3, 2.5, 30)),
            (142, 1, 'Accuracy', np.linspace(0.5, 0.9, 30)),
            (143, 0, 'Loss', 100),
            (144, 1, 'Accuracy', np.linspace(0, 1, 100)),
        )
        for position, channel, label, levels in panels:
            plt.subplot(position)
            # Transposed because meshgrid puts ys along the rows.
            filled = plt.contourf(grid_x, grid_y, scores[:,:,channel].T, cmap="RdBu_r", levels=levels)
            plt.title(f'{label} [{subtitle}]')
            plt.colorbar(filled)

        return fig

    def validate(self):
        """Evaluate good/bad mode, log the metrics and apply early stopping.

        The model is evaluated once with ``is_good_mode`` set and once with it
        cleared (it is set again afterwards). Training is stopped, once the
        configured warm-up epochs have passed, when the good accuracy is too
        low, the bad accuracy is too high, or the gap between them is too
        small.

        NOTE(review): both evaluations run on ``self.train_dataloader`` even
        though the results are logged under ``*/val/*`` - confirm that this is
        intended rather than ``self.val_dataloader``.
        """
        self.model.is_good_mode = True
        good_val_loss, good_val_acc = validate(self.model, self.train_dataloader, self.criterion)
        self.model.is_good_mode = False
        bad_val_loss, bad_val_acc = validate(self.model, self.train_dataloader, self.criterion)
        self.model.is_good_mode = True

        acc_gap = good_val_acc - bad_val_acc
        metrics = (
            ('good/val/loss', good_val_loss),
            ('good/val/acc', good_val_acc),
            ('bad/val/loss', bad_val_loss),
            ('bad/val/acc', bad_val_acc),
            ('diff/val/loss', good_val_loss - bad_val_loss),
            ('diff/val/acc', acc_gap),
        )
        for tag, value in metrics:
            self.writer.add_scalar(tag, value, self.num_epochs_done)

        self.plot_all_weights_histograms()

        # Absolute accuracy thresholds, checked only after their warm-up.
        if self.num_epochs_done > self.config.get('val_acc_stop_threshold_num_warmup_epochs', -1):
            if good_val_acc < self.config.get('good_val_acc_stop_threshold', 0.):
                self.stop(f'Good val accuracy is too low (epoch #{self.num_epochs_done}): {good_val_acc}')
            elif bad_val_acc > self.config.get('bad_val_acc_stop_threshold', 1.):
                self.stop(f'Bad val accuracy is too high (epoch #{self.num_epochs_done}): {bad_val_acc}')

        # Good-vs-bad gap threshold, with its own warm-up period.
        past_diff_warmup = self.num_epochs_done > self.config.get('diff_threshold_num_warmup_epochs', -1)
        if past_diff_warmup and acc_gap < self.config.get('good_and_bad_val_acc_diff_threshold', float('-inf')):
            self.stop(f'Difference between good and val accuracies is too small '\
                      f'(epoch #{self.num_epochs_done}): {good_val_acc} - {bad_val_acc} = {acc_gap}')

    def plot_mask(self):
        """Log the mask as a grayscale image under the tag ``'Mask'``.

        Works on a copy of the mask in which every cell equal to 2 is replaced
        by 0.5 before plotting.
        """
        figure = plt.figure(figsize=(5, 5))
        shaded = np.copy(self.mask)
        # NOTE(review): if the mask has an integer dtype, the 0.5 written here
        # is truncated to 0 - confirm the dtype of self.mask.
        cells_equal_two = shaded == 2
        shaded[cells_equal_two] = 0.5
        plt.imshow(shaded, cmap='gray')
        self.writer.add_figure('Mask', figure, self.num_iters_done)

    def save_mask(self):
        """Write the mask to ``mask.npy`` in the run's custom data directory."""
        custom_data_dir = self.config.firelab.paths.custom_data_path
        np.save(os.path.join(custom_data_dir, 'mask.npy'), self.mask)

    def plot_params_histograms(self, w, subtag:str):
        """Log one histogram per parameter tensor contained in ``w``.

        ``w`` is split into per-parameter pieces using the parameter sizes of
        a freshly built model; piece ``i`` is logged under the tag
        ``Weights_histogram_<i>/<subtag>``.

        Args:
            w: Weights to split and plot.
            subtag: Suffix that distinguishes this set of weights in the tags.
        """
        size_reference = self.torch_model_builder()
        params = weight_to_param(w, param_sizes(size_reference.parameters()))

        for index, param in enumerate(params):
            tag = 'Weights_histogram_{}/{}'.format(index, subtag)
            self.writer.add_histogram(tag, param, self.num_iters_done)

    def plot_all_weights_histograms(self):
        """Placeholder hook: histogram plotting is currently disabled (no-op)."""
        # TODO: we do not need histograms currently...
        # self.plot_params_histograms(self.model.origin + self.model.right, 'origin_right')
        # self.plot_params_histograms(self.model.origin + self.model.up, 'origin_up')
        # self.plot_params_histograms(self.model.origin + self.model.up + self.model.right, 'origin_up_right')
        return None

    def write_config(self):
        """Log the full config as YAML text under the tag ``'Config'``."""
        # Two trailing spaces before each newline force a markdown line
        # break, which is how tensorboard renders text.
        markdown_yaml = yaml.safe_dump(self.config.to_dict()).replace('\n', '  \n')
        self.writer.add_text('Config', markdown_yaml, self.num_iters_done)

    def save_minima_grid(self, scores, subtitle:str):
        """Save ``scores`` as ``minima_grid_<subtitle>.npy`` in the custom data directory."""
        file_name = f'minima_grid_{subtitle}.npy'
        np.save(os.path.join(self.config.firelab.paths.custom_data_path, file_name), scores)
class Policy(nn.Module):
    def __init__(self, env):
        """Build the convolutional policy/value network for ``env``.

        ``env`` must provide ``get_ub_board_size()``, ``n_actions``,
        ``n_inputs`` and ``agent_step_dim``. The learning rate, channel count
        and optimizer choice are read from the module-level ``args`` object.
        """
        # nn.Module has to be initialised before attributes are assigned to
        # the instance, so the parent constructor runs first.
        super(Policy, self).__init__()

        # game params
        self.board_x, self.board_y = env.get_ub_board_size()
        self.action_size = env.n_actions
        self.n_inputs = env.n_inputs
        self.lr = args.lr
        self.env = env
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # conv1/conv2 use padding=1 and keep the board size; conv3/conv4 are
        # unpadded 3x3 convolutions and each shrink both dimensions by 2.
        self.conv1 = nn.Conv2d(self.n_inputs,
                               args.num_channels,
                               3,
                               stride=1,
                               padding=1).to(self.device)
        self.conv2 = nn.Conv2d(args.num_channels,
                               args.num_channels,
                               3,
                               stride=1,
                               padding=1).to(self.device)
        self.conv3 = nn.Conv2d(args.num_channels,
                               args.num_channels,
                               3,
                               stride=1).to(self.device)
        self.conv4 = nn.Conv2d(args.num_channels,
                               args.num_channels,
                               3,
                               stride=1).to(self.device)

        self.bn1 = nn.BatchNorm2d(args.num_channels).to(self.device)
        self.bn2 = nn.BatchNorm2d(args.num_channels).to(self.device)
        self.bn3 = nn.BatchNorm2d(args.num_channels).to(self.device)
        self.bn4 = nn.BatchNorm2d(args.num_channels).to(self.device)

        # fc1 consumes the flattened conv features concatenated with the
        # per-agent vector of width env.agent_step_dim.
        self.fc1 = nn.Linear(args.num_channels*(self.board_x - 4)*(self.board_y - 4) \
                             + env.agent_step_dim, 1024).to(self.device)
        # NOTE(review): fc_bn1 / fc_bn2 are created here but forward() in this
        # class never applies them -- confirm whether that is intended.
        self.fc_bn1 = nn.BatchNorm1d(1024).to(self.device)

        self.fc2 = nn.Linear(1024, 512).to(self.device)
        self.fc_bn2 = nn.BatchNorm1d(512).to(self.device)

        # Policy head (one logit per action) and scalar value head.
        self.fc3 = nn.Linear(512, self.action_size).to(self.device)

        self.fc4 = nn.Linear(512, 1).to(self.device)

        # Bookkeeping: running loss meters and two-slot per-player buffers.
        self.entropies = 0
        self.pi_losses = AverageMeter()
        self.v_losses = AverageMeter()
        self.action_probs = [[], []]
        self.state_values = [[], []]
        self.rewards = [[], []]
        self.next_states = [[], []]

        # Any value other than 'adas' / 'adam' falls back to plain SGD.
        if args.optimizer == 'adas':
            self.optimizer = Adas(self.parameters(), lr=self.lr)
        elif args.optimizer == 'adam':
            self.optimizer = Adam(self.parameters(), lr=self.lr)
        else:
            self.optimizer = SGD(self.parameters(), lr=self.lr)

    def forward(self, s, agent):
        """Compute log action probabilities and a value estimate.

        :param s: board tensor, reshaped to
            (batch_size, n_inputs, board_x, board_y).
        :param agent: per-sample agent features, concatenated to the flattened
            convolution output along dim 1.
        :return: ``(log_softmax(pi), v)`` with shapes
            (batch_size, action_size) and (batch_size, 1).
        """
        x = s.view(-1, self.n_inputs, self.board_x, self.board_y)

        # conv1/conv2 keep the board size; conv3/conv4 shrink it by 2 each,
        # leaving num_channels x (board_x-4) x (board_y-4).
        conv_stack = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
        )
        for conv, bn in conv_stack:
            x = F.relu(bn(conv(x)))

        flat_features = args.num_channels * (self.board_x - 4) * (self.board_y - 4)
        x = x.view(-1, flat_features)
        x = torch.cat((x, agent), dim=1)

        # Two fully connected layers (1024 then 512 units), each followed by
        # dropout that is only active in training mode.
        for fc in (self.fc1, self.fc2):
            x = F.dropout(F.relu(fc(x)), p=args.dropout, training=self.training)

        logits = self.fc3(x)  # batch_size x action_size
        value = self.fc4(x)  # batch_size x 1

        return F.log_softmax(logits, dim=1), value  # torch.tanh(v)

    def step(self, obs, agent):
        """
        Returns policy and value estimates for given observations.
        :param obs: Array of shape [N] containing N observations.
        :return: Policy estimate [N, n_actions] and value estimate [N] for
        the given observations.
        """
        obs = torch.from_numpy(obs).to(self.device)
        agent = torch.from_numpy(agent).to(self.device)
        pi, v = self.forward(obs, agent)

        return torch.exp(pi).detach().to('cpu').numpy(), v.detach().to(
            'cpu').numpy()

    def store(self, player_ID, prob, state_value, reward):
        self.action_probs[player_ID].append(prob)
        self.state_values[player_ID].append(state_value)
        self.rewards[player_ID].append(reward)

    def clear(self):
        self.action_probs = [[], []]
        self.state_values = [[], []]
        self.rewards = [[], []]
        self.next_states = [[], []]
        self.entropies = 0

    def get_data(self):
        return self.action_probs, self.state_values, self.rewards

    def optimize(self):
        self.optimizer.step()

    def reset_grad(self):
        self.optimizer.zero_grad()

    def train_examples(self, examples):
        """
        Fit the network on randomly sampled mini-batches of ``examples``.

        examples: list of examples; each one unpacks into
        (board, agent_step, pi, v).

        Runs ``args.epochs`` epochs of ``len(examples) / args.batch_size``
        batches. Every batch is drawn with replacement. Both loss meters are
        plotted once after all epochs.
        """
        for _epoch in range(args.epochs):
            self.train()

            num_batches = int(len(examples) / args.batch_size)
            progress = tqdm(range(num_batches), desc='Training Net')
            for _ in progress:
                picked = np.random.randint(len(examples),
                                           size=args.batch_size)
                batch = [examples[idx] for idx in picked]
                boards, agent_steps, pis, vs = zip(*batch)

                # Let the environment turn raw samples into network inputs.
                board_array = self.env.get_states_for_step(boards)
                agent_array = self.env.get_agents_for_step(agent_steps)
                boards = torch.FloatTensor(
                    board_array.astype(np.float64)).to(self.device)
                agent_steps = torch.FloatTensor(
                    agent_array.astype(np.float64)).to(self.device)
                target_pis = torch.FloatTensor(np.array(pis))
                target_vs = torch.FloatTensor(np.array(vs).astype(np.float64))

                # The targets are created on the CPU; move them next to the
                # inputs when running on the GPU.
                if self.device == 'cuda':
                    boards = boards.contiguous().cuda()
                    target_pis = target_pis.contiguous().cuda()
                    target_vs = target_vs.contiguous().cuda()

                # Forward pass and combined policy + value loss.
                out_pi, out_v = self.forward(boards, agent_steps)
                l_pi = self.loss_pi(target_pis, out_pi)
                l_v = self.loss_v(target_vs, out_v)
                total_loss = l_pi + l_v

                # Track the running losses and show them on the progress bar.
                batch_size = boards.size(0)
                self.pi_losses.update(l_pi.item(), batch_size)
                self.v_losses.update(l_v.item(), batch_size)
                progress.set_postfix(Loss_pi=self.pi_losses,
                                     Loss_v=self.v_losses)

                # Backward pass and parameter update.
                self.reset_grad()
                total_loss.backward()
                self.optimize()
        self.pi_losses.plot('PolicyLoss')
        self.v_losses.plot('ValueLoss')

    def loss_pi(self, targets, outputs):
        """Return ``-sum(targets * outputs)`` divided by the size of dim 0 of ``targets``."""
        batch_size = targets.shape[0]
        weighted = targets * outputs
        return -weighted.sum() / batch_size

    def loss_v(self, targets, outputs):
        """Return the summed squared error between ``targets`` and flattened ``outputs``, divided by the size of dim 0 of ``targets``."""
        batch_size = targets.shape[0]
        residual = targets - outputs.view(-1)
        return (residual ** 2).sum() / batch_size

    def save_checkpoint(self, folder='Models', filename='model.pt'):
        filepath = os.path.join(folder, filename)
        if not os.path.exists(folder):
            print("Checkpoint Directory does not exist! Making directory {}".
                  format(folder))
            os.mkdir(folder)
        else:
            print("Checkpoint Directory exists! ")
        torch.save({
            'state_dict': self.state_dict(),
        }, filepath)

    def load_checkpoint(self, folder='Models', filename='model.pt'):
        # https://github.com/pytorch/examples/blob/master/imagenet/main.py#L98
        filepath = os.path.join(folder, filename)
        if not os.path.exists(filepath):
            raise ("No model in path {}".format(filepath))
        checkpoint = torch.load(filepath, map_location=self.device)
        self.load_state_dict(checkpoint['state_dict'])
        print('-- Load model succesfull!')

    def load_colab_model(self, _dir):
        self.load_state_dict(torch.load(_dir, map_location=self.device))

    def save_colab_model(self, _dir):
        torch.save(self.state_dict(), _dir)
Beispiel #16
0
    def train(self,
              epochs=10,
              lr=0.003,
              save_model=True,
              save_dir='./static/models',
              testing=True):
        """Train ``self.model`` with SGD and log progress to a summary writer.

        Args:
            epochs: number of passes over ``self.loaders[0]``.
            lr: base learning rate; bias parameters use twice this value.
            save_model: when true, write ``self.model.module.state_dict()`` to
                ``save_dir`` as ``model_<epoch>.pt`` at the end of each epoch.
            save_dir: directory for the saved weights (created if missing).
            testing: when true, call ``self.test()`` at the end of each epoch
                and log the returned loss, top-1 and top-5 values.
        """
        # Setup optimizers
        # To match the caffe implementation every layer gets two groups:
        # weights use the defaults, biases use a doubled learning rate and no
        # weight decay. Group order is weight, bias for each layer in turn.
        net = self.model.module
        layer_names = ('conv1', 'conv2', 'conv3a', 'conv3b', 'conv4a',
                       'conv4b', 'conv5a', 'conv5b', 'fc6', 'fc7', 'fc8')
        param_groups = []
        for layer_name in layer_names:
            layer = getattr(net, layer_name)
            param_groups.append({'params': layer.weight})
            param_groups.append({
                'params': layer.bias,
                'lr': 2 * lr,
                'weight_decay': 0.0,
            })
        optimizer = SGD(param_groups,
                        lr=lr,
                        momentum=0.9,
                        weight_decay=0.005)
        # summary
        log_dir = 'log/{}'.format(
            datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S'))
        os.makedirs(log_dir, exist_ok=True)
        summary = SummaryWriter(log_dir=log_dir,
                                comment='Training started at {}'.format(
                                    datetime.datetime.now()))
        # training
        train_steps_per_epoch = len(self.loaders[0])
        # The scheduler is stepped once per batch, so a step size of
        # 4 * steps-per-epoch decays the learning rate every 4 epochs.
        lr_sched = lr_scheduler.StepLR(optimizer,
                                       step_size=4 * train_steps_per_epoch,
                                       gamma=0.1)
        # The loss module is stateless; build it once instead of per batch.
        criterion = nn.CrossEntropyLoss()
        try:
            for i in range(epochs):
                # train
                self.model.train(True)
                categorical_loss = 0.0
                num_iter = 0
                optimizer.zero_grad()
                pbar = tqdm(self.loaders[0])
                for data in pbar:
                    num_iter += 1
                    # get the inputs and move them to the GPU
                    inputs, labels = data
                    inputs = inputs.cuda()
                    labels = labels.cuda()
                    # Compute outputs and loss
                    outputs = self.model(inputs)
                    loss = criterion(outputs, labels)
                    categorical_loss = loss.detach().item()
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    pbar.set_description(
                        'Epoch {}/{}, Iter {}, Loss: {:.8f}'.format(
                            i + 1, epochs, num_iter, categorical_loss))
                    summary.add_scalars('Train loss',
                                        {'Loss': categorical_loss},
                                        global_step=i * train_steps_per_epoch +
                                        num_iter)

                    # End-of-epoch work: save weights, then validate.
                    if num_iter == train_steps_per_epoch:
                        if save_model:
                            os.makedirs(save_dir, exist_ok=True)
                            torch.save(
                                self.model.module.state_dict(),
                                '{}/model_{:06d}.pt'.format(save_dir, i + 1))
                        if testing:
                            val_loss, top1, top5 = self.test()
                            summary.add_scalars(
                                'Validation performance', {
                                    'Validation loss': val_loss,
                                    'Top-1 accuracy': top1,
                                    'Top-5 accuracy': top5,
                                }, i)
                            print(
                                'Epoch {}/{}: Top-1 accuracy {:.2f} %, Top-5 accuracy: {:.2f} %'
                                .format(i + 1, epochs, top1.item(),
                                        top5.item()))
                        break
        finally:
            # Flush pending events and release the event file even if
            # training stops with an exception.
            summary.close()
class Trainer(object):
    """
    Trainer encapsulates all the logic necessary for
    training the Recurrent Attention Model.

    All hyperparameters are provided by the user in the
    config file.
    """
    def __init__(self, data_loader):
        """
        Construct a new Trainer instance with hard-coded hyperparameters.

        Args
        ----
        - data_loader: indexable pair; element 0 is used as the training
          loader and element 1 as the validation loader. The `indices` of
          each loader's sampler give the size of that split.
        """
        # NOTE: the config-driven settings (ckpt_dir, logs_dir, patience,
        # use_tensorboard, resume, print_freq, plot_freq, plot_dir) are
        # currently disabled, so those attributes are not set here.

        # glimpse network params
        self.patch_size = 16
        self.glimpse_scale = 2
        self.num_patches = 3
        self.loc_hidden = 128
        self.glimpse_hidden = 128

        # core network params
        self.num_glimpses = 6
        self.hidden_size = 256

        # reinforce params
        self.std = 0.17
        self.M = 10

        # data params
        self.train_loader = data_loader[0]
        self.valid_loader = data_loader[1]
        self.num_train = len(self.train_loader.sampler.indices)
        self.num_valid = len(self.valid_loader.sampler.indices)
        self.num_classes = 27
        self.num_channels = 3

        # training params
        self.epochs = 25
        self.start_epoch = 0
        self.saturate_epoch = 150
        self.init_lr = 0.001
        self.min_lr = 1e-06
        # Per-epoch learning-rate increment for linear annealing (negative,
        # since min_lr < init_lr).
        self.decay_rate = (self.min_lr - self.init_lr) / (self.saturate_epoch)
        self.momentum = 0.5
        self.lr = self.init_lr

        # misc params
        self.use_gpu = False
        self.best = True
        self.best_valid_acc = 0.
        self.counter = 0

        # build RAM model
        self.model = RecurrentAttention(
            self.patch_size, self.num_patches, self.glimpse_scale,
            self.num_channels, self.loc_hidden, self.glimpse_hidden,
            self.std, self.hidden_size, self.num_classes,
        )
        if self.use_gpu:
            self.model.cuda()

        num_params = sum(p.data.nelement() for p in self.model.parameters())
        print('[*] Number of model parameters: {:,}'.format(num_params))

        # initialize optimizer and scheduler
        self.optimizer = SGD(
            self.model.parameters(), lr=self.lr, momentum=self.momentum,
        )
        self.scheduler = ReduceLROnPlateau(self.optimizer, 'min')

    def reset(self):
        """
        Initialize the hidden state of the core network
        and the location vector.

        This is called once every time a new minibatch
        `x` is introduced.
        """
        h_t = torch.zeros(self.batch_size, self.hidden_size)
        h_t = Variable(h_t)

        l_t = torch.Tensor(self.batch_size, 2).uniform_(-1, 1)
        l_t = Variable(l_t)

        return h_t, l_t

    def train(self):
        """
        Train the model on the training set.

        A checkpoint of the model is saved after each epoch
        and if the validation accuracy is improved upon,
        a separate ckpt is created for use on the test set.
        """
        # load the most recent checkpoint
        # if self.resume:
        #     self.load_checkpoint(best=False)

        print("\n[*] Train on {} samples, validate on {} samples".format(
            self.num_train, self.num_valid)
        )

        for epoch in range(self.epochs):

            print(
                '\nEpoch: {}/{} - LR: {:.6f}'.format(
                    epoch+1, self.epochs, self.lr)
            )

            # train for 1 epoch
            train_loss, train_acc = self.train_one_epoch(epoch)

            # evaluate on validation set
            valid_loss, valid_acc = self.validate(epoch)

            # self.scheduler.step(valid_loss)
            #
            # # # decay learning rate
            # # if epoch < self.saturate_epoch:
            # #     self.anneal_learning_rate(epoch)
            #
            is_best = valid_acc > self.best_valid_acc

            msg1 = "train loss: {:.3f} - train acc: {:.3f} "
            msg2 = "- val loss: {:.3f} - val acc: {:.3f}"
            if is_best:
                msg2 += " [*]"
            msg = msg1 + msg2
            print(msg.format(train_loss, train_acc, valid_loss, valid_acc))

            # # check for improvement
            # if not is_best:
            #     self.counter += 1
            # if self.counter > self.patience:
            #     print("[!] No improvement in a while, stopping training.")
            #     return
            # self.best_valid_acc = max(valid_acc, self.best_valid_acc)
            # self.save_checkpoint(
            #     {'epoch': epoch + 1, 'state_dict': self.model.state_dict(),
            #      'best_valid_acc': self.best_valid_acc,
            #      'lr': self.lr}, is_best
            # )

    def train_one_epoch(self, epoch):
        """
        Train the model for 1 epoch of the training set.

        An epoch corresponds to one full pass through the entire
        training set in successive mini-batches.

        This is used by train() and should not be called manually.

        Args
        ----
        - epoch: index of the current epoch (not used in the body).

        Returns
        -------
        - (average loss, average accuracy in percent) over the epoch, each
          weighted by batch size.
        """
        losses = AverageMeter()
        accs = AverageMeter()

        for x, y in self.train_loader:
            if self.use_gpu:
                x, y = x.cuda(), y.cuda()

            # initialize location vector and hidden state
            self.batch_size = x.shape[0]
            h_t, l_t = self.reset()

            # extract the glimpses
            log_pi = []
            baselines = []
            for _ in range(self.num_glimpses - 1):

                # forward pass through model
                h_t, l_t, b_t, p = self.model(x, l_t, h_t)

                # store
                baselines.append(b_t)
                log_pi.append(p)

            # last iteration: the model additionally returns class log-probs
            h_t, l_t, b_t, log_probas, p = self.model(
                x, l_t, h_t, last=True
            )
            log_pi.append(p)
            baselines.append(b_t)

            # convert lists to tensors, batch dimension first
            baselines = torch.stack(baselines).transpose(1, 0)
            log_pi = torch.stack(log_pi).transpose(1, 0)

            # reward: 1 for a correct prediction, 0 otherwise, repeated for
            # every glimpse
            predicted = torch.max(log_probas, 1)[1]
            R = (predicted.detach() == y).float()
            R = R.unsqueeze(1).repeat(1, self.num_glimpses)

            # compute losses for differentiable modules
            loss_action = F.nll_loss(log_probas, y)
            loss_baseline = F.mse_loss(baselines, R)

            # compute reinforce loss; the baseline is detached so it is only
            # trained through loss_baseline
            adjusted_reward = R - baselines.detach()
            loss_reinforce = torch.mean(-log_pi*adjusted_reward)

            # sum up into a hybrid loss
            loss = loss_action + loss_baseline + loss_reinforce

            # compute accuracy
            correct = (predicted == y).float()
            acc = 100 * (correct.sum() / len(y))

            # store
            losses.update(loss.item(), x.size()[0])
            accs.update(acc.item(), x.size()[0])

            # compute gradients and update SGD
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        return losses.avg, accs.avg

    def validate(self, epoch):
        """
        Evaluate the model on the validation set.

        Each batch is repeated `self.M` times, and the class log-probabilities,
        baselines and location log-probabilities are averaged over the
        repetitions before the losses are computed.

        Args
        ----
        - epoch: index of the current epoch (not used in the body).

        Returns
        -------
        - (average loss, average accuracy in percent) over the validation set.
        """
        losses = AverageMeter()
        accs = AverageMeter()

        # Pure evaluation: no backward pass follows, so skip building the
        # autograd graph.
        with torch.no_grad():
            for x, y in self.valid_loader:
                if self.use_gpu:
                    x, y = x.cuda(), y.cuda()

                # duplicate M times
                x = x.repeat(self.M, 1, 1, 1)

                # initialize location vector and hidden state
                self.batch_size = x.shape[0]
                h_t, l_t = self.reset()

                # extract the glimpses
                log_pi = []
                baselines = []
                for _ in range(self.num_glimpses - 1):

                    # forward pass through model
                    h_t, l_t, b_t, p = self.model(x, l_t, h_t)

                    # store
                    baselines.append(b_t)
                    log_pi.append(p)

                # last iteration
                h_t, l_t, b_t, log_probas, p = self.model(
                    x, l_t, h_t, last=True
                )
                log_pi.append(p)
                baselines.append(b_t)

                # convert lists to tensors, batch dimension first
                baselines = torch.stack(baselines).transpose(1, 0)
                log_pi = torch.stack(log_pi).transpose(1, 0)

                # average over the M repetitions
                log_probas = log_probas.view(
                    self.M, -1, log_probas.shape[-1]
                )
                log_probas = torch.mean(log_probas, dim=0)

                baselines = baselines.contiguous().view(
                    self.M, -1, baselines.shape[-1]
                )
                baselines = torch.mean(baselines, dim=0)

                log_pi = log_pi.contiguous().view(
                    self.M, -1, log_pi.shape[-1]
                )
                log_pi = torch.mean(log_pi, dim=0)

                # calculate reward
                predicted = torch.max(log_probas, 1)[1]
                R = (predicted.detach() == y).float()
                R = R.unsqueeze(1).repeat(1, self.num_glimpses)

                # compute losses for differentiable modules
                loss_action = F.nll_loss(log_probas, y)
                loss_baseline = F.mse_loss(baselines, R)

                # compute reinforce loss
                adjusted_reward = R - baselines.detach()
                loss_reinforce = torch.mean(-log_pi*adjusted_reward)

                # sum up into a hybrid loss
                loss = loss_action + loss_baseline + loss_reinforce

                # compute accuracy
                correct = (predicted == y).float()
                acc = 100 * (correct.sum() / len(y))

                # store
                losses.update(loss.item(), x.size()[0])
                accs.update(acc.item(), x.size()[0])

        return losses.avg, accs.avg

    def test(self, loader):
        """
        Test the model on the held-out test data.
        This function should only be called at the very
        end once the model has finished training.
        """
        correct = 0
        self.test_loader = loader
        # load the best checkpoint
        # self.load_checkpoint(best=self.best)
        self.num_test = len(self.test_loader.dataset)

        for i, (x, y) in enumerate(self.test_loader):
            # if self.use_gpu:
            #     x, y = x.cuda(), y.cuda()
            x, y = Variable(x), Variable(y)

            # duplicate 10 times
            x = x.repeat(self.M, 1, 1, 1)

            # initialize location vector and hidden state
            self.batch_size = x.shape[0]
            h_t, l_t = self.reset()

            # extract the glimpses
            for t in range(self.num_glimpses - 1):

                # forward pass through model
                h_t, l_t, b_t, p = self.model(x, l_t, h_t)

            # last iteration
            h_t, l_t, b_t, log_probas, p = self.model(
                x, l_t, h_t, last=True
            )

            log_probas = log_probas.view(
                self.M, -1, log_probas.shape[-1]
            )
            log_probas = torch.mean(log_probas, dim=0)

            pred = log_probas.data.max(1, keepdim=True)[1]
            correct += pred.eq(y.data.view_as(pred)).cpu().sum()

        perc = (100. * correct) / (self.num_test)
        print(
            '[*] Test Acc: {}/{} ({:.2f}%)'.format(
                correct, self.num_test, perc)
        )

    def anneal_learning_rate(self, epoch):
        """
        This function linearly decays the learning rate
        to a predefined minimum over a set amount of epochs.
        """
        self.lr += self.decay_rate

        # log to tensorboard
        if self.use_tensorboard:
            log_value('learning_rate', self.lr, epoch)

        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.lr

    def save_checkpoint(self, state, is_best):
        """
        Save a copy of the model so that it can be loaded at a future
        date. This function is used when the model is being evaluated
        on the test data.

        If this model has reached the best validation accuracy thus
        far, a seperate file with the suffix `best` is created.
        """
        # print("[*] Saving model to {}".format(self.ckpt_dir))

        filename = self.model_name + '_ckpt.pth.tar'
        ckpt_path = os.path.join(self.ckpt_dir, filename)
        torch.save(state, ckpt_path)

        if is_best:
            filename = self.model_name + '_model_best.pth.tar'
            shutil.copyfile(
                ckpt_path, os.path.join(self.ckpt_dir, filename)
            )

    def load_checkpoint(self, best=False):
        """
        Load the best copy of a model. This is useful for 2 cases:

        - Resuming training with the most recent model checkpoint.
        - Loading the best validation model to evaluate on the test data.

        Params
        ------
        - best: if set to True, loads the best model. Use this if you want
          to evaluate your model on the test data. Else, set to False in
          which case the most recent version of the checkpoint is used.
        """
        print("[*] Loading model from {}".format(self.ckpt_dir))

        filename = self.model_name + '_ckpt.pth.tar'
        if best:
            filename = self.model_name + '_model_best.pth.tar'
        ckpt_path = os.path.join(self.ckpt_dir, filename)
        ckpt = torch.load(ckpt_path)

        # load variables from checkpoint
        self.start_epoch = ckpt['epoch']
        self.best_valid_acc = ckpt['best_valid_acc']
        self.lr = ckpt['lr']
        self.model.load_state_dict(ckpt['state_dict'])

        if best:
            print(
                "[*] Loaded {} checkpoint @ epoch {} "
                "with best valid acc of {:.3f}".format(
                    filename, ckpt['epoch']+1, ckpt['best_valid_acc'])
            )
        else:
            print(
                "[*] Loaded {} checkpoint @ epoch {}".format(
                    filename, ckpt['epoch']+1)
            )
Beispiel #18
0
def train(config):
    """Train ``NetModel`` with SGD, snapshotting the weights after every epoch.

    Steps performed:

    1. Build train/test ``DataLoader`` objects from ``config.dataset``.
    2. Optionally load pre-trained weights from ``config.net.pre_train``;
       keys that do not already start with ``base_net`` get a ``base_net.``
       prefix and loading is non-strict.
    3. Resume from the last snapshot stored in
       ``<config.output_dir>/dcl_snap.db`` if that store is not empty.
    4. For every epoch: train over the loader, step the LR scheduler, store
       the serialized ``state_dict`` in the snapshot store and run ``test``.

    Args:
        config: project configuration object; this function reads
            ``dataset``, ``dataset_parameter``, ``batch_size``, ``net``,
            ``lr``, ``output_dir`` and ``max_it`` from it.
    """
    loader = DataLoader(TrainEvalDataset(
        config.dataset(split='train', **config.dataset_parameter), config),
                        config.batch_size,
                        True,
                        num_workers=num_processor)
    test_loader = DataLoader(TrainEvalDataset(
        config.dataset(split='test', **config.dataset_parameter), config),
                             config.batch_size,
                             False,
                             num_workers=num_processor)
    net = NetModel(config.net)
    loss_calculator = LossCalculator(config.net.loss)
    logger.info(config.net.pre_train)
    logger.info(type(config.net.pre_train))
    if config.net.pre_train is not None and os.path.exists(
            config.net.pre_train):
        # Non-strict load: the two returned values are the key lists reported
        # by load_state_dict and are only logged.
        unused, unused1 = net.load_state_dict(
            {(('base_net.' + k) if not k.startswith('base_net') else k): v
             for k, v in torch.load(config.net.pre_train).items()},
            strict=False)
        logger.info(unused)
        logger.info(unused1)
    net = net.to(device)
    optimizer = SGD(net.parameters(), config.lr, 0.9, weight_decay=0.0005)
    exp_lr_scheduler = lr_scheduler.ExponentialLR(optimizer, 0.30)

    # Per-epoch snapshots of the state_dict, keyed by epoch.
    storage_dict = SqliteDict(f'{config.output_dir}/dcl_snap.db')
    start_epoach = 0
    if len(storage_dict) > 0:
        # NOTE(review): assumes the last key returned is the most recent
        # epoch - confirm against SqliteDict's key ordering.
        kk = list(storage_dict.keys())
        net.load_state_dict(torch.load(BytesIO(storage_dict[kk[-1]])))
        start_epoach = int(kk[-1]) + 1
        logger.info(f'loading from epoach{start_epoach}')
    global_step = 0
    for epoach in (range(start_epoach, config.max_it)):
        net.train()
        for batch_cnt, batch in tqdm(enumerate(loader), total=len(loader)):
            image, label = batch
            # Move the inputs to the target device. The input may be a
            # single tensor, a dict of tensors or a list of tensors.
            if isinstance(image, torch.Tensor):
                image = image.to(device)
            elif isinstance(image, dict):
                for k, v in image.items():
                    if isinstance(v, torch.Tensor):
                        image[k] = image[k].to(device)
            elif isinstance(image, list):
                # Tensor.to() returns a new tensor instead of moving the
                # tensor in place, so the results must be kept.
                image = [
                    v.to(device) if isinstance(v, torch.Tensor) else v
                    for v in image
                ]
            for k, v in label.items():
                if isinstance(v, torch.Tensor):
                    label[k] = label[k].to(device)
            optimizer.zero_grad()
            net_out = net(image)
            loss_sum, loss_map = loss_calculator(net_out, label)
            loss_sum.backward()
            optimizer.step()
            global_step += 1
            wtire_summary(loss_map, 'train', global_step)
        # The epoch index is passed explicitly, as in the original code.
        # NOTE(review): presumably this keeps the schedule aligned when
        # resuming from a snapshot - confirm before dropping the argument.
        exp_lr_scheduler.step(epoach)
        logger.debug(f'saving epoach {epoach}')
        # Serialize the weights to bytes so they can be stored in the
        # snapshot database.
        buffer = BytesIO()
        torch.save(net.state_dict(), buffer)
        buffer.seek(0)
        storage_dict[epoach] = buffer.read()
        storage_dict.commit()
        test(config, net, test_loader, epoach, loss_calculator)
        next_pred, _ = n_copy(input, (h.detach(), c.detach()))

    # Bootstrapped regression target: the first element of `input` plus
    # `gamma` times the next-step prediction. Both terms are detached, so no
    # gradient flows through the target.
    # NOTE(review): looks like a TD(0)-style target - confirm with the part of
    # the loop that is not visible here.
    target = input[0, 0, 0].detach() + gamma * next_pred.detach().item()

    # print(gt_target, target)
    # Squared error against the bootstrapped target (this one is trained on)
    # and against `gt_target` (only used for the running statistic below).
    real_error = (target - value_prediction)**2
    gt_error = (gt_target - value_prediction)**2
    # Accumulate the training error until the next optimizer update.
    if sum_of_error is None:
        sum_of_error = real_error
    else:
        sum_of_error = sum_of_error + real_error
    # Exponential moving average of the error against gt_target.
    running_error = running_error * 0.9999 + gt_error.detach().item() * 0.0001
    # Every `args["truncation"]` steps: backpropagate the accumulated error,
    # update the parameters, then detach the recurrent state (h, c) so later
    # backward passes do not reach past this point.
    if (i % args["truncation"] == 0):
        opti.zero_grad()
        sum_of_error.backward()
        opti.step()
        h = h.detach()
        c = c.detach()
        sum_of_error = None

    # Periodically record the running error for this `rank`.
    if (i % 50000 == 20000):
        error_list.append([str(rank), str(i), str(running_error)])
    # Periodically write the buffered rows to the experiment tables and
    # clear the buffers.
    if (i % 100000 == 4):
        my_experiment.insert_values("predictions", predictions_table_keys,
                                    predictions_list)
        predictions_list = []
        my_experiment.insert_values("error_table", error_table_keys,
                                    error_list)
        error_list = []

    if (i % 100000 == 0):
Beispiel #20
0
    # Train LeNet on MNIST with plain SGD, log the mean training loss of each
    # epoch to TensorBoard, then evaluate once on the test set.
    batch_size = 256
    # ToTensor followed by normalisation with the commonly used MNIST
    # mean/std constants.
    trans_mnist = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
    train_dataset = MNIST('./data/mnist/', train=True, download=True, transform=trans_mnist)
    test_dataset = MNIST('./data/mnist/', train=False, download=True, transform=trans_mnist)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    model = LeNet().to(args.device)
    sgd = SGD(model.parameters(), lr=1e-1)
    cross_error = CrossEntropyLoss()
    epoch = 100

    writer = SummaryWriter('./runs/t_centerlize')
    for _epoch in range(epoch):
        epoch_loss = []
        for train_x, train_label in train_loader:
            train_x, train_label = train_x.to(args.device), train_label.to(args.device)
            sgd.zero_grad()
            predict_y = model(train_x.float())
            _error = cross_error(predict_y, train_label.long())
            _error.backward()
            sgd.step()
            # Store a plain Python float. Keeping the loss tensor itself
            # would hold on to its autograd history (and device memory) for
            # every batch of the epoch.
            epoch_loss.append(_error.item())
        avg_epoch = sum(epoch_loss) / len(epoch_loss)
        writer.add_scalar("train_loss", avg_epoch, _epoch)
        print('Round {:3d}, Average loss {:.3f}'.format(_epoch, avg_epoch))

    acc_test, loss_test = test_img(model, test_dataset, args)
    print("Testing accuracy: {:.2f}".format(acc_test))
Beispiel #21
0
class LunaTrainingApp():
    """Command-line application that trains and evaluates a ``LunaModel``.

    For every epoch, per-sample labels, predictions and losses are collected
    into a ``(3, num_samples, 1)`` tensor. Its first axis is indexed by the
    module-level constants ``METRICS_LABEL_NDX``, ``METRICS_PRED_NDX`` and
    ``METRICS_LOSS_NDX``. ``logMetrics`` summarises those tensors.
    """

    def __init__(self, sys_argv=None):
        """Parse the command line.

        Args:
            sys_argv: list of argument strings; defaults to ``sys.argv[1:]``.
        """
        if sys_argv is None:
            sys_argv = sys.argv[1:]

        parser = argparse.ArgumentParser()
        parser.add_argument('--batch-size',
                            help="Batch size to use for training",
                            default=32,
                            type=int)
        parser.add_argument(
            '--num-workers',
            help="Number of worker processes for background data loading",
            default=8,
            type=int)
        parser.add_argument('--epochs',
                            help="Number of epochs to train for",
                            default=1,
                            type=int)

        self.cli_args = parser.parse_args(sys_argv)
        self.time_str = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    def main(self):
        """Build model, optimizer and data loaders, then run all epochs.

        Each epoch runs one training pass followed by one evaluation pass
        (under ``torch.no_grad()``) and logs the metrics of both.
        """
        log.info("Starting {}, {}".format(type(self).__name__, self.cli_args))

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")

        self.model = LunaModel()
        if self.use_cuda:
            if torch.cuda.device_count() > 1:
                self.model = nn.DataParallel(self.model)

            self.model = self.model.to(self.device)
        self.optimizer = SGD(self.model.parameters(), lr=0.01, momentum=0.9)

        # The per-process batch size is scaled by the number of visible GPUs.
        train_dl = DataLoader(
            LunaDataset(test_stride=10, isTestSet_bool=False),
            batch_size=self.cli_args.batch_size *
            (torch.cuda.device_count() if self.use_cuda else 1),
            num_workers=self.cli_args.num_workers,
            pin_memory=self.use_cuda)

        test_dl = DataLoader(
            LunaDataset(test_stride=10, isTestSet_bool=True),
            batch_size=self.cli_args.batch_size *
            (torch.cuda.device_count() if self.use_cuda else 1),
            num_workers=self.cli_args.num_workers,
            pin_memory=self.use_cuda)

        for epoch_ndx in range(1, self.cli_args.epochs + 1):
            log.info("Epoch {} of {}, {}/{} batches of size {}*{}".format(
                epoch_ndx, self.cli_args.epochs, len(train_dl), len(test_dl),
                self.cli_args.batch_size,
                (torch.cuda.device_count() if self.use_cuda else 1)))

            # Training loop, very similar to the testing loop below.
            self.model.train()
            trainingMetrics_tensor = torch.zeros(3, len(train_dl.dataset), 1)
            # NOTE(review): start_ndx is set to the worker count; presumably
            # the time estimate skips that many warm-up batches - confirm
            # against enumerateWithEstimate.
            batch_iter = enumerateWithEstimate(train_dl,
                                               "E{} Traning".format(epoch_ndx),
                                               start_ndx=train_dl.num_workers)

            for batch_ndx, batch_tup in batch_iter:
                self.optimizer.zero_grad()
                loss_var = self.computeBatchLoss(batch_ndx, batch_tup,
                                                 train_dl.batch_size,
                                                 trainingMetrics_tensor)
                loss_var.backward()
                self.optimizer.step()
                del loss_var

            # Testing loop, very similar to above, but simplified: no
            # gradients and no optimizer step.
            with torch.no_grad():
                self.model.eval()
                testingMetrics_tensor = torch.zeros(3, len(test_dl.dataset), 1)
                batch_iter = enumerateWithEstimate(
                    test_dl,
                    "E{} Testing".format(epoch_ndx),
                    start_ndx=test_dl.num_workers)
                for batch_ndx, batch_tup in batch_iter:
                    self.computeBatchLoss(batch_ndx, batch_tup,
                                          test_dl.batch_size,
                                          testingMetrics_tensor)

                self.logMetrics(epoch_ndx, trainingMetrics_tensor,
                                testingMetrics_tensor)

    def computeBatchLoss(self, batch_ndx, batch_tup, batch_size,
                         metrics_tensor):
        """Run the model on one batch and record per-sample metrics.

        Args:
            batch_ndx: zero-based index of the batch within the epoch.
            batch_tup: 4-item batch; only the first two items (input tensor
                and label tensor) are used here.
            batch_size: nominal loader batch size, used to compute where
                this batch's rows start in ``metrics_tensor``.
            metrics_tensor: per-epoch metrics tensor, updated in place.

        Returns:
            The mean of the per-element MSE loss, attached to the autograd
            graph so the caller can call ``backward()`` on it.
        """
        input_tensor, label_tensor, _series_list, _center_list = batch_tup
        input_devtensor = input_tensor.to(self.device)
        label_devtensor = label_tensor.to(self.device)

        prediction_devtensor = self.model(input_devtensor)
        # reduction='none' keeps one loss value per element so it can be
        # stored per sample below.
        loss_devsensor = nn.MSELoss(reduction='none')(prediction_devtensor,
                                                      label_devtensor)

        # The last batch may be shorter than batch_size, so the end index is
        # derived from the actual number of labels.
        start_ndx = batch_ndx * batch_size
        end_ndx = start_ndx + label_tensor.size(0)

        # Store detached copies: writing graph-attached tensors into the
        # metrics tensor would make it part of the autograd graph and keep
        # every batch's graph alive for the whole epoch.
        metrics_tensor[METRICS_LABEL_NDX, start_ndx:end_ndx] = label_tensor
        metrics_tensor[METRICS_PRED_NDX, start_ndx:end_ndx] = \
            prediction_devtensor.detach().to('cpu')
        metrics_tensor[METRICS_LOSS_NDX, start_ndx:end_ndx] = \
            loss_devsensor.detach()

        return loss_devsensor.mean()

    def logMetrics(self,
                   epoch_ndx,
                   trainingMetrics_tensor,
                   testingMetrics_tensor,
                   classificationThreshold_float=0.5):
        """Log loss and accuracy for the training and testing passes.

        Samples whose label (or prediction) is less than or equal to
        ``classificationThreshold_float`` are counted in the "ben" group and
        all others in the "mal" group.
        NOTE(review): presumably benign / malignant - confirm.

        Args:
            epoch_ndx: epoch number used in the log lines.
            trainingMetrics_tensor: metrics collected during training.
            testingMetrics_tensor: metrics collected during testing.
            classificationThreshold_float: decision threshold applied to
                both labels and predictions.
        """
        log.info("E{} {}".format(epoch_ndx, type(self).__name__))

        for mode_str, metrics_tensor in [('trn', trainingMetrics_tensor),
                                         ('tst', testingMetrics_tensor)]:
            # Drop the trailing singleton axis -> array of shape (3, N).
            metrics_ary = metrics_tensor.detach().numpy()[:, :, 0]
            assert np.isfinite(metrics_ary).all()

            benLabel_mask = metrics_ary[METRICS_LABEL_NDX] <= \
                classificationThreshold_float
            benPred_mask = metrics_ary[METRICS_PRED_NDX] <= \
                classificationThreshold_float

            malLabel_mask = ~benLabel_mask
            malPred_mask = ~benPred_mask

            benLabel_count = benLabel_mask.sum()
            malLabel_count = malLabel_mask.sum()

            benCorrect_count = (benLabel_mask & benPred_mask).sum()
            malCorrect_count = (malLabel_mask & malPred_mask).sum()

            metrics_dict = {}

            metrics_dict['loss/all'] = metrics_ary[METRICS_LOSS_NDX].mean()
            metrics_dict['loss/ben'] = metrics_ary[METRICS_LOSS_NDX,
                                                   benLabel_mask].mean()
            metrics_dict['loss/mal'] = metrics_ary[METRICS_LOSS_NDX,
                                                   malLabel_mask].mean()

            # Accuracies are percentages of the respective group size.
            metrics_dict['correct/all'] = (malCorrect_count + benCorrect_count) \
                / metrics_ary.shape[1] * 100
            metrics_dict[
                'correct/ben'] = benCorrect_count / benLabel_count * 100
            metrics_dict[
                'correct/mal'] = malCorrect_count / malLabel_count * 100

            log.info(("E{} {:8} {loss/all:.4f} loss, {correct/all:-5.1f}% "
                      "correct").format(epoch_ndx, mode_str, **metrics_dict))
            log.info(("E{} {:8} {loss/ben:.4f} loss, {correct/ben:-5.1f}% "
                      "correct").format(epoch_ndx, mode_str + '_ben',
                                        **metrics_dict))
            # The label uses the same '_' separator as the '_ben' line above.
            log.info(("E{} {:8} {loss/mal:.4f} loss, {correct/mal:-5.1f}% "
                      "correct").format(epoch_ndx, mode_str + '_mal',
                                        **metrics_dict))
Beispiel #22
0
def train_class(directory,
                version,
                model,
                train_loader,
                valid_loader,
                resize,
                batch_size,
                exp_name='experiment',
                lr=0.01,
                epochs=10,
                momentum=0.99,
                logdir='logs',
                dizionario=None):
    """Train a classifier with SGD and cross-entropy, validating every epoch.

    After each epoch the accuracy and loss curves are plotted to PNG files
    under ``directory``, the model is checkpointed (``net_save`` plus two
    ``torch.save`` calls) and the metric histories are written with
    ``saveArray`` / ``saveinFileJson``.

    Args:
        directory: output directory for plots and per-epoch model files.
        version: run identifier used in file and sub-directory names.
        model: network to train; moved to CUDA when available.
        train_loader: loader yielding dicts with ``'image'`` and ``'label'``.
        valid_loader: loader with the same batch format, used for validation.
        resize: forwarded to ``saveinFileJson`` only.
        batch_size: forwarded to ``saveinFileJson`` only.
        exp_name: experiment name for the TensorBoard run and model files.
        lr: SGD learning rate.
        epochs: exclusive upper bound of the epoch index.
        momentum: SGD momentum.
        logdir: root directory of the TensorBoard logs.
        dizionario: optional state of a previous run (keys ``a_train``,
            ``a_valid``, ``l_train``, ``l_valid``, ``g_train``, ``g_valid``,
            ``epoche_fatte``) used to resume; its lists are extended in place.

    Returns:
        Tuple ``(model, elapsed, last_loss_train, last_loss_val,
        last_acc_train, last_acc_val)`` where ``elapsed`` is
        ``tempo.stop()`` formatted with seven decimals.
    """

    def _save_curve(steps_train, values_train, steps_valid, values_valid,
                    y_label, file_stem):
        # Plot the training/validation history of one metric and save it.
        figure = plt.figure(figsize=(12, 8))
        plt.plot(steps_train, values_train)
        plt.plot(steps_valid, values_valid)
        plt.xlabel('samples')
        plt.ylabel(y_label)
        plt.grid()
        plt.legend(['Training', 'Valid'])
        plt.savefig(directory + '//' + file_stem + version + '.png')
        plt.clf()
        plt.close(figure)

    print("Taining classifacation")
    criterion = nn.CrossEntropyLoss()
    optimizer = SGD(model.parameters(), lr, momentum=momentum)
    # meters
    loss_meter = AverageValueMeter()
    acc_meter = AverageValueMeter()
    # writer
    writer = SummaryWriter(join(logdir, exp_name))
    # device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # dictionary holding the training and validation loaders
    loader = {'train': train_loader, 'valid': valid_loader}

    # Per-epoch histories: accuracy, loss and the global step they belong to.
    array_accuracy_train, array_accuracy_valid = [], []
    array_loss_train, array_loss_valid = [], []
    array_glb_train, array_glb_valid = [], []

    last_loss_train = last_loss_val = 0
    last_acc_train = last_acc_val = 0
    # initialise the global step
    global_step = 0
    tempo = Timer()
    start = timer()
    start_epoca = 0

    if dizionario is not None:
        # Resume: continue the histories and counters of a previous run.
        print("Inizializza")
        array_accuracy_train = dizionario["a_train"]
        array_accuracy_valid = dizionario["a_valid"]
        array_loss_train = dizionario["l_train"]
        array_loss_valid = dizionario["l_valid"]
        array_glb_train = dizionario["g_train"]
        array_glb_valid = dizionario["g_valid"]
        global_step = dizionario["g_valid"][-1]
        start_epoca = dizionario["epoche_fatte"] + 1  # index of first epoch

    print("global step", global_step)
    print("a_acc_train", array_accuracy_train)
    print("a_acc_valid", array_accuracy_valid)
    print("loss_train", array_loss_train)
    print("loss_valid", array_loss_valid)
    print("glb_train", array_glb_train)
    print("glb_valid", array_glb_valid)
    print("epoca_start_indice ", start_epoca)
    start = timer()

    print("Num epoche", epochs)

    for e in range(start_epoca, epochs):
        print("Epoca= ", e)
        # iterate over the two modes: train and valid
        for mode in ['train', 'valid']:
            is_training = mode == 'train'
            loss_meter.reset()
            acc_meter.reset()
            if is_training:
                model.train()
            else:
                model.eval()
            # gradients are enabled only while training
            with torch.set_grad_enabled(is_training):
                for batch in loader[mode]:
                    print(batch['label'])

                    # move the batch to the right device
                    x = batch['image'].to(device)
                    y = batch['label'].to(device)
                    output = model(x)

                    # update the global step with the number of samples in
                    # this batch (counted in both modes)
                    n = x.shape[0]
                    print("numero elementi nel batch ", n)
                    global_step += n

                    batch_loss = criterion(output, y)

                    if is_training:
                        batch_loss.backward()
                        optimizer.step()
                        optimizer.zero_grad()

                    predicted = output.to('cpu').max(1)[1]
                    print("Etichette predette", predicted)
                    acc = accuracy_score(y.to('cpu'), predicted)

                    loss_meter.add(batch_loss.item(), n)
                    acc_meter.add(acc, n)
                    # per-iteration logging only while training
                    if is_training:
                        writer.add_scalar('loss/train',
                                          loss_meter.value(),
                                          global_step=global_step)
                        writer.add_scalar('accuracy/train',
                                          acc_meter.value(),
                                          global_step=global_step)
                        print("Accuracy Train=", acc_meter.value())

            # end of the pass (training or validation): record final values
            epoch_loss = loss_meter.value()
            epoch_acc = acc_meter.value()
            if is_training:
                global_step_train = global_step
                last_loss_train = epoch_loss
                last_acc_train = epoch_acc
                print("Accuracy Train=", epoch_acc)
                array_accuracy_train.append(epoch_acc)
                array_loss_train.append(epoch_loss)
                array_glb_train.append(global_step)
            else:
                global_step_val = global_step
                last_loss_val = epoch_loss
                last_acc_val = epoch_acc
                print("Accuracy Valid=", epoch_acc)
                array_accuracy_valid.append(epoch_acc)
                array_loss_valid.append(epoch_loss)
                array_glb_valid.append(global_step)

            writer.add_scalar('loss/' + mode,
                              epoch_loss,
                              global_step=global_step)
            writer.add_scalar('accuracy/' + mode,
                              epoch_acc,
                              global_step=global_step)

        print("Loss TRAIN", array_loss_train)
        print("Losss VALID", array_loss_valid)
        print("Accuracy TRAIN", array_accuracy_train)
        print("Accuracy VALID", array_accuracy_valid)
        print("dim acc train", len(array_accuracy_train))
        print("dim acc valid", len(array_accuracy_valid))

        _save_curve(array_glb_train, array_accuracy_train,
                    array_glb_valid, array_accuracy_valid,
                    'accuracy', 'plotAccuracy_')
        _save_curve(array_glb_train, array_loss_train,
                    array_glb_valid, array_loss_valid,
                    'loss', 'plotLoss_')

        # store the model state at the end of a training + validation cycle
        net_save(epochs,
                 model,
                 optimizer,
                 last_loss_train,
                 last_loss_val,
                 last_acc_train,
                 last_acc_val,
                 global_step_train,
                 global_step_val,
                 '%s.pth' % (exp_name + "_dict"),
                 dict_stato_no=True)

        # store the whole model: one file per epoch plus a "latest" file
        epoch_file = '%s.pth' % (exp_name + "_" + str(e))
        torch.save(model, directory + "//" + version + "//" + epoch_file)
        torch.save(model, '%s.pth' % (exp_name))

        saveArray(directory, version, array_loss_train, array_loss_valid,
                  array_accuracy_train, array_accuracy_valid, array_glb_train,
                  array_glb_valid)

        saveinFileJson(start, directory, version, resize, batch_size, e, lr,
                       momentum, len(train_loader), array_accuracy_train[-1],
                       array_accuracy_valid[-1], array_loss_train[-1],
                       array_loss_valid[-1])

    elapsed = '{:.7f}'.format(tempo.stop())
    return (model, elapsed, last_loss_train, last_loss_val, last_acc_train,
            last_acc_val)
Beispiel #23
0
                                                   fitness_shaping)

        # Log the mean and standard deviation of the raw fitness for step i.
        train_writer.add_scalar('fitness', raw_fitness.mean(), i)
        train_writer.add_scalar('fitness/std', raw_fitness.std(), i)
        # One gradient histogram per population parameter.
        for p_idx, p in enumerate(population.parameters()):
            train_writer.add_histogram('grads/%d' % p_idx, p.grad, i)
        # Entropy of the categorical distribution defined by each set of
        # mixing logits.
        for k, p in population.mixing_logits.items():
            train_writer.add_histogram(
                "entropy/%s" % k,
                t.distributions.Categorical(logits=p).entropy(), i)

        # Pairwise Euclidean distances between the component means: the
        # broadcasted difference is squared, summed over the last axis and
        # square-rooted.
        means = population.component_means  # (480, 5)
        dist = ((means.unsqueeze(0) - means.unsqueeze(1))**2).sum(
            dim=2).sqrt()  # (1, 480, 5,) - (480, 1, 5) = (480, 480, 5)
        train_writer.add_histogram("dist", dist, i)

        optim.step()
        sched.step()
        # Shrink the population's std by 0.1% on every iteration.
        population.std *= 0.999
        mean_fit = raw_fitness.mean().item()
        pbar.set_description("avg fit: %.3f, std: %.3f" %
                             (mean_fit, raw_fitness.std().item()))

        all_params = population.parameters()

        # Always save the latest parameters; additionally save and upload
        # them whenever the mean fitness improves on the best seen so far.
        t.save(all_params, 'last.t')
        if mean_fit > best_so_far:
            best_so_far = mean_fit
            t.save(all_params, 'best.t')
            util.upload_results('best.t')
def train(train_source_iter: ForeverDataIterator,
          train_target_iter: ForeverDataIterator, model: ImageClassifier,
          domain_adv: ConditionalDomainAdversarialLoss, optimizer: SGD,
          lr_sheduler: StepwiseLR, epoch: int, args: argparse.Namespace):
    """Run one epoch of adversarial domain-adaptation training.

    Each iteration draws one labelled source batch and one target batch,
    runs both through ``model`` in a single forward pass and minimises
    ``cross_entropy(source) + args.trade_off * domain_adv(...)``.

    Args:
        train_source_iter: iterator yielding ``(inputs, labels)`` from the
            source domain.
        train_target_iter: iterator yielding ``(inputs, _)`` from the target
            domain; the second item is ignored.
        model: classifier returning ``(predictions, features)``.
        domain_adv: adversarial loss module; its
            ``domain_discriminator_accuracy`` attribute is read after each
            call.
        optimizer: optimizer stepped once per iteration.
        lr_sheduler: learning-rate scheduler, stepped at the start of every
            iteration.
        epoch: epoch number, used only in the progress prefix.
        args: namespace providing ``iters_per_epoch``, ``trade_off`` and
            ``print_freq``.
    """
    batch_time = AverageMeter('Time', ':3.1f')
    data_time = AverageMeter('Data', ':3.1f')
    losses = AverageMeter('Loss', ':3.2f')
    trans_losses = AverageMeter('Trans Loss', ':3.2f')
    cls_accs = AverageMeter('Cls Acc', ':3.1f')
    domain_accs = AverageMeter('Domain Acc', ':3.1f')
    progress = ProgressMeter(
        args.iters_per_epoch,
        [batch_time, data_time, losses, trans_losses, cls_accs, domain_accs],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()
    domain_adv.train()

    end = time.time()
    for i in range(args.iters_per_epoch):
        lr_sheduler.step()

        x_s, labels_s = next(train_source_iter)
        x_t, _ = next(train_target_iter)

        # measure data loading time: taken after both batches have been
        # fetched, otherwise the meter would not include the loading itself
        data_time.update(time.time() - end)

        x_s = x_s.to(device)
        x_t = x_t.to(device)
        labels_s = labels_s.to(device)

        # compute output: source and target go through the model together
        # and are split back into halves afterwards
        x = torch.cat((x_s, x_t), dim=0)
        y, f = model(x)
        y_s, y_t = y.chunk(2, dim=0)
        f_s, f_t = f.chunk(2, dim=0)

        cls_loss = F.cross_entropy(y_s, labels_s)
        transfer_loss = domain_adv(y_s, f_s, y_t, f_t)
        # must be read after the domain_adv call above
        domain_acc = domain_adv.domain_discriminator_accuracy
        loss = cls_loss + transfer_loss * args.trade_off

        cls_acc = accuracy(y_s, labels_s)[0]

        # all meters are weighted by the source batch size
        losses.update(loss.item(), x_s.size(0))
        cls_accs.update(cls_acc.item(), x_s.size(0))
        domain_accs.update(domain_acc.item(), x_s.size(0))
        trans_losses.update(transfer_loss.item(), x_s.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
Beispiel #25
0
    # Candidate base schedulers. Each assignment replaces the previous one,
    # so the last one (ReduceLROnPlateau) is the scheduler that gets wrapped.
    scheduler = StepLR(optim, step_size=10, gamma=0.1)
    scheduler = MultiStepLR(optim, milestones=[3, 6, 9], gamma=0.1)
    scheduler = ReduceLROnPlateau(optim,
                                  threshold=0.99,
                                  mode='min',
                                  patience=2,
                                  cooldown=5)
    # Wrap the chosen scheduler with a warm-up phase.
    scheduler = WarmupLR(scheduler,
                         init_lr=0.01,
                         num_warmup=3,
                         warmup_strategy='cos')

    # this zero gradient update is needed to avoid a warning message, issue #8.
    optim.zero_grad()
    optim.step()

    # The wrapper keeps the wrapped scheduler's step() API, so it can be
    # dropped into an existing loop unchanged.
    for epoch in range(1, 20):
        needs_metric = isinstance(scheduler._scheduler, ReduceLROnPlateau)
        if not needs_metric:
            # ordinary schedulers step without arguments
            scheduler.step()
            print(epoch, optim.param_groups[0]['lr'])
        else:
            # ReduceLROnPlateau needs a metric: feed it a decreasing
            # pseudo loss
            pseudo_loss = 20 - epoch
            scheduler.step(pseudo_loss)
            current_lr = optim.param_groups[0]['lr']
            print('Epoch: {} LR: {:.3f} pseudo loss: {:.2f}'.format(
                epoch, current_lr, pseudo_loss))
        optim.step()  # optimizer update (stands in for a real training step)
Beispiel #26
0
def main():
    """Train a BiRNN-CNN-CRF NER tagger and report dev/test scores.

    The function parses the command line, builds the word/character/NER
    alphabets, loads the train/dev/test sets as tensors, constructs the
    network selected by ``--dropout`` and trains it with Nesterov SGD.

    After every epoch the dev set is decoded and scored; whenever the dev F1
    improves, the test set is decoded and scored as well. Every
    ``--schedule`` epochs the learning rate is decayed as
    ``learning_rate / (1 + epoch * decay_rate)``. When training finishes, the
    three summary lines (best dev, test at best dev, overall best test) are
    appended to ``--result_file_path``.
    """
    # Arguments parser
    parser = argparse.ArgumentParser(
        description='Tuning with DNN Model for NER')
    # Model Hyperparameters
    parser.add_argument('--mode',
                        choices=['RNN', 'LSTM', 'GRU'],
                        help='architecture of rnn',
                        default='LSTM')
    parser.add_argument('--encoder_mode',
                        choices=['cnn', 'lstm'],
                        help='Encoder type for sentence encoding',
                        default='lstm')
    parser.add_argument('--char_method',
                        choices=['cnn', 'lstm'],
                        help='Method to create character-level embeddings',
                        required=True)
    parser.add_argument(
        '--hidden_size',
        type=int,
        default=128,
        help='Number of hidden units in RNN for sentence level')
    parser.add_argument('--char_hidden_size',
                        type=int,
                        default=30,
                        help='Output character-level embeddings size')
    parser.add_argument('--char_dim',
                        type=int,
                        default=30,
                        help='Dimension of Character embeddings')
    parser.add_argument('--tag_space',
                        type=int,
                        default=0,
                        help='Dimension of tag space')
    parser.add_argument('--num_layers',
                        type=int,
                        default=1,
                        help='Number of layers of RNN')
    # 'var' is accepted so that the BiVarRecurrentConvCRF branch below is
    # actually reachable from the command line.
    parser.add_argument('--dropout',
                        choices=['std', 'var', 'weight_drop'],
                        help='Dropout method',
                        default='weight_drop')
    parser.add_argument('--p_em',
                        type=float,
                        default=0.33,
                        help='dropout rate for input embeddings')
    parser.add_argument('--p_in',
                        type=float,
                        default=0.33,
                        help='dropout rate for input of RNN model')
    parser.add_argument('--p_rnn',
                        nargs=2,
                        type=float,
                        required=True,
                        help='dropout rate for RNN')
    parser.add_argument('--p_out',
                        type=float,
                        default=0.33,
                        help='dropout rate for output layer')
    parser.add_argument('--bigram',
                        action='store_true',
                        help='bi-gram parameter for CRF')

    # Data loading and storing params
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument('--dataset_name',
                        type=str,
                        default='alexa',
                        help='Which dataset to use')
    parser.add_argument('--train',
                        type=str,
                        required=True,
                        help='Path of train set')
    parser.add_argument('--dev',
                        type=str,
                        required=True,
                        help='Path of dev set')
    parser.add_argument('--test',
                        type=str,
                        required=True,
                        help='Path of test set')
    parser.add_argument('--results_folder',
                        type=str,
                        default='results',
                        help='The folder to store results')
    parser.add_argument('--tmp_folder',
                        type=str,
                        default='tmp',
                        help='The folder to store tmp files')
    parser.add_argument('--alphabets_folder',
                        type=str,
                        default='data/alphabets',
                        help='The folder to store alphabets files')
    parser.add_argument('--result_file_name',
                        type=str,
                        default='hyperparameters_tuning',
                        help='File name to store some results')
    parser.add_argument('--result_file_path',
                        type=str,
                        default='results/hyperparameters_tuning',
                        help='File name to store some results')

    # Training parameters
    parser.add_argument('--cuda',
                        action='store_true',
                        help='whether using GPU')
    parser.add_argument('--num_epochs',
                        type=int,
                        default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=16,
                        help='Number of sentences in each batch')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.001,
                        help='Base learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.95,
                        help='Decay rate of learning rate')
    parser.add_argument('--schedule',
                        type=int,
                        default=3,
                        help='schedule for learning rate decay')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.0,
                        help='weight for l2 regularization')
    parser.add_argument('--max_norm',
                        type=float,
                        default=1.,
                        help='Max norm for gradients')
    parser.add_argument('--gpu_id',
                        type=int,
                        nargs='+',
                        required=True,
                        help='which gpu to use for training')

    # Misc
    parser.add_argument('--embedding',
                        choices=['glove', 'senna', 'alexa'],
                        help='Embedding for words',
                        required=True)
    parser.add_argument('--restore',
                        action='store_true',
                        help='whether restore from stored parameters')
    parser.add_argument('--save_checkpoint',
                        type=str,
                        default='',
                        help='the path to save the model')
    parser.add_argument('--o_tag',
                        type=str,
                        default='O',
                        help='The default tag for outside tag')
    parser.add_argument('--unk_replace',
                        type=float,
                        default=0.,
                        help='The rate to replace a singleton word with UNK')
    parser.add_argument('--evaluate_raw_format',
                        action='store_true',
                        help='The tagging format for evaluation')

    args = parser.parse_args()

    logger = get_logger("NERCRF")

    # rename the parameters
    mode = args.mode
    encoder_mode = args.encoder_mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    char_hidden_size = args.char_hidden_size
    char_method = args.char_method
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    gamma = args.gamma
    max_norm = args.max_norm
    schedule = args.schedule
    p_em = args.p_em
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_out = args.p_out
    unk_replace = args.unk_replace
    bigram = args.bigram
    embedding = args.embedding
    embedding_path = args.embedding_dict
    dataset_name = args.dataset_name
    evaluate_raw_format = args.evaluate_raw_format
    o_tag = args.o_tag
    gpu_id = args.gpu_id
    results_folder = args.results_folder
    tmp_folder = args.tmp_folder
    alphabets_folder = args.alphabets_folder
    use_elmo = False
    p_em_vec = 0.
    result_file_path = args.result_file_path

    # Temporary files are tagged with the GPU ids given on the command line.
    gpu_tag = '-'.join(map(str, gpu_id))
    score_file = "%s/score_gpu_%s" % (tmp_folder, gpu_tag)

    for folder in (results_folder, tmp_folder, alphabets_folder):
        if not os.path.exists(folder):
            os.makedirs(folder)

    embedd_dict, embedd_dim = utils.load_embedding_dict(
        embedding, embedding_path)

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, ner_alphabet = conll03_data.create_alphabets(
        "{}/{}/".format(alphabets_folder, dataset_name),
        train_path,
        data_paths=[dev_path, test_path],
        embedd_dict=embedd_dict,
        max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    logger.info("Reading Data")
    device = torch.device('cuda') if args.cuda else torch.device('cpu')
    print(device)

    def load_split(path):
        # Read one data file into tensors placed on the selected device.
        return conll03_data.read_data_to_tensor(path,
                                                word_alphabet,
                                                char_alphabet,
                                                ner_alphabet,
                                                device=device)

    data_train = load_split(train_path)
    num_data = sum(data_train[1])
    num_labels = ner_alphabet.size()

    data_dev = load_split(dev_path)
    data_test = load_split(test_path)

    writer = CoNLL03Writer(word_alphabet, char_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        # Build the (vocabulary x embedd_dim) table: pretrained vectors where
        # available (exact match first, then lower-cased), otherwise a
        # uniform random vector in [-scale, scale].
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conll03_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                vector = embedd_dict[word]
            elif word.lower() in embedd_dict:
                vector = embedd_dict[word.lower()]
            else:
                vector = np.random.uniform(
                    -scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = vector
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    char_dim = args.char_dim
    window = 3
    num_layers = args.num_layers
    tag_space = args.tag_space
    initializer = nn.init.xavier_uniform_

    # Arguments shared by all three network variants.
    common_args = (embedd_dim,
                   word_alphabet.size(),
                   char_dim,
                   char_alphabet.size(),
                   char_hidden_size,
                   window,
                   mode,
                   encoder_mode,
                   hidden_size,
                   num_layers,
                   num_labels)
    common_kwargs = dict(tag_space=tag_space,
                         embedd_word=word_table,
                         p_em=p_em,
                         p_in=p_in,
                         p_out=p_out,
                         p_rnn=p_rnn,
                         bigram=bigram,
                         initializer=initializer)
    if args.dropout == 'std':
        network = BiRecurrentConvCRF(*common_args,
                                     use_elmo=use_elmo,
                                     p_em_vec=p_em_vec,
                                     **common_kwargs)
    elif args.dropout == 'var':
        network = BiVarRecurrentConvCRF(*common_args,
                                        use_elmo=use_elmo,
                                        p_em_vec=p_em_vec,
                                        **common_kwargs)
    else:
        # The weight-drop variant is built without the ELMo-related keywords.
        network = BiWeightDropRecurrentConvCRF(*common_args, **common_kwargs)

    network = network.to(device)

    def make_optimizer(rate):
        # Nesterov SGD with the configured momentum and L2 weight decay.
        return SGD(network.parameters(),
                   lr=rate,
                   momentum=momentum,
                   weight_decay=gamma,
                   nesterov=True)

    lr = learning_rate
    optim = make_optimizer(lr)
    logger.info("Network: %s, encoder_mode=%s, num_layer=%d, hidden=%d, char_hidden_size=%d, char_method=%s, tag_space=%d, crf=%s" % \
        (mode, encoder_mode, num_layers, hidden_size, char_hidden_size, char_method, tag_space, 'bigram' if bigram else 'unigram'))
    logger.info(
        "training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)"
        % (gamma, num_data, batch_size, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" %
                (p_in, p_out, p_rnn))

    def run_evaluation(data, tmp_filename):
        # Decode every batch of `data`, dump the predictions to `tmp_filename`
        # and return (acc, precision, recall, f1) from the scorer.
        writer.start(tmp_filename)
        for batch in conll03_data.iterate_batch_tensor(data, batch_size):
            aux, word, char, labels, masks, lengths = batch
            preds, _ = network.decode(
                aux,
                word,
                char,
                target=labels,
                mask=masks,
                leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
            writer.write(word.cpu().numpy(),
                         preds.cpu().numpy(),
                         labels.cpu().numpy(),
                         lengths.cpu().numpy())
        writer.close()
        return evaluate(tmp_filename, score_file, evaluate_raw_format, o_tag)

    def erase_progress(width):
        # Wipe the previous in-place progress message from the terminal.
        sys.stdout.write("\b" * width)
        sys.stdout.write(" " * width)
        sys.stdout.write("\b" * width)

    num_batches = num_data // batch_size + 1
    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    best_test_f1 = 0.0
    best_test_acc = 0.0
    best_test_precision = 0.0
    best_test_recall = 0.0
    best_test_epoch = 0

    def summary_lines():
        # The three result lines, reflecting the scores at call time.
        return [
            "best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
            % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch),
            "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
            % (test_acc, test_precision, test_recall, test_f1, best_epoch),
            "overall best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
            % (best_test_acc, best_test_precision, best_test_recall,
               best_test_f1, best_test_epoch),
        ]

    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): '
            % (epoch, mode, args.dropout, lr, decay_rate, schedule))

        train_err = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            # `aux` is the first element of the batch tuple; it is forwarded
            # to the network unchanged.
            aux, word, char, labels, masks, _ = conll03_data.get_batch_tensor(
                data_train, batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss = network.loss(aux, word, char, labels, mask=masks)
            loss.backward()
            # Gradients only exist between backward() and step(), so the
            # clipping has to happen here to have any effect.
            nn.utils.clip_grad_norm_(network.parameters(), max_norm)
            optim.step()

            # Accumulate a plain float so no tensors are kept alive.
            num_inst = word.size(0)
            train_err += loss.item() * num_inst
            train_total += num_inst

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            if batch % 20 == 0:
                erase_progress(num_back)
                log_info = 'train: %d/%d loss: %.4f, time left (estimated): %.2fs' % (
                    batch, num_batches, train_err / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        erase_progress(num_back)
        print('train: %d loss: %.4f, time: %.2fs' %
              (num_batches, train_err / train_total, time.time() - start_time))

        # evaluate performance on dev data
        with torch.no_grad():
            network.eval()
            acc, precision, recall, f1 = run_evaluation(
                data_dev, '%s/gpu_%s_dev' % (tmp_folder, gpu_tag))
            print(
                'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                % (acc, precision, recall, f1))

            if dev_f1 < f1:
                dev_f1 = f1
                dev_acc = acc
                dev_precision = precision
                dev_recall = recall
                best_epoch = epoch

                # evaluate on test data when better performance detected
                test_acc, test_precision, test_recall, test_f1 = run_evaluation(
                    data_test, '%s/gpu_%s_test' % (tmp_folder, gpu_tag))
                if best_test_f1 < test_f1:
                    best_test_acc = test_acc
                    best_test_precision = test_precision
                    best_test_recall = test_recall
                    best_test_f1 = test_f1
                    best_test_epoch = epoch

        for line in summary_lines():
            print(line)

        if epoch % schedule == 0:
            # Decay the learning rate. A fresh optimizer is created, which
            # also discards the accumulated momentum buffers.
            lr = learning_rate / (1.0 + epoch * decay_rate)
            optim = make_optimizer(lr)

    with open(result_file_path, 'a') as ofile:
        # One line per summary entry, followed by a blank separator line.
        ofile.write("\n".join(summary_lines()) + "\n\n")
    print('Training finished!')
        # Continue the forward pass through two more feature layers.
        x = model.module.features[2](x)
        x = model.module.features[3](x)
        #x = x.view(x.size(0), model.module.nfscat*3, model.module.nspace, model.module.nspace)
        # Negated activation of channel f_num at position (2, 2) of the first
        # sample: presumably the optimizer minimises `loss`, which then
        # increases this activation.
        loss = -1.0* x[0, f_num, 2, 2]

        #https://towardsdatascience.com/pytorch-implementation-of-perceptual-losses-for-real-time-style-transfer-8d608e2e9902
        #reg_loss = REGULARIZATION * (
        #torch.sum(torch.abs(im_as_var[:, :, :-1] - im_as_var[ :, :, 1:])) +
        #torch.sum(torch.abs(im_as_var[ :, :-1, :] - im_as_var[:, 1:, :]))
        #)
        # Regularisation is currently disabled (see the commented-out term above).
        reg_loss = 0

        loss = loss + reg_loss

        loss.backward()
        optimizer.step()

    # Copy the first image out as a NumPy array with its three axes reversed.
    recreated_im = copy.copy(im_as_var.data.cpu().numpy()[0]).transpose(2,1,0)
    #recreated_im = recreated_im[11:22,11:22,:]
    # Shift so the minimum is 0; dividing by the maximum below then gives
    # values in [0, 1] for imshow.
    minned = recreated_im - np.min(recreated_im)
    ax1 = fig.add_subplot(num_rows, num_cols, importance + 1)
    ax1.imshow(minned/np.max(minned))
    ax1.axis('off')
    ax1.set_xticklabels([])
    ax1.set_yticklabels([])
    # Title: allFilters minus this filter's score, with two decimals.
    ax1.set_title("{0:.2f}".format(allFilters - scores[f_num]))
# Finalise the figure layout, write it to disk and release it.
plt.subplots_adjust(wspace=1.0, hspace=0.1)
plt.savefig("deep_dream_alexnet_l2.png")

plt.close()
def train(opt):
    """Train a ResNet on CIFAR10/CIFAR100 with center and feature-norm losses.

    Depending on ``opt.train_mode`` the model is trained on clean images
    only, or additionally on PGD adversarial images ("at"), optionally with
    an MSE pairing term between adversarial and clean logits ("alp").
    Every epoch runs a "train" and a "val" phase over two disjoint subsets of
    the official training set and prints loss, accuracy and wall time for
    each. The model is saved once after the last epoch.

    :param opt: options object; the attributes read here are ``use_gpu``,
        ``gpu_id``, ``dataset``, ``val_split``, ``batch_size``,
        ``num_workers``, ``metric``, ``train_mode``, ``backbone``,
        ``optimizer``, ``lr``, ``weight_decay``, ``scheduler``, ``lr_step``,
        ``lr_decay`` and ``epoch``.
    """
    # set device to cpu/gpu
    if opt.use_gpu:
        device = torch.device("cuda", opt.gpu_id)
    else:
        device = torch.device("cpu")

    # Data transformations for data augmentation
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.RandomErasing(),
    ])
    transform_val = transforms.Compose([
        transforms.ToTensor(),
    ])

    # Loss weights: ALP pairing term, then [center loss, feature-norm loss].
    alp_lambda = 0.5
    lambda_loss = [0.005, 0.001]

    # get CIFAR10/CIFAR100 train/val set. Both views wrap the official
    # training split; they differ only in the transform and are separated
    # into disjoint subsets by the samplers below.
    dataset_cls = CIFAR10 if opt.dataset == "CIFAR10" else CIFAR100
    train_set = dataset_cls(root="./data", train=True,
                            download=True, transform=transform_train)
    val_set = dataset_cls(root="./data", train=True,
                          download=True, transform=transform_val)
    num_classes = np.unique(train_set.targets).shape[0]

    # Seeded random train/val split.
    # NOTE(review): no `stratify=` is passed, so the split is not stratified
    # by class even though it was originally described as such.
    idx = list(range(len(train_set.targets)))
    train_idx, val_idx, _, _ = train_test_split(
        idx, train_set.targets, test_size=opt.val_split, random_state=42)

    # get train/val samplers
    train_sampler = SubsetRandomSampler(train_idx)
    val_sampler = SubsetRandomSampler(val_idx)

    # get train/val dataloaders
    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              batch_size=opt.batch_size,
                              num_workers=opt.num_workers)
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=opt.batch_size,
                            num_workers=opt.num_workers)

    data_loaders = {"train": train_loader, "val": val_loader}

    print("Dataset -- {}, Metric -- {}, Train Mode -- {}, Backbone -- {}".format(opt.dataset,
                                                                                 opt.metric, opt.train_mode, opt.backbone))
    print("Train iteration batch size: {}".format(opt.batch_size))
    print("Train iterations per epoch: {}".format(len(train_loader)))

    # get backbone model
    if opt.backbone == "resnet18":
        model = resnet18(pretrained=False)
    else:
        model = resnet34(pretrained=False)

    # replace the classifier head with the metric layer
    in_features = model.fc.in_features
    model.fc = Softmax(in_features, num_classes)

    model.to(device)
    if opt.use_gpu:
        model = DataParallel(model).to(device)

    criterion = CrossEntropyLoss()
    mse_criterion = MSELoss()
    cent_criterion = CenterLoss(num_classes, in_features, device)

    # set optimizer and LR scheduler
    if opt.optimizer == "sgd":
        optimizer = SGD([{"params": model.parameters()}],
                        lr=opt.lr, weight_decay=opt.weight_decay, momentum=0.9)
        cent_optimizer = SGD([{"params": cent_criterion.parameters()}],
                             lr=opt.lr, weight_decay=opt.weight_decay, momentum=0.9)
    else:
        optimizer = Adam([{"params": model.parameters()}],
                         lr=opt.lr, weight_decay=opt.weight_decay)
        cent_optimizer = Adam([{"params": cent_criterion.parameters()}],
                              lr=opt.lr, weight_decay=opt.weight_decay)
    if opt.scheduler == "decay":
        scheduler = lr_scheduler.StepLR(
            optimizer, step_size=opt.lr_step, gamma=opt.lr_decay)
    else:
        scheduler = lr_scheduler.ReduceLROnPlateau(
            optimizer, factor=0.1, patience=10)
    # Only the plateau scheduler takes a metric in step(); StepLR would
    # interpret a positional argument as an epoch index.
    steps_on_metric = isinstance(scheduler, lr_scheduler.ReduceLROnPlateau)

    adversarial = opt.train_mode in ("at", "alp")
    use_alp = opt.train_mode == "alp"

    # train/val loop
    for epoch in range(opt.epoch):
        for phase in ["train", "val"]:
            loader = data_loaders[phase]
            is_train = phase == "train"
            total_examples, total_correct, total_loss = 0, 0, 0

            if is_train:
                model.train()
            else:
                model.eval()

            start_time = time.time()
            for images, labels in loader:
                # load data batch to device
                images = images.to(device)
                labels = labels.to(device).long()

                # craft adversarial images before the forward passes
                if adversarial:
                    adv_images = pgd(
                        model, images, labels, 8. / 255, 2. / 255, 7)

                # get feature embedding and logits from resnet
                features, predictions = model(images, labels)
                if adversarial:
                    adv_features, adv_predictions = model(adv_images, labels)

                # center loss, summed squared feature norms and cross-entropy
                # on the clean batch
                cent_loss = cent_criterion(features, labels)
                sq_norm_total = features.pow(2).sum()
                sample_count = features.size(0)
                ce_loss = criterion(predictions, labels)

                # add the same three terms for the adversarial batch
                if adversarial:
                    cent_loss = cent_loss + \
                        cent_criterion(adv_features, labels)
                    sq_norm_total = sq_norm_total + adv_features.pow(2).sum()
                    sample_count = sample_count + adv_features.size(0)
                    ce_loss = ce_loss + criterion(adv_predictions, labels)

                # mean squared feature norm over all samples seen this step
                norm_loss = sq_norm_total / sample_count

                # combine cross-entropy loss, center loss and feature norm
                # loss using lambda weights
                loss = ce_loss + lambda_loss[0] * \
                    cent_loss + lambda_loss[1] * norm_loss
                if use_alp:
                    # pairing loss between adversarial and clean logits
                    loss = loss + alp_lambda * \
                        mse_criterion(adv_predictions, predictions)

                # accuracy is reported on adversarial inputs when available
                if adversarial:
                    predictions = adv_predictions

                optimizer.zero_grad()
                cent_optimizer.zero_grad()

                # only take step if in train phase
                if is_train:
                    loss.backward()
                    optimizer.step()
                    cent_optimizer.step()

                # accumulate train or val results
                predictions = torch.argmax(predictions, 1)
                total_examples += predictions.size(0)
                total_correct += predictions.eq(labels).sum().item()
                total_loss += loss.item()

            # print accumulated train/val results at end of epoch
            end_time = time.time()
            num_batches = len(loader)
            if num_batches == 0:
                # Nothing was processed, so there is nothing to report.
                continue
            acc = total_correct / total_examples
            epoch_loss = total_loss / num_batches
            print("{}: Epoch -- {} Loss -- {:.6f} Acc -- {:.6f} Time -- {:.6f}sec".format(
                phase, epoch, epoch_loss, acc, end_time - start_time))

            if is_train:
                if steps_on_metric:
                    scheduler.step(epoch_loss)
                else:
                    scheduler.step()
            else:
                print("")

    # save model after training for opt.epoch
    save_model(model, opt.dataset, opt.metric, opt.train_mode, opt.backbone)
Beispiel #29
0
class AR1(BaseStrategy):
    """
    The AR1 strategy with Latent Replay.

    This implementation allows for the use of both Synaptic Intelligence and
    Latent Replay to protect the lower levels of the model from forgetting.

    While the original papers show how to use those two techniques in a
    mutually exclusive way, this implementation allows for the use of both of
    them concurrently. This behaviour is controlled by passing proper
    constructor arguments.
    """
    def __init__(self,
                 criterion=None,
                 lr: float = 0.001,
                 momentum=0.9,
                 l2=0.0005,
                 train_epochs: int = 4,
                 init_update_rate: float = 0.01,
                 inc_update_rate=0.00005,
                 max_r_max=1.25,
                 max_d_max=0.5,
                 inc_step=4.1e-05,
                 rm_sz: int = 1500,
                 freeze_below_layer: str = "lat_features.19.bn.beta",
                 latent_layer_num: int = 19,
                 ewc_lambda: float = 0,
                 train_mb_size: int = 128,
                 eval_mb_size: int = 128,
                 device=None,
                 plugins: Optional[Sequence[StrategyPlugin]] = None,
                 evaluator: EvaluationPlugin = default_logger,
                 eval_every=-1):
        """
        Creates an instance of the AR1 strategy.

        :param criterion: The loss criterion to use. Defaults to None, in which
            case the cross entropy loss is used.
        :param lr: The learning rate (SGD optimizer).
        :param momentum: The momentum (SGD optimizer).
        :param l2: The L2 penalty used for weight decay.
        :param train_epochs: The number of training epochs. Defaults to 4.
        :param init_update_rate: The initial update rate of BatchReNorm layers.
        :param inc_update_rate: The incremental update rate of BatchReNorm
            layers.
        :param max_r_max: The maximum r value of BatchReNorm layers.
        :param max_d_max: The maximum d value of BatchReNorm layers.
        :param inc_step: The incremental step of r and d values of BatchReNorm
            layers.
        :param rm_sz: The size of the replay buffer. The replay buffer is shared
            across classes. Defaults to 1500.
        :param freeze_below_layer: A string describing the name of the layer
            to use while freezing the lower (nearest to the input) part of the
            model. The given layer is not frozen (exclusive).
        :param latent_layer_num: The number of the layer to use as the Latent
            Replay Layer. Usually this is the same as `freeze_below_layer`.
        :param ewc_lambda: The Synaptic Intelligence lambda term. Defaults to
            0, which means that the Synaptic Intelligence regularization
            will not be applied.
        :param train_mb_size: The train minibatch size. Defaults to 128.
        :param eval_mb_size: The eval minibatch size. Defaults to 128.
        :param device: The device to use. Defaults to None (cpu).
        :param plugins: (optional) sequence of StrategyPlugins. The given
            sequence is copied and never modified.
        :param evaluator: (optional) instance of EvaluationPlugin for logging
            and metric computations.
        :param eval_every: the frequency of the calls to `eval` inside the
            training loop.
                if -1: no evaluation during training.
                if  0: calls `eval` after the final epoch of each training
                    experience.
                if >0: calls `eval` every `eval_every` epochs and at the end
                    of all the epochs for a single experience.
        """

        warnings.warn("The AR1 strategy implementation is in an alpha stage "
                      "and is not perfectly aligned with the paper "
                      "implementation. Please use at your own risk!")

        # Work on a private list: the S.I. and CWR* plugins are appended
        # below, which must neither mutate the caller's sequence nor fail
        # when an immutable sequence (e.g. a tuple) is passed.
        plugins = [] if plugins is None else list(plugins)

        # Model setup
        model = MobilenetV1(pretrained=True, latent_layer_num=latent_layer_num)
        replace_bn_with_brn(model,
                            momentum=init_update_rate,
                            r_d_max_inc_step=inc_step,
                            max_r_max=max_r_max,
                            max_d_max=max_d_max)

        fc_name, fc_layer = get_last_fc_layer(model)

        if ewc_lambda != 0:
            # Synaptic Intelligence is not applied to the last fully
            # connected layer (and implicitly to "freeze below" ones).
            plugins.append(
                SynapticIntelligencePlugin(ewc_lambda,
                                           excluded_parameters=[fc_name]))

        self.cwr_plugin = CWRStarPlugin(model,
                                        cwr_layer_name=fc_name,
                                        freeze_remaining_model=False)
        plugins.append(self.cwr_plugin)

        optimizer = SGD(model.parameters(),
                        lr=lr,
                        momentum=momentum,
                        weight_decay=l2)

        if criterion is None:
            criterion = CrossEntropyLoss()

        self.ewc_lambda = ewc_lambda
        self.freeze_below_layer = freeze_below_layer
        self.rm_sz = rm_sz
        self.inc_update_rate = inc_update_rate
        self.max_r_max = max_r_max
        self.max_d_max = max_d_max
        self.lr = lr
        self.momentum = momentum
        self.l2 = l2
        # Replay memory: None until the first experience ends, then a
        # two-element list [latent activations, labels].
        self.rm = None
        # Latent activations collected during the first epoch of the
        # current experience.
        self.cur_acts: Optional[Tensor] = None
        # Number of replay patterns injected in each training iteration.
        self.replay_mb_size = 0

        super().__init__(model,
                         optimizer,
                         criterion,
                         train_mb_size=train_mb_size,
                         train_epochs=train_epochs,
                         eval_mb_size=eval_mb_size,
                         device=device,
                         plugins=plugins,
                         evaluator=evaluator,
                         eval_every=eval_every)

    def before_training_exp(self, **kwargs):
        """
        Prepares the model and the optimizer for the next experience.

        The whole model is put in eval mode except for `end_features` and
        `output`. From the second experience onwards the lower layers are
        frozen, the BatchReNorm parameters are changed, the optimizer is
        re-created and the CWR* class counters are updated with the replay
        memory content.
        """
        self.model.eval()
        self.model.end_features.train()
        self.model.output.train()

        if self.training_exp_counter > 0:
            # In AR1 batch 0 is treated differently as the feature extractor is
            # left more free to learn.
            # This if is executed for batch > 0, in which we freeze layers
            # below "self.freeze_below_layer" (which usually is the latent
            # replay layer!) and we also change the parameters of BatchReNorm
            # layers to a more conservative configuration.

            # "freeze_up_to" will freeze layers below "freeze_below_layer"
            # Beware that Batch ReNorm layers are not frozen!
            freeze_up_to(self.model,
                         freeze_until_layer=self.freeze_below_layer,
                         layer_filter=AR1.filter_bn_and_brn)

            # Adapt the parameters of BatchReNorm layers
            change_brn_pars(self.model,
                            momentum=self.inc_update_rate,
                            r_d_max_inc_step=0,
                            r_max=self.max_r_max,
                            d_max=self.max_d_max)

            # Adapt the model and optimizer
            self.model = self.model.to(self.device)
            self.optimizer = SGD(self.model.parameters(),
                                 lr=self.lr,
                                 momentum=self.momentum,
                                 weight_decay=self.l2)

        # super()... will run S.I. and CWR* plugin callbacks
        super().before_training_exp(**kwargs)

        # Update cur_j of CWR* to consider latent patterns
        if self.training_exp_counter > 0:
            for class_id, count in examples_per_class(self.rm[1]).items():
                self.model.cur_j[class_id] += count
            self.cwr_plugin.cur_class = [
                cls for cls in set(self.model.cur_j.keys())
                if self.model.cur_j[cls] > 0
            ]
            self.cwr_plugin.reset_weights(self.cwr_plugin.cur_class)

    def make_train_dataloader(self, num_workers=0, shuffle=True, **kwargs):
        """
        Called after the dataset instantiation. Initialize the data loader.

        For AR1 a "custom" dataloader is used: for every experience after the
        first one the data loader batch size is not `self.train_mb_size` but

        `
        len(train_dataset) // ((len(train_dataset) + self.rm_sz)
        // self.train_mb_size)
        `

        so that the number of iterations required to run an epoch on the current
        batch is equal to the number of iterations required to run an epoch
        on the replay buffer. The remaining
        `self.train_mb_size - data_loader_batch_size` slots of each minibatch
        are stored in `self.replay_mb_size` and are filled with replay
        patterns by `training_epoch`.

        :param num_workers: number of thread workers for the data loading.
        :param shuffle: True if the data should be shuffled, False otherwise.
        """

        # AR1 only supports SIT scenarios (no task labels).
        assert len(self.adapted_dataset.keys()) == 1
        curr_data = list(self.adapted_dataset.values())[0]

        current_batch_mb_size = self.train_mb_size

        if self.training_exp_counter > 0:
            # Count the patterns of the dataset itself: `adapted_dataset` is
            # used as a mapping here (keys()/values()), so its own len()
            # would be the number of entries (1), not the number of patterns.
            train_patterns = len(curr_data)
            # Always plan for at least one iteration: this avoids a division
            # by zero when the current data plus the replay buffer do not
            # fill a single minibatch.
            n_iterations = max(
                1, (train_patterns + self.rm_sz) // self.train_mb_size)
            current_batch_mb_size = train_patterns // n_iterations

        current_batch_mb_size = max(1, current_batch_mb_size)
        self.replay_mb_size = max(0,
                                  self.train_mb_size - current_batch_mb_size)

        self.current_dataloader = DataLoader(curr_data,
                                             num_workers=num_workers,
                                             batch_size=current_batch_mb_size,
                                             shuffle=shuffle)

    def training_epoch(self, **kwargs):
        """
        Runs one training epoch over `self.current_dataloader`.

        From the second experience onwards a slice of the replay memory is
        injected in every iteration (as `latent_input` of the model) and its
        labels are appended to the minibatch labels. During the first epoch
        the latent activations returned by the model are accumulated in
        `self.cur_acts`.
        """
        for self.mb_it, (self.mb_x, self.mb_y, _) in \
                enumerate(self.current_dataloader):
            self.before_training_iteration(**kwargs)

            self.optimizer.zero_grad()
            self.mb_x = self.mb_x.to(self.device)
            self.mb_y = self.mb_y.to(self.device)

            if self.training_exp_counter > 0:
                # Consecutive, non-overlapping slices of the replay memory:
                # iteration k uses patterns [k * size, (k + 1) * size).
                lat_mb_x = self.rm[0][self.mb_it *
                                      self.replay_mb_size:(self.mb_it + 1) *
                                      self.replay_mb_size]
                lat_mb_x = lat_mb_x.to(self.device)
                lat_mb_y = self.rm[1][self.mb_it *
                                      self.replay_mb_size:(self.mb_it + 1) *
                                      self.replay_mb_size]
                lat_mb_y = lat_mb_y.to(self.device)
                self.mb_y = torch.cat((self.mb_y, lat_mb_y), 0)
            else:
                lat_mb_x = None

            # Forward pass. Here we are injecting latent patterns lat_mb_x.
            # lat_mb_x will be None for the very first batch (batch 0), which
            # means that lat_acts.shape[0] == self.mb_x.shape[0].
            self.before_forward(**kwargs)
            self.logits, lat_acts = self.model(self.mb_x,
                                               latent_input=lat_mb_x,
                                               return_lat_acts=True)

            if self.epoch == 0:
                # On the first epoch only: store latent activations. Those
                # activations will be used to update the replay buffer.
                lat_acts = lat_acts.detach().clone().cpu()
                if self.mb_it == 0:
                    self.cur_acts = lat_acts
                else:
                    self.cur_acts = torch.cat((self.cur_acts, lat_acts), 0)
            self.after_forward(**kwargs)

            # Loss & Backward
            # We don't need to handle latent replay, as self.mb_y already
            # contains both current and replay labels.
            self.loss = self.criterion(self.logits, self.mb_y)
            self.before_backward(**kwargs)
            self.loss.backward()
            self.after_backward(**kwargs)

            # Optimization step
            self.before_update(**kwargs)
            self.optimizer.step()
            self.after_update(**kwargs)

            self.after_training_iteration(**kwargs)

    def after_training_exp(self, **kwargs):
        """
        Updates the replay memory with patterns of the experience just
        learned, then runs the plugin callbacks.

        `h` patterns are sampled at random from `self.cur_acts`, where `h` is
        `self.rm_sz // (self.training_exp_counter + 1)` capped at the number
        of stored activations. On the first experience they become the
        replay memory; afterwards they overwrite randomly chosen entries.
        """
        h = min(self.rm_sz // (self.training_exp_counter + 1),
                self.cur_acts.size(0))

        curr_data = self.experience.dataset
        idxs_cur = torch.randperm(self.cur_acts.size(0))[:h]
        # NOTE(review): labels are looked up by the row index of cur_acts,
        # which assumes the rows of cur_acts follow the order of
        # curr_data.targets. cur_acts is filled in data loader order and the
        # loader is created with shuffle=True by default - confirm that the
        # labels match the sampled activations.
        rm_add_y = torch.tensor(
            [curr_data.targets[idx_cur] for idx_cur in idxs_cur])

        rm_add = [self.cur_acts[idxs_cur], rm_add_y]

        # replace patterns in random memory
        if self.training_exp_counter == 0:
            self.rm = rm_add
        else:
            idxs_2_replace = torch.randperm(self.rm[0].size(0))[:h]
            for j, idx in enumerate(idxs_2_replace):
                idx = int(idx)
                self.rm[0][idx] = rm_add[0][j]
                self.rm[1][idx] = rm_add[1][j]

        self.cur_acts = None

        # Runs S.I. and CWR* plugin callbacks
        super().after_training_exp(**kwargs)

    @staticmethod
    def filter_bn_and_brn(param_def: LayerAndParameter):
        """
        Layer filter passed to `freeze_up_to`: returns False for parameters
        whose layer is a `_NormBase` or `BatchRenorm2D` instance, True for
        any other layer.
        """
        return not isinstance(param_def.layer, (_NormBase, BatchRenorm2D))
Beispiel #30
0
def train_model(model_name,
                model,
                lr=LEARNING_RATE,
                epochs=EPOCHS,
                momentum=MOMENTUM,
                weight_decay=0,
                train_loader=training_set_loader,
                test_loader=validation_set_loader):
    """Train ``model`` with SGD and evaluate it after every epoch.

    Every epoch runs a 'train' pass over ``train_loader`` followed by a
    'test' pass over ``test_loader``. Batches are indexed with the keys
    ``'image'`` and ``'label'`` (and ``'img_name'`` when an error is
    reported). The final weights are saved to
    ``RESULTS_PATH/<model_name>/<model_name>.pt``.

    :param model_name: name used for the results directory and weights file.
    :param model: the network to train; moved to the GPU when CUDA is
        enabled and available.
    :param lr: SGD learning rate.
    :param epochs: number of epochs.
    :param momentum: SGD momentum.
    :param weight_decay: SGD weight decay.
    :param train_loader: loader used for the 'train' pass.
    :param test_loader: loader used for the 'test' pass.
    :return: ``(model, (losses, accuracies), y_testing, preds)`` where
        ``losses`` and ``accuracies`` map 'train'/'test' to one value per
        epoch, and ``y_testing`` / ``preds`` hold the labels and predictions
        of the test passes of ALL epochs, concatenated in order.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(RESULTS_PATH + "/" + model_name, exist_ok=True)

    criterion = nn.CrossEntropyLoss()
    optimizer = SGD(model.parameters(),
                    lr,
                    momentum=momentum,
                    weight_decay=weight_decay)

    loaders = {'train': train_loader, 'test': test_loader}
    losses = {'train': [], 'test': []}
    accuracies = {'train': [], 'test': []}

    # testing variables (filled during every 'test' pass)
    y_testing = []
    preds = []

    use_cuda = USE_CUDA and cuda_available
    if use_cuda:
        model = model.cuda()

    for e in range(epochs):
        for mode in ['train', 'test']:
            is_train = mode == 'train'
            if is_train:
                model.train()
            else:
                model.eval()

            epoch_loss = 0
            epoch_acc = 0
            samples = 0
            # Pre-bind the loop variables: the error handler and the summary
            # line below read them even when the loader fails or is empty
            # before producing its first batch.
            i = 0
            batch = None

            try:
                for i, batch in enumerate(loaders[mode]):
                    # convert tensor to variable
                    x = Variable(batch['image'], requires_grad=is_train)
                    y = Variable(batch['label'])

                    if use_cuda:
                        x = x.cuda()
                        y = y.cuda()

                    # No autograd graph is needed (or built) while evaluating
                    with torch.set_grad_enabled(is_train):
                        output = model(x)
                        l = criterion(output, y)  # loss

                    predicted = output.max(1)[1]

                    if is_train:
                        l.backward()
                        optimizer.step()
                        optimizer.zero_grad()
                    else:
                        y_testing.extend(y.data.tolist())
                        preds.extend(predicted.tolist())

                    # .cpu() is a no-op for CPU tensors, so a single code
                    # path serves both the CUDA and the CPU case
                    acc = accuracy_score(y.data.cpu().numpy(),
                                         predicted.cpu().numpy())

                    epoch_loss += l.data.item() * x.shape[0]  # l.data[0]
                    epoch_acc += acc * x.shape[0]
                    samples += x.shape[0]

                    print ("\r[%s] Epoch %d/%d. Iteration %d/%d. Loss: %0.2f. Accuracy: %0.2f" % \
                        (mode, e+1, epochs, i, len(loaders[mode]), epoch_loss/samples, epoch_acc/samples))

                    if DEBUG and i == 2:
                        break
            except Exception as err:
                # Best effort: report the failure and carry on with whatever
                # was accumulated so far for this pass.
                print("\n\n######### ERROR #######")
                print(str(err))
                print("\n\n######### batch #######")
                if batch is not None:
                    print(batch['img_name'])
                print("\n\n")

            if samples:
                epoch_loss /= samples
                epoch_acc /= samples
            else:
                # No batch completed: record NaN instead of dividing by zero
                # (which would hide the error reported above).
                epoch_loss = epoch_acc = float('nan')

            losses[mode].append(epoch_loss)
            accuracies[mode].append(epoch_acc)


            print ("\r[%s] Epoch %d/%d. Iteration %d/%d. Loss: %0.2f. Accuracy: %0.2f" % \
                  (mode, e+1, epochs, i, len(loaders[mode]), epoch_loss, epoch_acc))

    torch.save(
        model.state_dict(),
        str(RESULTS_PATH) + "/" + str(model_name) + "/" + str(model_name) +
        ".pt")
    return model, (losses, accuracies), y_testing, preds
                        batch_loader.chars_vocab_size)

    neg_loss = NEG_loss(params.word_vocab_size, params.word_embed_size)
    if args.use_cuda:
        neg_loss = neg_loss.cuda()

    # NEG_loss is defined over two embedding matrixes with shape of [params.word_vocab_size, params.word_embed_size]
    optimizer = SGD(neg_loss.parameters(), 0.1)

    for iteration in range(args.num_iterations):

        input_idx, target_idx = batch_loader.next_embedding_seq(args.batch_size)

        input = Variable(t.from_numpy(input_idx).long())
        target = Variable(t.from_numpy(target_idx).long())
        if args.use_cuda:
            input, target = input.cuda(), target.cuda()

        out = neg_loss(input, target, args.num_sample).mean()

        optimizer.zero_grad()
        out.backward()
        optimizer.step()

        if iteration % 500 == 0:
            out = out.cpu().data.numpy()[0]
            print('iteration = {}, loss = {}'.format(iteration, out))

    word_embeddings = neg_loss.input_embeddings()
    np.save('data/word_embeddings.npy', word_embeddings)
Beispiel #32
0
def train(train_source_iter: ForeverDataIterator,
          train_target_iter: ForeverDataIterator, classifier: ImageClassifier,
          mdd: MarginDisparityDiscrepancy, optimizer: SGD,
          lr_scheduler: LambdaLR, epoch: int, args: argparse.Namespace):
    """Run ``args.iters_per_epoch`` training iterations on source + target data.

    Each iteration draws one source and one target batch, feeds their
    concatenation through ``classifier`` (which returns the regular and the
    adversarial outputs), and minimizes the source cross-entropy plus
    ``args.trade_off`` times the negated ``mdd`` value. Running statistics
    are shown every ``args.print_freq`` iterations.

    :param train_source_iter: iterator yielding ``(inputs, labels)`` of the
        source domain.
    :param train_target_iter: iterator yielding ``(inputs, labels)`` of the
        target domain (labels are only used to report accuracy).
    :param classifier: model returning ``(outputs, outputs_adv)``.
    :param mdd: margin disparity discrepancy module.
    :param optimizer: optimizer stepped once per iteration.
    :param lr_scheduler: scheduler stepped once per iteration.
    :param epoch: epoch number, used only in the progress prefix.
    :param args: namespace providing ``iters_per_epoch``, ``trade_off`` and
        ``print_freq``.
    """
    step_timer = AverageMeter('Time', ':3.1f')
    load_timer = AverageMeter('Data', ':3.1f')
    loss_meter = AverageMeter('Loss', ':3.2f')
    transfer_meter = AverageMeter('Trans Loss', ':3.2f')
    src_acc_meter = AverageMeter('Cls Acc', ':3.1f')
    tgt_acc_meter = AverageMeter('Tgt Acc', ':3.1f')

    meters = [
        step_timer, load_timer, loss_meter,
        transfer_meter, src_acc_meter, tgt_acc_meter,
    ]
    progress = ProgressMeter(args.iters_per_epoch,
                             meters,
                             prefix="Epoch: [{}]".format(epoch))

    # Both trainable modules go to train mode
    classifier.train()
    mdd.train()

    criterion = nn.CrossEntropyLoss().to(device)

    tick = time.time()
    for step in range(args.iters_per_epoch):
        optimizer.zero_grad()

        src_x, src_labels = next(train_source_iter)
        tgt_x, tgt_labels = next(train_target_iter)

        src_x = src_x.to(device)
        tgt_x = tgt_x.to(device)
        src_labels = src_labels.to(device)
        tgt_labels = tgt_labels.to(device)

        # Time spent fetching the batches and moving them to the device
        load_timer.update(time.time() - tick)

        # One forward pass over both domains; first half of each output is
        # the source part, second half the target part
        logits, adv_logits = classifier(torch.cat((src_x, tgt_x), dim=0))
        src_logits, tgt_logits = logits.chunk(2, dim=0)
        src_adv_logits, tgt_adv_logits = adv_logits.chunk(2, dim=0)

        # Supervised loss on the source domain only
        ce_loss = criterion(src_logits, src_labels)
        # For the adversarial classifier, minimizing the negative mdd is
        # the same as maximizing mdd
        neg_mdd = -mdd(src_logits, src_adv_logits,
                       tgt_logits, tgt_adv_logits)
        total_loss = ce_loss + neg_mdd * args.trade_off
        # NOTE(review): presumably advances an internal per-iteration
        # schedule of the classifier - confirm against ImageClassifier.step
        classifier.step()

        src_acc = accuracy(src_logits, src_labels)[0]
        tgt_acc = accuracy(tgt_logits, tgt_labels)[0]

        n_src = src_x.size(0)
        loss_meter.update(total_loss.item(), n_src)
        src_acc_meter.update(src_acc.item(), n_src)
        tgt_acc_meter.update(tgt_acc.item(), tgt_x.size(0))
        transfer_meter.update(neg_mdd.item(), n_src)

        # Backward pass, parameter update and learning rate schedule
        total_loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Time of the whole iteration
        step_timer.update(time.time() - tick)
        tick = time.time()

        if step % args.print_freq == 0:
            progress.display(step)
Beispiel #33
0
def train(**kwargs):
    """Alternately train the image and the text network, then write results.

    ``kwargs`` are forwarded to ``opt.parse``. Data is loaded from
    ``opt.data_path`` and split into 'train', 'query' and 'retrieval' parts.
    Each epoch first updates the image network, then the text network, on
    random minibatches; the outputs of both networks are cached in two
    buffers whose sign of the sum gives the codes ``B``. The learning rate
    follows a linear schedule from ``opt.lr`` to 1e-6. When ``opt.valid`` is
    set, the models are validated (and saved on improvement of both MAP
    values) after every epoch; otherwise a single validation is run at the
    end. The collected losses and MAP values are passed to ``write_result``.
    """
    opt.parse(kwargs)

    images, tags, labels = load_data(opt.data_path)
    pretrain_model = load_pretrain_model(opt.pretrain_model_path)
    y_dim = tags.shape[1]

    X, Y, L = split_data(images, tags, labels)
    print('...loading and splitting data finish')

    img_model = ImgModule(opt.bit, pretrain_model)
    txt_model = TxtModule(y_dim, opt.bit)
    if opt.use_gpu:
        img_model, txt_model = img_model.cuda(), txt_model.cuda()

    # Labels, images and texts of every split as tensors
    train_L, train_x, train_y = (
        torch.from_numpy(part['train']) for part in (L, X, Y))
    query_L, query_x, query_y = (
        torch.from_numpy(part['query']) for part in (L, X, Y))
    retrieval_L, retrieval_x, retrieval_y = (
        torch.from_numpy(part['retrieval']) for part in (L, X, Y))

    num_train = train_x.shape[0]

    # Caches of the latest image (F) and text (G) network outputs
    F_buffer = torch.randn(num_train, opt.bit)
    G_buffer = torch.randn(num_train, opt.bit)

    if opt.use_gpu:
        train_L = train_L.cuda()
        F_buffer = F_buffer.cuda()
        G_buffer = G_buffer.cuda()

    Sim = calc_neighbor(train_L, train_L)
    B = torch.sign(F_buffer + G_buffer)

    batch_size = opt.batch_size
    steps_per_epoch = num_train // batch_size

    lr = opt.lr
    optimizer_img = SGD(img_model.parameters(), lr=lr)
    optimizer_txt = SGD(txt_model.parameters(), lr=lr)

    # One learning rate per epoch boundary, linearly spaced
    learning_rate = np.linspace(opt.lr, np.power(10, -6.), opt.max_epoch + 1)
    result = {'loss': []}

    # Column vectors of ones sized for the minibatch and for the rest
    ones = torch.ones(batch_size, 1)
    ones_ = torch.ones(num_train - batch_size, 1)

    max_mapi2t = max_mapt2i = 0.

    for epoch in range(opt.max_epoch):
        # ---- image network ----
        for _ in tqdm(range(steps_per_epoch)):
            ind = np.random.permutation(num_train)[:batch_size]
            unupdated_ind = np.setdiff1d(range(num_train), ind)

            sample_L = Variable(train_L[ind, :])
            image = Variable(train_x[ind].float())
            if opt.use_gpu:
                image = image.cuda()
                sample_L = sample_L.cuda()
                ones = ones.cuda()
                ones_ = ones_.cuda()

            # S: (batch_size, num_train)
            S = calc_neighbor(sample_L, train_L)
            # cur_f: (batch_size, bit)
            cur_f = img_model(image)
            F_buffer[ind, :] = cur_f.data
            F = Variable(F_buffer)
            G = Variable(G_buffer)

            theta_x = 0.5 * torch.matmul(cur_f, G.t())
            logloss_x = -(S * theta_x -
                          torch.log(1.0 + torch.exp(theta_x))).sum()
            quantization_x = torch.pow(B[ind, :] - cur_f, 2).sum()
            column_sums_x = cur_f.t().mm(ones) + \
                F[unupdated_ind].t().mm(ones_)
            balance_x = torch.pow(column_sums_x, 2).sum()
            loss_x = logloss_x + opt.gamma * quantization_x \
                + opt.eta * balance_x
            loss_x /= (batch_size * num_train)

            optimizer_img.zero_grad()
            loss_x.backward()
            optimizer_img.step()

        # ---- text network ----
        for _ in tqdm(range(steps_per_epoch)):
            ind = np.random.permutation(num_train)[:batch_size]
            unupdated_ind = np.setdiff1d(range(num_train), ind)

            sample_L = Variable(train_L[ind, :])
            text = Variable(
                train_y[ind, :].unsqueeze(1).unsqueeze(-1).float())
            if opt.use_gpu:
                text = text.cuda()
                sample_L = sample_L.cuda()

            # S: (batch_size, num_train)
            S = calc_neighbor(sample_L, train_L)
            # cur_g: (batch_size, bit)
            cur_g = txt_model(text)
            G_buffer[ind, :] = cur_g.data
            F = Variable(F_buffer)
            G = Variable(G_buffer)

            # theta_y: (batch_size, num_train)
            theta_y = 0.5 * torch.matmul(cur_g, F.t())
            logloss_y = -(S * theta_y -
                          torch.log(1.0 + torch.exp(theta_y))).sum()
            quantization_y = torch.pow(B[ind, :] - cur_g, 2).sum()
            column_sums_y = cur_g.t().mm(ones) + \
                G[unupdated_ind].t().mm(ones_)
            balance_y = torch.pow(column_sums_y, 2).sum()
            loss_y = logloss_y + opt.gamma * quantization_y \
                + opt.eta * balance_y
            loss_y /= (num_train * batch_size)

            optimizer_txt.zero_grad()
            loss_y.backward()
            optimizer_txt.step()

        # Refresh the codes from the updated buffers
        B = torch.sign(F_buffer + G_buffer)

        # Total loss of the epoch (F and G come from the last text step)
        loss = calc_loss(B, F, G, Variable(Sim), opt.gamma, opt.eta)

        print('...epoch: %3d, loss: %3.3f, lr: %f' %
              (epoch + 1, loss.data, lr))
        result['loss'].append(float(loss.data))

        if opt.valid:
            mapi2t, mapt2i = valid(img_model, txt_model, query_x, retrieval_x,
                                   query_y, retrieval_y, query_L, retrieval_L)
            print(
                '...epoch: %3d, valid MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f'
                % (epoch + 1, mapi2t, mapt2i))
            # Keep the checkpoint only when neither direction got worse
            if mapt2i >= max_mapt2i and mapi2t >= max_mapi2t:
                max_mapi2t, max_mapt2i = mapi2t, mapt2i
                img_model.save(img_model.module_name + '.pth')
                txt_model.save(txt_model.module_name + '.pth')

        # Move both optimizers to the learning rate of the next epoch
        lr = learning_rate[epoch + 1]
        for optimizer in (optimizer_img, optimizer_txt):
            for group in optimizer.param_groups:
                group['lr'] = lr

    print('...training procedure finish')
    if opt.valid:
        print('   max MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' %
              (max_mapi2t, max_mapt2i))
        result['mapi2t'] = max_mapi2t
        result['mapt2i'] = max_mapt2i
    else:
        # No per-epoch validation was done: evaluate the final models once
        mapi2t, mapt2i = valid(img_model, txt_model, query_x, retrieval_x,
                               query_y, retrieval_y, query_L, retrieval_L)
        print('   max MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' %
              (mapi2t, mapt2i))
        result['mapi2t'] = mapi2t
        result['mapt2i'] = mapt2i

    write_result(result)
Beispiel #34
0
def train(model, state, path, annotations, val_path, val_annotations, resize, max_size, jitter, batch_size, iterations, val_iterations, mixed_precision, lr, warmup, milestones, gamma, is_master=True, world=1, use_dali=True, verbose=True, metrics_url=None, logdir=None):
    """Train the model on the given dataset.

    Runs SGD until ``iterations`` optimizer steps have been taken in total
    (counting from ``state['iteration']`` when resuming), periodically
    logging, checkpointing and, optionally, validating.

    Args:
        model: network to train. It must expose ``stride`` and ``save(state)``
            and, when called with ``[data, target]``, return the pair
            ``(cls_loss, box_loss)``.
        state: mutable mapping holding checkpoint data. ``'optimizer'`` and
            ``'iteration'`` are read if present; ``'iteration'``,
            ``'optimizer'`` and ``'scheduler'`` are written before each save.
        path, annotations: training data location, forwarded to the data
            iterator.
        val_path, val_annotations: validation data, forwarded to ``infer``.
            Validation is skipped when ``val_annotations`` is falsy.
        resize: forwarded to ``infer`` only.
        max_size, jitter, batch_size: forwarded to the data iterator.
        iterations: total number of training iterations to reach.
        val_iterations: run validation every this many iterations. When it is
            0 or ``None`` validation only runs at the final iteration.
        mixed_precision: selects amp ``opt_level`` ``'O2'`` instead of ``'O0'``.
        lr: base learning rate. It also overrides the rate stored in a
            restored optimizer state.
        warmup: number of warm-up iterations (falsy disables warm-up).
        milestones: iterations at which the rate is multiplied by ``gamma``.
        gamma: multiplicative decay applied at each passed milestone.
        is_master: whether this process logs, checkpoints and checks for
            divergence.
        world: number of participating processes; losses are averaged across
            them when greater than 1.
        use_dali: choose ``DaliDataIterator`` over ``DataIterator``.
        verbose: print progress information.
        metrics_url: if set, metrics are sent there through ``post_metrics``.
        logdir: if not ``None``, TensorBoard summaries are written there.

    Raises:
        RuntimeError: on the master process when the summed loss is not finite.
    """
    print("This is train.py, lr = ", lr)
    # Prepare model
    # Keep a handle on the unwrapped model: checkpoints are saved through it.
    nn_model = model
    stride = model.stride

    model = convert_fixedbn_model(model)
    if torch.cuda.is_available():
        model = model.cuda()

    # Setup optimizer and schedule
    optimizer = SGD(model.parameters(), lr=lr, weight_decay=0.0001, momentum=0.9)

    model, optimizer = amp.initialize(model, optimizer,
                                      opt_level='O2' if mixed_precision else 'O0',
                                      keep_batchnorm_fp32=True,
                                      loss_scale=128.0,
                                      verbosity=is_master)
    print("This is train.py/train, optimizer param_groups, before: ")
    print(optimizer.state_dict()['param_groups'])
    if world > 1:
        model = DistributedDataParallel(model)
    model.train()

    if 'optimizer' in state:
        optimizer.load_state_dict(state['optimizer'])
        # The restored state carries the learning rate of the previous run;
        # force the rate requested for this run instead.
        for g in optimizer.param_groups:
            g['lr'] = lr
            g['initial_lr'] = lr
    print("This is train.py/train, optimizer param_groups, after: ")
    print(optimizer.state_dict()['param_groups'])

    def schedule(train_iter):
        # Linear warm-up from 0.1x to 1x of the base rate, then a step decay
        # of gamma for every milestone already reached.
        if warmup and train_iter <= warmup:
            return 0.9 * train_iter / warmup + 0.1
        return gamma ** sum(1 for m in milestones if m <= train_iter)
    scheduler = LambdaLR(optimizer, schedule)

    # Prepare dataset
    if verbose: print('Preparing dataset...')
    data_iterator = (DaliDataIterator if use_dali else DataIterator)(
        path, jitter, max_size, batch_size, stride,
        world, annotations, training=True)
    if verbose: print(data_iterator)

    if verbose:
        print('    device: {} {}'.format(
            world, 'cpu' if not torch.cuda.is_available() else 'gpu' if world == 1 else 'gpus'))
        print('    batch: {}, precision: {}'.format(batch_size, 'mixed' if mixed_precision else 'full'))
        print('Training model for {} iterations...'.format(iterations))

    # Create TensorBoard writer
    writer = None
    if logdir is not None:
        from tensorboardX import SummaryWriter
        if is_master and verbose:
            print('Writing TensorBoard logs to: {}'.format(logdir))
        writer = SummaryWriter(logdir=logdir)

    profiler = Profiler(['train', 'fw', 'bw'])
    iteration = state.get('iteration', 0)
    try:
        while iteration < iterations:
            cls_losses, box_losses = [], []
            for data, target in data_iterator:
                # The scheduler is driven by the global iteration, which may
                # start above zero when resuming from a saved state.
                scheduler.step(iteration)

                # Forward pass
                profiler.start('fw')

                optimizer.zero_grad()
                cls_loss, box_loss = model([data, target])
                del data
                profiler.stop('fw')

                # Backward pass
                profiler.start('bw')
                with amp.scale_loss(cls_loss + box_loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                optimizer.step()

                # Reduce all losses
                cls_loss, box_loss = cls_loss.mean().clone(), box_loss.mean().clone()
                if world > 1:
                    torch.distributed.all_reduce(cls_loss)
                    torch.distributed.all_reduce(box_loss)
                    cls_loss /= world
                    box_loss /= world
                if is_master:
                    cls_losses.append(cls_loss)
                    box_losses.append(box_loss)

                if is_master and not isfinite(cls_loss + box_loss):
                    raise RuntimeError('Loss is diverging!\n{}'.format(
                        'Try lowering the learning rate.'))

                del cls_loss, box_loss
                profiler.stop('bw')

                iteration += 1
                profiler.bump('train')
                # Report and checkpoint once the profiler's accumulated
                # 'train' total exceeds 60 (presumably seconds -- confirm
                # against Profiler), and always at the last iteration.
                if is_master and (profiler.totals['train'] > 60 or iteration == iterations):
                    focal_loss = torch.stack(list(cls_losses)).mean().item()
                    box_loss = torch.stack(list(box_losses)).mean().item()
                    learning_rate = optimizer.param_groups[0]['lr']
                    if verbose:
                        msg  = '[{:{len}}/{}]'.format(iteration, iterations, len=len(str(iterations)))
                        msg += ' focal loss: {:.3f}'.format(focal_loss)
                        msg += ', box loss: {:.3f}'.format(box_loss)
                        msg += ', {:.3f}s/{}-batch'.format(profiler.means['train'], batch_size)
                        msg += ' (fw: {:.3f}s, bw: {:.3f}s)'.format(profiler.means['fw'], profiler.means['bw'])
                        msg += ', {:.1f} im/s'.format(batch_size / profiler.means['train'])
                        msg += ', lr: {:.2g}'.format(learning_rate)
                        print(msg, flush=True)

                    if writer is not None:
                        writer.add_scalar('focal_loss', focal_loss,  iteration)
                        writer.add_scalar('box_loss', box_loss, iteration)
                        writer.add_scalar('learning_rate', learning_rate, iteration)

                    if metrics_url:
                        # Post the float means computed above rather than
                        # averaging the lists of loss tensors a second time:
                        # plain numbers are what a metrics endpoint needs.
                        post_metrics(metrics_url, {
                            'focal loss': focal_loss,
                            'box loss': box_loss,
                            'im_s': batch_size / profiler.means['train'],
                            'lr': learning_rate
                        })

                    # Save model weights
                    state.update({
                        'iteration': iteration,
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                    })
                    with ignore_sigint():
                        nn_model.save(state)

                    profiler.reset()
                    del cls_losses[:], box_losses[:]

                # Validate at the configured interval and at the very end; a
                # zero/None interval means "only at the end" instead of
                # raising on the modulo.
                if val_annotations and (iteration == iterations or (val_iterations and iteration % val_iterations == 0)):
                    infer(model, val_path, None, resize, max_size, batch_size, annotations=val_annotations,
                        mixed_precision=mixed_precision, is_master=is_master, world=world, use_dali=use_dali, is_validation=True, verbose=False)
                    # Switch back to training mode after validation.
                    model.train()

                if iteration == iterations:
                    break
    finally:
        # Close the writer even when training aborts (e.g. diverging loss) so
        # it is not leaked.
        if writer is not None:
            writer.close()