def generate_inverted_image_specific_layer(self, input_image, img_size, target_layer=3):
        """Reconstruct (invert) an image from the activations of a specific layer.

        Starting from random noise, optimizes an image so that its activations
        at ``target_layer`` match those of ``input_image`` (cf. Mahendran &
        Vedaldi, "Understanding Deep Image Representations by Inverting Them"),
        using alpha-norm and total-variation regularization.

        Args:
            input_image: preprocessed input tensor whose activations are the target.
            img_size: height/width of the (square) image being optimized.
            target_layer: index of the layer whose activations are inverted.

        Side effects:
            Writes intermediate reconstructions to ``../generated/`` every 5
            iterations and prints the loss.
        """
        # Random image that will be optimized towards the target activations
        opt_img = Variable(1e-1 * torch.randn(1, 3, img_size, img_size), requires_grad=True)
        # Optimize the image itself, not any network weights
        optimizer = SGD([opt_img], lr=1e4, momentum=0.9)
        # Target activations: forward pass of the REAL image up to target_layer
        input_image_layer_output = \
            self.get_output_from_specific_layer(input_image, target_layer)

        # Alpha regularization parameters
        # Parameter alpha, which is actually sixth norm
        alpha_reg_alpha = 6
        # The multiplier, lambda alpha
        alpha_reg_lambda = 1e-7

        # Total variation regularization parameters
        # Parameter beta, which is actually second norm
        tv_reg_beta = 2
        # The multiplier, lambda beta
        tv_reg_lambda = 1e-8

        for i in range(201):
            optimizer.zero_grad()
            # Activations of the image currently being optimized
            output = self.get_output_from_specific_layer(opt_img, target_layer)
            # Euclidean distance between target and current activations
            euc_loss = 1e-1 * self.euclidian_loss(input_image_layer_output.detach(), output)
            # Alpha-norm regularization keeps pixel magnitudes bounded
            reg_alpha = alpha_reg_lambda * self.alpha_norm(opt_img, alpha_reg_alpha)
            # Total-variation regularization encourages piecewise-smooth images
            reg_total_variation = tv_reg_lambda * self.total_variation_norm(opt_img,
                                                                            tv_reg_beta)
            # Sum all to optimize
            loss = euc_loss + reg_alpha + reg_total_variation
            # Step
            loss.backward()
            optimizer.step()
            # Save the current reconstruction every 5 iterations
            if i % 5 == 0:
                # BUGFIX: loss.data.numpy()[0] raises IndexError on the 0-dim
                # loss tensor in PyTorch >= 0.4; loss.item() works everywhere.
                print('Iteration:', str(i), 'Loss:', loss.item())
                x = recreate_image(opt_img)
                cv2.imwrite('../generated/Inv_Image_Layer_' + str(target_layer) +
                            '_Iteration_' + str(i) + '.jpg', x)
            # Reduce learning rate every 40 iterations.
            # NOTE(review): this also fires at i == 0, so the initial lr of 1e4
            # is used for a single step only -- confirm this is intentional.
            if i % 40 == 0:
                for param_group in optimizer.param_groups:
                    param_group['lr'] *= 1/10
Beispiel #2
0
        state = state.cuda()
        hx = hx.cuda()
        cx = cx.cuda()
        score = score.cuda()

    last_state = torch.squeeze(last_state, 0)
    last_hx = torch.squeeze(last_hx, 0)
    last_cx = torch.squeeze(last_cx, 0)
    cmd = torch.squeeze(cmd, 0)
    last_score = torch.squeeze(last_score, 0)
    state = torch.squeeze(state, 0)
    hx = torch.squeeze(hx, 0)
    cx = torch.squeeze(cx, 0)
    score = torch.squeeze(score, 0)

    optim.zero_grad()
    model_pred.load_state_dict(model.state_dict())

    last_score_pred, cmd_pred, _ = model(last_state, last_hx, last_cx)
    score_pred, _, _ = model_pred(state, hx, cx)

    r = score - last_score
    r_pred = score_pred - last_score_pred

    loss = criterion((cmd_pred, r_pred), (cmd, r), score)
    loss.backward()

    if (i + 1) % 100 == 0:
        print('Iter:%d | loss:%.4f' % (i + 1, loss.item()))

    if (i + 1) % 1000 == 0:
Beispiel #3
0
    def train(self, epochs):
        """Fit the clustering model (DEC-style) on top of the encoder.

        Cluster centers are initialized with k-means over the encoder's
        features; training then minimizes the KL divergence between the soft
        cluster assignments and their sharpened target distribution, with
        early stopping once the fraction of samples that change cluster drops
        below ``self.stopping_delta``.

        :param epochs: maximum number of epochs to train for.
        """
        self.cm = Cluster_Model(self.encoder.module)
        self.cm = nn.DataParallel(self.cm)
        assert self.encoder.module == self.cm.module.encoder
        optimizer = SGD(self.cm.parameters(),
                        lr=config.cluster_model_train_lr,
                        momentum=0.9)

        data_iterator = tqdm(
            self.dataloader,
            leave=True,  # BUGFIX: was the string 'True'; tqdm expects a bool
            unit='batch',
            postfix={
                'epoch': -1,
                'loss': '%.6f' % 0.0,
                'dlb': '%.4f' % 0.0,
            })
        # NOTE(review): the n_jobs argument was removed from KMeans in
        # scikit-learn >= 1.0 -- drop it if this raises a TypeError.
        km = KMeans(n_clusters=self.n_components,
                    n_init=max(20, self.n_hidden_features),
                    n_jobs=-1)
        self.cm.train()
        self.cm.to(config.device)
        # Pass 1: collect encoder features (and ground-truth labels when the
        # dataloader yields (input, label) pairs) for k-means initialization.
        features = []
        actual = []
        for index, batch in enumerate(data_iterator):
            if ((isinstance(batch, tuple) or isinstance(batch, list))
                    and len(batch) == 2):
                batch, value = batch
                actual.append(value)
            batch = batch.cuda(non_blocking=True)
            features.append(self.cm.module.encoder(batch).detach().cpu())
        actual = torch.cat(actual).long()
        predicted = km.fit_predict(torch.cat(features).numpy())
        predicted_previous = torch.tensor(np.copy(predicted), dtype=torch.long)

        # Seed the assignment layer with the k-means centroids.
        cluster_centers = torch.tensor(km.cluster_centers_,
                                       dtype=torch.float,
                                       requires_grad=True)
        cluster_centers = cluster_centers.cuda(non_blocking=True)
        with torch.no_grad():
            self.cm.module.state_dict()['assignment.cluster_centers'].copy_(
                cluster_centers)
        # BUGFIX: size_average=False is deprecated; reduction='sum' is the
        # exact equivalent in current PyTorch.
        loss_function = nn.KLDivLoss(reduction='sum')
        delta_label = None
        epoch = -1  # BUGFIX: keep `epoch` bound for plot_train when epochs == 0
        for epoch in range(epochs):
            features = []
            data_iterator = tqdm(self.dataloader,
                                 leave=True,  # BUGFIX: was the string 'True'
                                 unit='batch',
                                 postfix={
                                     'epoch': epoch,
                                     'loss': '%.8f' % 0.0,
                                     'dlb': '%.4f' % (delta_label or 0.0)
                                 })
            self.cm.train()
            for index, batch in enumerate(data_iterator):
                if ((isinstance(batch, tuple) or isinstance(batch, list))
                        and len(batch) == 2):
                    batch, _ = batch
                batch = batch.cuda(non_blocking=True)
                output = self.cm(batch)
                # Sharpened target distribution; detached so only the model
                # side of the KL divergence receives gradients.
                target = target_distribution(output).detach()
                # Batch-mean KL divergence (sum reduction / batch size).
                loss = loss_function(output.log(), target) / output.shape[0]
                data_iterator.set_postfix(epoch=epoch,
                                          loss='%.8f' % float(loss.item()),
                                          dlb='%.4f' % (delta_label or 0.0))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step(closure=None)
                features.append(self.cm.module.encoder(batch).detach().cpu())
                if index % 10 == 0:  # update_freq = 10
                    loss_value = float(loss.item())
                    data_iterator.set_postfix(
                        epoch=epoch,
                        loss='%.8f' % loss_value,
                        dlb='%.4f' % (delta_label or 0.0),
                    )
            predicted, actual = self.predict()
            # Fraction of samples whose cluster assignment changed this epoch.
            delta_label = float(
                (predicted != predicted_previous
                 ).float().sum().item()) / predicted_previous.shape[0]
            if self.stopping_delta is not None and delta_label < self.stopping_delta:
                # BUGFIX: corrected typo "tahn" -> "than" in the message
                print(
                    'Early stopping as label delta "%1.5f" less than "%1.5f".'
                    % (delta_label, self.stopping_delta))
                break
            predicted_previous = predicted

        if (config.plot_clustering):
            self.plot_train(self.cm, self.n_components, epoch)
        self.encoder = self.cm.module.encoder
        print("training dec ended.")
def train_val_model(model: nn.Module,
                    dataloaders: dict,
                    criterion: nn.CrossEntropyLoss,
                    optimizer: optim.SGD,
                    num_epochs=num_epochs,
                    is_inception=False):
    '''
    Train and validate ``model``, keeping the weights from the epoch with the
    best validation accuracy. Batches are moved to the module-level ``device``.

    :param model: network to train
    :param dataloaders: dict with 'train' and 'val' DataLoaders
    :param criterion: loss function
    :param optimizer: optimizer over the model's parameters
    :param num_epochs: number of epochs (defaults to the module-level value)
    :param is_inception: if True, also use Inception's auxiliary classifier loss
    :return: (model with the best weights loaded, list of per-epoch val accuracies)
    '''
    print("************* train_and_valid begined!")
    since = time.time()
    val_acc_history = []
    best_acc = 0.0
    best_model_wts = copy.deepcopy(model.state_dict())

    print('---Epoch train_and_valid begined')
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # Track gradients only during the training phase
                with torch.set_grad_enabled(
                        phase == 'train'
                ):  # torch.set_grad_enabled(True) if (phase=='train')
                    if is_inception and phase == 'train':
                        # Inception returns (main, aux) outputs when training;
                        # the auxiliary loss is weighted by 0.4 as in the paper.
                        outputs, aux_outputs = model(inputs)
                        loss1 = criterion(outputs, labels)
                        loss2 = criterion(aux_outputs, labels)
                        loss = loss1 + 0.4 * loss2
                    else:
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                # Multiply by the batch size because CrossEntropyLoss averages
                # the loss across the observations of each minibatch
                # (see the CrossEntropyLoss / _WeightedLoss documentation).
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(
                dataloaders[phase].dataset)

            # BUGFIX: '{:4f}' means minimum-width 4, not 4 decimal places;
            # '{:.4f}' was clearly intended.
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))
            # deep copy the model whenever validation accuracy improves
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)
        print('---Epoch {}/{} finished!'.format(epoch, num_epochs - 1))
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    print("************* train_and_valid finished!")
    return model, val_acc_history
Beispiel #5
0
def train_model(model: nn.Module,
                optimizer: optim.SGD,
                epochs: int,
                device: torch.device,
                train_dataloader: DataLoader,
                val_dataloader: DataLoader,
                logger: SummaryWriter,
                print_interval: int = 50):
    """Run a standard train/validation loop for ``epochs`` epochs.

    :param model: network to optimize
    :param optimizer: optimizer over the model's parameters
    :param epochs: number of epochs to run
    :param device: device batches are moved to
    :param train_dataloader: training batches of (data, target)
    :param val_dataloader: validation batches of (data, target) TODO: make optional
    :param logger: TensorBoard writer for val loss/accuracy TODO: make optional
    :param print_interval: print a progress line every this many train batches
        TODO: add as argument to argparser
    :return: None; progress is printed and logged as a side effect
    """
    print("Training Model...")
    start = time.time()
    train_step, val_step = 0, 0
    for epoch in tqdm(range(epochs)):
        # ---- training phase ----
        model.train()
        for batch_idx, (data, target) in enumerate(train_dataloader):
            data = data.to(device)
            target = target.to(device)
            optimizer.zero_grad()
            # forward + backward + parameter update
            logits = model(data)
            loss = F.cross_entropy(logits, target)
            loss.backward()
            optimizer.step()
            train_step += 1
            # periodic console progress (includes batch 0)
            if batch_idx % print_interval == 0:
                print_train_step(epoch, epochs, batch_idx,
                                 len(train_dataloader), loss.item())
        # ---- validation phase ----
        model.eval()
        correct, samples = 0, 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(val_dataloader):
                data = data.to(device)
                target = target.to(device)
                logits = model(data)
                target_hat = torch.argmax(logits, dim=1)
                val_loss = F.cross_entropy(logits, target)
                correct += torch.sum(target == target_hat).item()
                samples += len(target)
                # per-batch validation loss logging
                logger.add_scalar('Loss/val', val_loss.item(), val_step)
                val_step += 1
        # epoch-level validation accuracy logging
        val_acc = correct / (samples * 1.0)
        logger.add_scalar('Acc/val', val_acc, epoch)
        print_validation_step(curr_epoch=epoch, epochs=epochs, val_acc=val_acc)
    end = time.time()
    print('Total Training Time: %.2f min\n' % ((end - start) / 60))
Beispiel #6
0
                              shuffle=True)
    dev_loader = DataLoader(dev_data.to_numpy(), batch_size=32)

    num_epochs = 15
    model = MatrixFactorizer(N, M)
    optimizer = SGD(model.parameters(), lr=0.01, weight_decay=1e-2)

    train_losses, dev_losses = [], []
    for epoch in range(1, num_epochs + 1):
        epoch_train_loss = 0.0
        num_train_batches = 0
        num_dev_batches = 0
        epoch_dev_loss = 0.0
        model.train()
        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()
            users = batch[:, 0].long()
            movies = batch[:, 1].long()
            ratings = (batch[:, 2] - rating_mean).float()
            ratings_guess = model(users, movies)
            batch_loss = torch.pow(ratings - ratings_guess, 2).mean()
            batch_loss.backward()
            optimizer.step()
            num_train_batches += 1
            epoch_train_loss += batch_loss.item()
        model.eval()
        for batch_idx, batch in enumerate(dev_loader):
            users = batch[:, 0].long()
            movies = batch[:, 1].long()
            ratings = (batch[:, 2] - rating_mean).float()
            with torch.no_grad():
def main():
    """Warm up a classifier, then learn a sparse supermask over its weights.

    For each round: trains ``args.arch`` on ImageNet-style data for
    ``args.epochs_warmup`` epochs, then monkey-patches every Linear/Conv2d
    layer with a trainable ``weight_mask`` and trains the masks with an
    L1 sparsity penalty until at most ``args.keep_ratio`` of the mask entries
    exceed 1e-3. Finally thresholds the masks to the top ``keep_ratio``
    fraction and saves both the rewind model and the binary masks.

    Relies on module-level ``args``, ``utils``, ``log``/``init_logfile``,
    ``model_inference`` and the ``mask_forward_*`` functions.
    """

    if not os.path.exists(args.outdir):
        os.mkdir(args.outdir)

    device = torch.device("cuda")
    torch.cuda.set_device(args.gpu)

    # One shared logfile for all rounds
    logfilename = os.path.join(args.outdir, args.logname)

    init_logfile(logfilename,
                 "epoch\ttime\tlr\ttrain loss\ttrain acc\ttestloss\ttest acc")
    log(logfilename, "Hyperparameter List")
    log(logfilename, "Epochs: {:}".format(args.epochs))
    log(logfilename, "Learning Rate: {:}".format(args.lr))
    log(logfilename, "Alpha: {:}".format(args.alpha))
    log(logfilename, "Keep ratio: {:}".format(args.keep_ratio))
    log(logfilename, "Warmup Epochs: {:}".format(args.epochs_warmup))

    # NOTE(review): test_acc_list is never appended to in this function
    test_acc_list = []
    for _ in range(args.round):
        # Standard ImageNet-style data pipeline (augmented train, center-crop val)
        traindir = os.path.join(args.data_train, 'train')
        valdir = os.path.join(args.data_val, 'val')
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

        train_dataset = datasets.ImageFolder(
            traindir,
            transforms.Compose([
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]))

        train_sampler = None

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch,
            shuffle=(train_sampler is None),
            num_workers=args.workers,
            pin_memory=True,
            sampler=train_sampler)

        test_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
                                                  batch_size=args.batch,
                                                  shuffle=False,
                                                  num_workers=args.workers,
                                                  pin_memory=True)

        # Fresh, untrained network for this round
        base_classifier = models.__dict__[args.arch](pretrained=False).cuda()
        print("Loaded the base_classifier")

        criterion = nn.CrossEntropyLoss().to(device)
        optimizer = SGD(base_classifier.parameters(),
                        lr=args.lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay)

        # Warmup training for the rewinding.
        for epoch in range(args.epochs_warmup):
            print("Warmup Training Epochs: {:}".format(epoch))
            log(logfilename, "Warmup current epochs: {}".format(epoch))
            train_loss, train_top1, train_top5 = utils.train(train_loader,
                                                             base_classifier,
                                                             criterion,
                                                             optimizer,
                                                             epoch,
                                                             device,
                                                             print_freq=100,
                                                             display=True)

        original_acc = model_inference(base_classifier,
                                       test_loader,
                                       device,
                                       display=True)
        log(logfilename,
            "Warmup Model Test Accuracy: {:.5}".format(original_acc))
        print("Warmup Model Test Accuracy, ", original_acc)

        # Creating a fresh copy of network not affecting the original network.
        # Goal is to find the supermask.

        net = copy.deepcopy(base_classifier)
        net = net.to(device)

        # Generating the mask 'm': one trainable weight_mask per Linear/Conv2d,
        # initialized to all-ones (i.e. keep everything)
        for layer in net.modules():
            if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
                layer.weight_mask = nn.Parameter(torch.ones_like(layer.weight))

                layer.weight.requires_grad = True
                layer.weight_mask.requires_grad = True

            # This is the monkey-patch overriding layer.forward to custom function.
            # layer.forward will pass nn.Linear with weights: 'w' and 'm' elementwised
            if isinstance(layer, nn.Linear):
                layer.forward = types.MethodType(mask_forward_linear, layer)

            if isinstance(layer, nn.Conv2d):
                layer.forward = types.MethodType(mask_forward_conv2d, layer)

        criterion = nn.CrossEntropyLoss().to(
            device)  # Criterion for training the mask.
        optimizer = SGD(net.parameters(),
                        lr=args.lr,
                        momentum=args.momentum,
                        weight_decay=0)
        # weight_decay = 0 for training the mask.
        #         warm_scheduler = StepLR(optimizer, step_size=args.epochs_mask-10, gamma=0.2)

        sparsity, total = 0, 0
        breakFlag = False
        net.train()
        # Training the mask with the training set. The epoch bound is a large
        # sentinel; the real exit is the sparsity target check below.
        for epoch in range(100000):
            #             if epoch % 5 == 0:
            print("Current epochs: ", epoch)
            print("Sparsity: {:}".format(sparsity))
            log(logfilename, "Current epochs: {}".format(epoch))
            log(logfilename, "Sparsity: {:}".format(sparsity))

            for i, (inputs, targets) in enumerate(train_loader):
                inputs = inputs.cuda()
                targets = targets.cuda()

                # L1 penalty on all masks drives them towards zero (sparsity)
                reg_loss = 0
                for layer in net.modules():
                    if isinstance(layer, nn.Conv2d) or isinstance(
                            layer, nn.Linear):
                        reg_loss += torch.norm(layer.weight_mask, p=1)
                outputs = net(inputs)
                loss = criterion(outputs, targets) + args.alpha * reg_loss

                # Computing gradient and do SGD
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Recount how many mask entries are still "on" (> 1e-3)
                sparsity, total = 0, 0
                for layer in net.modules():
                    if isinstance(layer, nn.Linear) or isinstance(
                            layer, nn.Conv2d):
                        boolean_list = layer.weight_mask.data > 1e-3
                        sparsity += (boolean_list == 1).sum()
                        total += layer.weight.numel()

                if i % 50 == 0:
                    print(
                        "Current Epochs: {}, Current i: {}, Current Sparsity: {}"
                        .format(epoch, i, sparsity))

                # Stop once the mask is sparse enough to satisfy keep_ratio
                if sparsity <= total * args.keep_ratio:
                    print("Current epochs breaking loop at {:}".format(epoch))
                    log(logfilename,
                        "Current epochs breaking loop at {:}".format(epoch))
                    breakFlag = True
                    break
#                 if breakFlag == True:
#                     break
            if breakFlag == True:
                break


#                     print("W 1-norm: ", torch.norm(layer.weight_mask, p=1))

# Just checking the 1-norm of weights in each layer.
# Approximates how sparse the mask is..

# This line allows to calculate the threshold to satisfy the keep_ratio.
        c_abs = []
        for layer in net.modules():
            if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
                c_abs.append(torch.abs(layer.weight_mask))

        # Threshold = smallest of the top keep_ratio fraction of mask magnitudes
        all_scores = torch.cat([torch.flatten(x) for x in c_abs])
        num_params_to_keep = int(len(all_scores) * args.keep_ratio)
        threshold, _ = torch.topk(all_scores, num_params_to_keep, sorted=True)
        threshold = threshold[-1]

        print("Threshold found: ", threshold)

        # Binarize the masks against the threshold
        keep_masks = []
        for c in c_abs:
            keep_masks.append((c >= threshold).float())
        print(
            "Number of ones.",
            torch.sum(torch.cat([torch.flatten(x == 1) for x in keep_masks])))

        # Save the warmup (rewind) weights, not the mask-trained net
        torch.save(base_classifier.state_dict(),
                   os.path.join(args.outdir, args.save_model))
        base_classifier_acc = model_inference(base_classifier,
                                              test_loader,
                                              device,
                                              display=True)
        log(logfilename,
            "Weight Update Test Accuracy: {:.5}".format(base_classifier_acc))
        print("Saved the rewind model.")
        # NOTE(review): rebinding the loop variable has no effect; this loop
        # is a no-op -- presumably meant to detach the masks. Confirm intent.
        for masks in keep_masks:
            masks = masks.data

        torch.save(keep_masks, os.path.join(args.outdir, args.keep_mask))
        print("Saved the masking function.")
        log(logfilename, "Finished finding the mask. (REWIND)")
Beispiel #8
0
def train(**kwargs):
    opt.parse(kwargs)
    alpha = [0.2,0.5,0.8,1.0,1.3.1.5.1.8.2.0,2.5]
    images, tags, labels = load_data(opt.data_path)
    pretrain_model = load_pretrain_model(opt.pretrain_model_path)
    y_dim = tags.shape[1]
    label_num = labels.shape[1]
    X, Y, L = split_data(images, tags, labels)
    print('...loading and splitting data finish')
    img_model = ImgModule(opt.bit, pretrain_model)
    txt_model = TxtModule(y_dim, opt.bit)
    hash_model = HashModule(opt.bit)
    label_model = LabModule(label_num)
    if opt.use_gpu:
        img_model = img_model.cuda()
        txt_model = txt_model.cuda()
        hash_model = hash_model.cuda()
        label_model = label_model.cuda()
    train_L = torch.from_numpy(L['train'])
    train_x = torch.from_numpy(X['train'])
    train_y = torch.from_numpy(Y['train'])

    query_L = torch.from_numpy(L['query'])
    query_x = torch.from_numpy(X['query'])
    query_y = torch.from_numpy(Y['query'])

    retrieval_L = torch.from_numpy(L['retrieval'])
    retrieval_x = torch.from_numpy(X['retrieval'])
    retrieval_y = torch.from_numpy(Y['retrieval'])

    num_train = train_x.shape[0]

    F_buffer = torch.randn(num_train, opt.bit)
    G_buffer = torch.randn(num_train, opt.bit)
    X_fea_buffer = torch.randn(num_train, opt.X_fea_nums)
    Y_fea_buffer = torch.randn(num_train,opt.Y_fea_nums)
    X_label_buffer = torch.randn(num_train, label_num)
    Y_label_buffer = torch.randn(num_train, label_num)
    
    Label_buffer = torch.randn(num_train, label_num)
    Label_hash_buffer = torch.randn(num_train, opt.bit)
    Label_label_buffer = torch.randn(num_train, label_num)
    
    if opt.use_gpu:
        train_L = train_L.cuda()
        F_buffer = F_buffer.cuda()
        G_buffer = G_buffer.cuda()
        X_fea_buffer = X_fea_buffer.cuda()
        Y_fea_buffer = Y_fea_buffer.cuda()
        Label_buffer = Label_buffer.cuda()
        X_label_buffer = X_label_buffer.cuda()
        Y_label_buffer =  Y_label_buffer.cuda()
        Label_hash_buffer = Label_hash_buffer.cuda()
        Label_label_buffer = Label_label_buffer.cuda()
    Sim = calc_neighbor(train_L, train_L)
    ###############ddddddd
    B = torch.sign(F_buffer + G_buffer)
    B_buffer = torch.sign(F_buffer + G_buffer)
    batch_size = opt.batch_size

    lr = opt.lr
    optimizer_img = SGD(img_model.parameters(), lr=lr)
    optimizer_txt = SGD(txt_model.parameters(), lr=lr)
    optimizer_hash = SGD(hash_model.parameters(), lr=lr)
    optimizer_label = SGD(label_model.parameters(), lr=lr)

    learning_rate = np.linspace(opt.lr, np.power(10, -6.), opt.max_epoch + 1)
    result = {
        'loss': [],
        'hash_loss' : [],
        'total_loss' : []
    }

    ones = torch.ones(batch_size, 1)
    ones_ = torch.ones(num_train - batch_size, 1)
    unupdated_size = num_train - batch_size

    max_mapi2t = max_mapt2i = 0.

    for epoch in range(opt.max_epoch):
        # train label net
        for i in tqdm(range(num_train // batch_size)):
            index = np.random.permutation(num_train)
            ind = index[0: batch_size]
            unupdated_ind = np.setdiff1d(range(num_train), ind)
            sample_L = Variable(train_L[ind, :])
            label = Variable(train_L[ind,:].unsqueeze(1).unsqueeze(-1).type(torch.float))
            if opt.use_gpu:
                label = label.cuda()
                sample_L = sample_L.cuda()
            # similar matrix size: (batch_size, num_train)
            S = calc_neighbor(sample_L, train_L)
            label_hash, label_label = label_model(label)  #
            Label_hash_buffer[ind, :] = label_hash.data
            Label_label_buffer[ind, :] = label_label.data
            Label = Variable(train_L)
            Label_B = torch.sign(label_hash)
            Label_H = Variable(Label_hash_buffer) 
            
            theta_l = 1.0 / 2 * torch.matmul(label_hash, Label_H.t())
            logloss_l = -torch.sum(S * theta_l - torch.log(1.0 + torch.exp(theta_l)))
            quantization_l = torch.sum(torch.pow(Label_hash_buffer[ind, :] - Label_B, 2))
            labelloss_l = torch.sum(torch.pow(Label[ind, :].float() - label_label, 2))
            loss_label = logloss_l + opt.beta * quantization_l + opt.alpha * labelloss_l  # + logloss_x_fea
            loss_label /= (batch_size * num_train)

            optimizer_label.zero_grad()
            loss_label.backward()
            optimizer_label.step()
        # train image net
        for i in tqdm(range(num_train // batch_size)):
            index = np.random.permutation(num_train)
            ind = index[0: batch_size]
            unupdated_ind = np.setdiff1d(range(num_train), ind)
            sample_L = Variable(train_L[ind, :])
            image = Variable(train_x[ind].type(torch.float))
            if opt.use_gpu:
                image = image.cuda()
                sample_L = sample_L.cuda()
            # similar matrix size: (batch_size, num_train)
            S = calc_neighbor(sample_L, train_L)  # S: (batch_size, num_train)
            image_fea, cur_f, image_label = img_model(image)  # cur_f: (batch_size, bit)
            X_fea_buffer[ind, :] = image_fea.data
            F_buffer[ind, :] = cur_f.data
            X_label_buffer[ind, :] = image_label.data
            G = Variable(G_buffer)
            H_l = Variable(Label_hash_buffer)
            B_x = torch.sign(F_buffer)

            theta_x = 1.0 / 2 * torch.matmul(cur_f, H_l.t())
            logloss_x = -torch.sum(S * theta_x - torch.log(1.0 + torch.exp(theta_x)))
            quantization_xh = torch.sum(torch.pow(B_buffer[ind, :] - cur_f, 2))
            quantization_xb = torch.sum(torch.pow(B_x[ind, :]- cur_f, 2))
            labelloss_x = torch.sum(torch.pow(train_L[ind, :].float() - image_label,2))
            loss_x = logloss_x + opt.beta * quantization_xh + opt.alpha * labelloss_x + opt.gamma * quantization_xb# + logloss_x_fea
            loss_x /= (batch_size * num_train)

            optimizer_img.zero_grad()
            loss_x.backward()
            optimizer_img.step()
        # train txt net
        for i in tqdm(range(num_train // batch_size)):
            index = np.random.permutation(num_train)
            ind = index[0: batch_size]
            unupdated_ind = np.setdiff1d(range(num_train), ind)
            sample_L = Variable(train_L[ind, :])
            text = train_y[ind, :].unsqueeze(1).unsqueeze(-1).type(torch.float)
            text = Variable(text)
            if opt.use_gpu:
                text = text.cuda()
                sample_L = sample_L.cuda()
            # similar matrix size: (batch_size, num_train)
            S = calc_neighbor(sample_L, train_L)  # S: (batch_size, num_train)
            txt_fea, cur_g, txt_label = txt_model(text)  # cur_f: (batch_size, bit)
            Y_fea_buffer[ind, :] = txt_fea.data
            G_buffer[ind, :] = cur_g.data
            Y_label_buffer[ind, :] = txt_label.data
            F = Variable(F_buffer)
            H_l = Variable(Label_hash_buffer)
            B_y = torch.sign(F)
            # calculate loss
            # theta_y: (batch_size, num_train)
            theta_y = 1.0 / 2 * torch.matmul(cur_g, H_l.t())
            logloss_y = -torch.sum(S * theta_y - torch.log(1.0 + torch.exp(theta_y)))
            quantization_yh = torch.sum(torch.pow(B_buffer[ind, :] - cur_g, 2))
            quantization_yb = torch.sum(torch.pow(B_y[ind, :] - cur_g, 2))
            labelloss_y = torch.sum(torch.pow(train_L[ind, :].float() - txt_label, 2))
            loss_y = logloss_y + opt.beta * quantization_yh + opt.alpha * labelloss_y + opt.gamma * quantization_yb# + logloss_y_fea
            loss_y /= (num_train * batch_size)
        
            optimizer_txt.zero_grad()
            loss_y.backward()
            optimizer_txt.step()

        #train hash net
        for i in tqdm(range(num_train // batch_size)):
            index = np.random.permutation(num_train)
            ind = index[0: batch_size]
            unupdated_ind = np.setdiff1d(range(num_train), ind)
            
            sample_L = Variable(train_L[ind, :])
            #W = norm(X_fea_buffer[ind, :], Y_fea_buffer[ind, :])
            #fea = 1.0 / 2 * (torch.matmul(W, X_fea_buffer[ind, :]) + torch.matmul(W, Y_fea_buffer[ind, :]))
            fea = torch.cat([X_fea_buffer[ind, :], Y_fea_buffer[ind, :]], dim=1)
            fea = Variable(fea)
            if opt.use_gpu:
                fea = fea.cuda()
                sample_L = sample_L.cuda()
            S = calc_neighbor(sample_L, train_L)
            A = caculateAdj(sample_L, sample_L)
            cur_B, label_hash = hash_model(fea, A)
            B_buffer[ind, :] = cur_B.data
            #caculate loss
            B = Variable(torch.sign(B_buffer))
            theta_hash = 1.0 / 2 * torch.matmul(cur_B, B_buffer.t())
            logloss_hash = -torch.sum(S * theta_hash - torch.log(1.0 + torch.exp(theta_hash)))
            label_loss = torch.sum(torch.pow(train_L[ind, :].float() - label_hash, 2))
            hashloss = torch.sum(torch.pow(B[ind, :] - cur_B, 2))
            loss_hash = logloss_hash + opt.alpha * label_loss + opt.beta * hashloss

            optimizer_hash.zero_grad()
            loss_hash.backward()
            optimizer_hash.step()
        # train image net
        for i in tqdm(range(num_train // batch_size)):
            index = np.random.permutation(num_train)
            ind = index[0: batch_size]
            unupdated_ind = np.setdiff1d(range(num_train), ind)
            sample_L = Variable(train_L[ind, :])
            image = Variable(train_x[ind].type(torch.float))
            if opt.use_gpu:
                image = image.cuda()
                sample_L = sample_L.cuda()
            # similar matrix size: (batch_size, num_train)
            S = calc_neighbor(sample_L, train_L)  # S: (batch_size, num_train)
            image_fea, cur_f, image_label = img_model(image)  # cur_f: (batch_size, bit)
            X_fea_buffer[ind, :] = image_fea.data
            F_buffer[ind, :] = cur_f.data
            X_label_buffer[ind, :] = image_label.data
            G = Variable(G_buffer)
            H_l = Variable(Label_hash_buffer)
            B_x = torch.sign(F_buffer)

            theta_x = 1.0 / 2 * torch.matmul(cur_f, H_l.t())
            logloss_x = -torch.sum(S * theta_x - torch.log(1.0 + torch.exp(theta_x)))
            quantization_xh = torch.sum(torch.pow(B_buffer[ind, :] - cur_f, 2))
            quantization_xb = torch.sum(torch.pow(B_x[ind, :] - cur_f, 2))
            labelloss_x = torch.sum(torch.pow(train_L[ind, :].float() - image_label, 2))
            loss_x = logloss_x + opt.gamma * quantization_xh + opt.alpha * labelloss_x + opt.beta * quantization_xb  # + logloss_x_fea
            loss_x /= (batch_size * num_train)

            optimizer_img.zero_grad()
            loss_x.backward()
            optimizer_img.step()
        # train txt net
        for i in tqdm(range(num_train // batch_size)):
            index = np.random.permutation(num_train)
            ind = index[0: batch_size]
            unupdated_ind = np.setdiff1d(range(num_train), ind)
            sample_L = Variable(train_L[ind, :])
            text = train_y[ind, :].unsqueeze(1).unsqueeze(-1).type(torch.float)
            text = Variable(text)
            if opt.use_gpu:
                text = text.cuda()
                sample_L = sample_L.cuda()
            # similar matrix size: (batch_size, num_train)
            S = calc_neighbor(sample_L, train_L)  # S: (batch_size, num_train)
            txt_fea, cur_g, txt_label = txt_model(text)  # cur_f: (batch_size, bit)
            Y_fea_buffer[ind, :] = txt_fea.data
            G_buffer[ind, :] = cur_g.data
            Y_label_buffer[ind, :] = txt_label.data
            F = Variable(F_buffer)
            H_l = Variable(Label_hash_buffer)
            B_y = torch.sign(F)
            # calculate loss
            # theta_y: (batch_size, num_train)
            theta_y = 1.0 / 2 * torch.matmul(cur_g, H_l.t())
            logloss_y = -torch.sum(S * theta_y - torch.log(1.0 + torch.exp(theta_y)))
            quantization_yh = torch.sum(torch.pow(B_buffer[ind, :] - cur_g, 2))
            quantization_yb = torch.sum(torch.pow(B_y[ind, :] - cur_g, 2))
            labelloss_y = torch.sum(torch.pow(train_L[ind, :].float() - txt_label, 2))
            loss_y = logloss_y + opt.gamma * quantization_yh + opt.alpha * labelloss_y + opt.beta * quantization_yb  # + logloss_y_fea
            loss_y /= (num_train * batch_size)

            optimizer_txt.zero_grad()
            loss_y.backward()
            optimizer_txt.step()

        # calculate total loss
        loss, hash_loss, total_loss = calc_loss(B, F, G, Variable(Sim), opt.alpha, opt.beta,Label_buffer, train_L, X_label_buffer,Y_label_buffer)

        print('...epoch: %3d, loss: %3.3f, lr: %f' % (epoch + 1, loss.data, lr))
        print('...epoch: %3d, hash_loss: %3.3f, lr: %f' % (epoch + 1, hash_loss.data, lr))
        print('...epoch: %3d, total_loss: %3.3f, lr: %f' % (epoch + 1, total_loss.data, lr))
        result['loss'].append(float(loss.data))
        result['hash_loss'].append(float(hash_loss.data))
        result['total_loss'].append(float(total_loss.data))

        if opt.valid:
            mapi2t, mapt2i = valid(img_model, txt_model, query_x, retrieval_x, query_y, retrieval_y,
                                   query_L, retrieval_L)
            print('...epoch: %3d, valid MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' % (epoch + 1, mapi2t, mapt2i))
            if mapt2i >= max_mapt2i and mapi2t >= max_mapi2t:
                max_mapi2t = mapi2t
                max_mapt2i = mapt2i
                img_model.save(img_model.module_name + '.pth')
                txt_model.save(txt_model.module_name + '.pth')
                hash_model.save(hash_model.module_name+'.pth')

        lr = learning_rate[epoch + 1]

        # set learning rate
        for param in optimizer_img.param_groups:
            param['lr'] = lr
        for param in optimizer_txt.param_groups:
            param['lr'] = lr

    print('...training procedure finish')
    if opt.valid:
        print('   max MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' % (max_mapi2t, max_mapt2i))
        result['mapi2t'] = max_mapi2t
        result['mapt2i'] = max_mapt2i
    else:
        mapi2t, mapt2i = valid(img_model, txt_model, query_x, retrieval_x, query_y, retrieval_y,
                               query_L, retrieval_L)
        print('   max MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' % (mapi2t, mapt2i))
        result['mapi2t'] = mapi2t
        result['mapt2i'] = mapt2i

    write_result(result)
# Beispiel #9
# 0
def train(args, model, tokenizer):
    """Train the model on TPU via torch_xla.

    Builds the dataloader, an AdamW/SGD optimizer with weight-decay parameter
    groups and a warmup LR schedule, then runs the epoch/step loop with
    gradient accumulation. Resumes the global step counter from
    ``<model_name_or_path>/step.txt`` if present, and saves state on exit or
    interrupt. Returns ``(global_step, moving_loss.loss)``.
    """
    # TensorBoard writer exists only on the master ordinal; all workers call
    # summary_write(), which is a no-op for non-masters.
    if xm.is_master_ordinal():
        tb_writer = SummaryWriterP(args.output_dir)

    def summary_write(*args, **kwargs):
        # Log a scalar from the master process only, to avoid duplicate events.
        if xm.is_master_ordinal():
            tb_writer.add_scalar(*args, **kwargs)

    args.train_batch_size = args.per_gpu_train_batch_size  #* max(1, args.n_gpu)

    train_dataloader = build_dataloader(args, tokenizer)

    # Either cap training by max_steps (deriving the epoch count from it), or
    # run num_train_epochs and derive the total optimization steps.
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Bias and LayerNorm weights are excluded from weight decay, as is
    # conventional for transformer fine-tuning.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if p.requires_grad and not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if p.requires_grad and any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    # Scale learning rate to num cores
    #args.learning_rate = args.learning_rate * xm.xrt_world_size()
    if args.sgd:
        optimizer = SGD(optimizer_grouped_parameters, lr=args.learning_rate)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
    # Warmup length is specified in samples and converted to optimizer steps
    # across all TPU cores.
    warmup_steps = args.warmup_samples // (args.train_batch_size *
                                           xm.xrt_world_size())
    if args.lr_decay:
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=warmup_steps,
                                         t_total=t_total)
    elif args.lr_cosine:
        scheduler = WarmupCosineWithHardRestartsSchedule(
            optimizer,
            warmup_steps=warmup_steps,
            t_total=t_total,
            cycles=args.num_train_epochs)
    else:
        scheduler = WarmupZeroSchedule(optimizer, warmup_steps=warmup_steps)

    # Train!
    tracker = xm.RateTracker()
    log_info("***** Running training *****")
    log_info("  Num Epochs = %d", args.num_train_epochs)
    log_info("  Instantaneous batch size per GPU = %d",
             args.per_gpu_train_batch_size)
    log_info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (xm.xrt_world_size() if args.local_rank != -1 else 1))
    log_info("  Gradient Accumulation steps = %d",
             args.gradient_accumulation_steps)
    log_info("  Total optimization steps = %d", t_total)

    # Resume the step counter from a previous run's checkpoint, if any.
    try:
        with open(os.path.join(args.model_name_or_path, 'step.txt'), 'r') as c:
            global_step = int(c.readline())
    except OSError as e:
        global_step = 0

    # Moving average window sized so the smoothing span is roughly constant
    # regardless of how often we log.
    moving_loss = MovingLoss(10000 // args.logging_steps)

    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=not xm.is_master_ordinal())
    try:
        for epoch in train_iterator:
            # A fresh ParallelLoader is required each epoch to feed this
            # process's TPU device.
            p_train_dataloader = pl.ParallelLoader(train_dataloader,
                                                   [args.device])
            epoch_iterator = tqdm(p_train_dataloader.per_device_loader(
                args.device),
                                  total=len(train_dataloader),
                                  desc="Iteration",
                                  disable=not xm.is_master_ordinal())

            model.train()
            for step, batch in enumerate(epoch_iterator):
                optimizer.zero_grad()
                # MLM masks tokens for the labels; CLM uses the batch as both
                # input and target.
                inputs, labels = mask_tokens(
                    batch, tokenizer, args) if args.mlm else (batch, batch)
                outputs = model(
                    inputs, masked_lm_labels=labels) if args.mlm else model(
                        inputs, labels=labels)
                loss = outputs[
                    0]  # model outputs are always tuple in pytorch-transformers (see doc)

                if args.n_gpu > 1:
                    loss = loss.mean(
                    )  # mean() to average on multi-gpu parallel training
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                # Only every gradient_accumulation_steps micro-batches do we
                # clip, step the optimizer (with a TPU rendezvous barrier),
                # advance the schedule and count an optimization step.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                    xm.optimizer_step(optimizer, barrier=True)
                    scheduler.step()
                    global_step += 1
                    tracker.add(args.train_batch_size)

                    if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        ls = loss.item(
                        )  # weird. if you call loss.item() only in one process, the whole thing hangs. So call on every and log in one.
                        moving_loss.add(ls)
                        summary_write('lr',
                                      scheduler.get_last_lr()[0], global_step)
                        epoch_iterator.set_postfix(
                            MovingLoss=f'{moving_loss.loss:.2f}',
                            Perplexity=
                            f'{torch.exp(torch.tensor(moving_loss.loss)):.2f}')

                    if args.save_steps > 0 and global_step % args.save_steps == 0:
                        save_state(args, model, tokenizer, global_step)

                # NOTE(review): debug leftover — this terminates the entire
                # run after 3 micro-batches and dumps TPU metrics. Remove (or
                # gate behind a flag) before any real training.
                if step >= 2:  # TPU seems to like consistent epoch lenght
                    if xm.is_master_ordinal():
                        print(met.metrics_report())
                    exit(0)

                #    epoch_iterator.close()
                #    break

                if args.max_steps > 0 and step > args.max_steps:
                    epoch_iterator.close()
                    break

            # evaluate once in an epoch
            if args.evaluate_during_training:
                results = evaluate(args, model, tokenizer,
                                   f"checkpoint-{global_step}")
                log_info(f"Eval {results}")
                for key, value in results.items():
                    summary_write("eval_{}".format(key), value, global_step)

            # reload dataset every args.reload_data_file epochs
            if args.reload_data_file and (epoch +
                                          1) % args.reload_data_file == 0:
                train_dataloader = build_dataloader(args, tokenizer)

            # that's very slow on TPU
            #print_sample(model, tokenizer, args.device, args)

    except (KeyboardInterrupt, SystemExit):
        # Save a checkpoint on manual interrupt before propagating.
        save_state(args, model, tokenizer, global_step)
        raise

    save_state(args, model, tokenizer, global_step)

    return global_step, moving_loss.loss
# Beispiel #10
# 0
    def compute(self, config, budget, **kwargs):
        """Runs the training session.

        This training session will also save all the data on its runs (e.g.
        config, loss, accuracy) into the logging dir

        Args:
            config (dict): Dictionary containing the configuration by the
                optimizer (keys: 'optimizer', 'lr', 'bs', 'first_layer',
                'second_layer', 'leaky1'..'leaky3', and 'momentum' or
                'epsilon' depending on the optimizer choice)
            budget (int): Amount of epochs the model can use to train.

        Returns:
            dict: dictionary with fields 'loss' (float) and 'info' (dict)
        """
        # Start with printouts
        print("\n\n")
        print(
            "================================================================"
            "=======")
        print("\nStarting run {} with config:.".format(self.run_count))
        print("    Optimizer: {}".format(config['optimizer']))
        print("    Learning rate: {}".format(config['lr']))
        print("    Batch size: {}".format(config['bs']))
        print("    First layer: {}".format(config['first_layer']))
        print("    Second layer: {}".format(config['second_layer']))
        print("    Leaky config: {}, {}, {}".format(config['leaky1'],
                                                    config['leaky2'],
                                                    config['leaky3']))
        # Set network, dataloader, optimizer, and loss criterion
        train_loader = DataLoader(self.train_data, config['bs'], shuffle=True)
        test_loader = DataLoader(self.test_data, config['bs'], shuffle=True)

        network = FCNetwork(784, 10, config['first_layer'],
                            config['second_layer'],
                            (config['leaky1'], config['leaky2'],
                             config['leaky3'])).to(device=self.device)

        if config['optimizer'] == 'sgd':
            optimizer = SGD(network.parameters(), config['lr'],
                            config['momentum'])
        else:
            optimizer = Adam(network.parameters(),
                             config['lr'],
                             eps=config['epsilon'])
        loss_crit = CrossEntropyLoss()

        # Increment run count number
        self.run_count += 1

        # Start actual training loop
        for epoch in range(int(budget)):

            # Do training loop
            network.train()
            for i, (img, cls) in enumerate(train_loader):
                img = img.to(self.device)
                cls = cls.to(self.device)
                optimizer.zero_grad()
                h1, h2, out = network(img)
                # BUG FIX: pass the raw logits to CrossEntropyLoss.
                # The previous code applied `out.softmax(1)` first, but
                # CrossEntropyLoss already combines log-softmax and NLL, so
                # softmax-ing twice flattens the gradients and slows training.
                loss = loss_crit(out, cls)

                # Periodic progress printout (frequency scales inversely with
                # batch size so the wall-clock cadence stays roughly constant).
                # Softmax is monotone per row, so argmax-based accuracy is the
                # same whether we pass logits or probabilities.
                if i % int(1000 / (config['bs'] / 4)) == 0:
                    print("Iteration {},    \tepoch: {}, \tLoss: {:.4f},  "
                          "\taccuracy: {:.2f}%".format(
                              i + 1, epoch + 1, loss.item(),
                              self.calc_batch_accuracy(out, cls) * 100))
                # Do backprop
                loss.backward()
                optimizer.step()

        # Final metrics on both splits after the budgeted epochs.
        train_loss, train_acc = self.evaluate_network(network, loss_crit,
                                                      train_loader)
        validation_loss, validation_accuracy = self.evaluate_network(
            network, loss_crit, test_loader)

        # Print out results
        print(
            "================================================================"
            "=======")
        print("Validation accuracy: {:.4f}%".format(validation_accuracy *
                                                    100.))
        print("Validation loss:     {:.4f}".format(validation_loss))
        print("Training accuracy:   {:.4f}%".format(train_acc * 100))
        print("Training loss:       {:.4f}".format(train_loss))

        # BOHB-style result: minimize 1 - validation accuracy.
        return {
            'loss': 1 - validation_accuracy,
            'info': {
                'validation accuracy': validation_accuracy,
                'validation loss': validation_loss,
                'training loss': train_loss,
                'training accuracy': train_acc
            }
        }
# Beispiel #11
# 0
def train(model,
          state,
          path,
          annotations,
          val_path,
          val_annotations,
          resize,
          max_size,
          jitter,
          batch_size,
          iterations,
          val_iterations,
          mixed_precision,
          lr,
          warmup,
          milestones,
          gamma,
          is_master=True,
          world=1,
          use_dali=True,
          verbose=True,
          metrics_url=None,
          logdir=None):
    """Train the model on the given dataset.

    Runs SGD (momentum 0.9, weight decay 1e-4) under NVIDIA apex AMP until
    ``iterations`` optimizer steps have been taken, with a linear warmup then
    milestone step-decay LR schedule. Periodically (~every 60s of train time)
    logs losses, optionally writes TensorBoard scalars / POSTs metrics, and
    checkpoints via ``nn_model.save(state)``. Validates with ``infer`` every
    ``val_iterations`` steps when ``val_annotations`` is provided.

    ``model`` must expose ``.stride`` and ``.save(state)``; ``state`` may
    carry 'optimizer' and 'iteration' entries to resume a previous run.
    """

    # Prepare model
    # Keep a handle on the raw model: it gets re-bound below by the AMP /
    # DistributedDataParallel wrappers, but checkpoints are saved through
    # the unwrapped object.
    nn_model = model
    stride = model.stride

    # Freeze batch-norm statistics (fixed BN is standard for detector
    # fine-tuning with small per-GPU batches).
    model = convert_fixedbn_model(model)
    if torch.cuda.is_available():
        model = model.cuda()

    # Setup optimizer and schedule
    optimizer = SGD(model.parameters(),
                    lr=lr,
                    weight_decay=0.0001,
                    momentum=0.9)

    # O2 = mixed precision with FP32 batch-norm; O0 = pure FP32 passthrough.
    model, optimizer = amp.initialize(
        model,
        optimizer,
        opt_level='O2' if mixed_precision else 'O0',
        keep_batchnorm_fp32=True,
        loss_scale=128.0,
        verbosity=is_master)

    if world > 1:
        model = DistributedDataParallel(model)
    model.train()

    if 'optimizer' in state:
        optimizer.load_state_dict(state['optimizer'])

    def schedule(train_iter):
        # LR multiplier: ramp linearly from 0.1 to 1.0 over `warmup` steps,
        # then decay by `gamma` at each milestone passed.
        if warmup and train_iter <= warmup:
            return 0.9 * train_iter / warmup + 0.1
        return gamma**len([m for m in milestones if m <= train_iter])

    # NOTE(review): `optimizer.optimizer` presumably unwraps an apex-wrapped
    # optimizer under mixed precision — confirm against the apex version used.
    scheduler = LambdaLR(optimizer.optimizer if mixed_precision else optimizer,
                         schedule)

    # Prepare dataset
    if verbose: print('Preparing dataset...')
    data_iterator = (DaliDataIterator if use_dali else DataIterator)(
        path,
        jitter,
        max_size,
        batch_size,
        stride,
        world,
        annotations,
        training=True)
    if verbose: print(data_iterator)

    if verbose:
        print('    device: {} {}'.format(
            world, 'cpu' if not torch.cuda.is_available() else
            'gpu' if world == 1 else 'gpus'))
        print('    batch: {}, precision: {}'.format(
            batch_size, 'mixed' if mixed_precision else 'full'))
        print('Training model for {} iterations...'.format(iterations))

    # Create TensorBoard writer
    if logdir is not None:
        from tensorboardX import SummaryWriter
        if is_master and verbose:
            print('Writing TensorBoard logs to: {}'.format(logdir))
        writer = SummaryWriter(log_dir=logdir)

    profiler = Profiler(['train', 'fw', 'bw'])
    iteration = state.get('iteration', 0)
    # Outer while re-enters the (finite) data iterator until the target
    # iteration count is reached.
    while iteration < iterations:
        cls_losses, box_losses = [], []
        for i, (data, target) in enumerate(data_iterator):
            # NOTE(review): scheduler.step(iteration) before optimizer.step()
            # is the legacy (pre-PyTorch-1.1) ordering; kept as-is.
            scheduler.step(iteration)

            # Forward pass
            profiler.start('fw')

            optimizer.zero_grad()
            cls_loss, box_loss = model([data, target])
            del data
            profiler.stop('fw')

            # Backward pass
            # amp.scale_loss scales the loss for FP16 stability and unscales
            # gradients before the optimizer step.
            profiler.start('bw')
            with amp.scale_loss(cls_loss + box_loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()

            # Reduce all losses
            cls_loss, box_loss = cls_loss.mean().clone(), box_loss.mean(
            ).clone()
            if world > 1:
                torch.distributed.all_reduce(cls_loss)
                torch.distributed.all_reduce(box_loss)
                cls_loss /= world
                box_loss /= world
            if is_master:
                cls_losses.append(cls_loss)
                box_losses.append(box_loss)

            # Fail fast on NaN/Inf losses instead of training into garbage.
            if is_master and not isfinite(cls_loss + box_loss):
                raise RuntimeError('Loss is diverging!\n{}'.format(
                    'Try lowering the learning rate.'))

            del cls_loss, box_loss
            profiler.stop('bw')

            iteration += 1
            profiler.bump('train')
            # Log + checkpoint roughly once a minute of accumulated train
            # time, and always on the final iteration.
            if is_master and (profiler.totals['train'] > 60
                              or iteration == iterations):
                focal_loss = torch.stack(list(cls_losses)).mean().item()
                box_loss = torch.stack(list(box_losses)).mean().item()
                learning_rate = optimizer.param_groups[0]['lr']
                if verbose:
                    msg = '[{:{len}}/{}]'.format(iteration,
                                                 iterations,
                                                 len=len(str(iterations)))
                    msg += ' focal loss: {:.3f}'.format(focal_loss)
                    msg += ', box loss: {:.3f}'.format(box_loss)
                    msg += ', {:.3f}s/{}-batch'.format(profiler.means['train'],
                                                       batch_size)
                    msg += ' (fw: {:.3f}s, bw: {:.3f}s)'.format(
                        profiler.means['fw'], profiler.means['bw'])
                    msg += ', {:.1f} im/s'.format(batch_size /
                                                  profiler.means['train'])
                    msg += ', lr: {:.2g}'.format(learning_rate)
                    print(msg, flush=True)

                if logdir is not None:
                    writer.add_scalar('focal_loss', focal_loss, iteration)
                    writer.add_scalar('box_loss', box_loss, iteration)
                    writer.add_scalar('learning_rate', learning_rate,
                                      iteration)
                    del box_loss, focal_loss

                # NOTE(review): mean() here receives a list of 0-d tensors —
                # verify the imported `mean` accepts them (statistics.mean
                # would return a tensor-ish value, not a float).
                if metrics_url:
                    post_metrics(
                        metrics_url, {
                            'focal loss': mean(cls_losses),
                            'box loss': mean(box_losses),
                            'im_s': batch_size / profiler.means['train'],
                            'lr': learning_rate
                        })

                # Save model weights
                state.update({
                    'iteration': iteration,
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict(),
                })
                # Defer Ctrl-C while writing so the checkpoint isn't corrupted.
                with ignore_sigint():
                    nn_model.save(state)

                profiler.reset()
                del cls_losses[:], box_losses[:]

            # Periodic validation; infer() flips the model to eval mode, so
            # restore train mode afterwards.
            if val_annotations and (iteration == iterations
                                    or iteration % val_iterations == 0):
                infer(model,
                      val_path,
                      None,
                      resize,
                      max_size,
                      batch_size,
                      annotations=val_annotations,
                      mixed_precision=mixed_precision,
                      is_master=is_master,
                      world=world,
                      use_dali=use_dali,
                      verbose=False)
                model.train()

            if iteration == iterations:
                break

    if logdir is not None:
        writer.close()
# Beispiel #12
# 0
class Trainer:
    """Adversarial trainer for unsupervised bidirectional translation.

    A single shared encoder/decoder RNN pair plus per-language embeddings and
    output heads are wired into four Seq2Seq directions (src2src, src2tgt,
    tgt2tgt, tgt2src). ``train_step`` combines denoising autoencoding,
    back-translation (using the frozen pretrained models to produce the
    intermediate translations), and an adversarial game against a language
    discriminator operating on encoder states.
    """

    def __init__(self,
                 frozen_src2tgt: Seq2Seq,
                 frozen_tgt2src: Seq2Seq,
                 src_embedding: Embedding,
                 tgt_embedding: Embedding,
                 encoder_rnn,
                 decoder_rnn,
                 attention: Attention,
                 src_hat: DecoderHat,
                 tgt_hat: DecoderHat,
                 discriminator: Discriminator,
                 src_sos_index,
                 tgt_sos_index,
                 src_eos_index,
                 tgt_eos_index,
                 src_pad_index,
                 tgt_pad_index,
                 device,
                 lr_core=1e-3,
                 lr_disc=1e-3):
        """Wire shared modules into four translation directions.

        ``frozen_src2tgt``/``frozen_tgt2src`` are fixed pretrained models used
        only to generate back-translation inputs; the remaining modules are
        trainable. ``*_sos/eos/pad_index`` are the special-token ids for each
        language; ``lr_core``/``lr_disc`` are the SGD learning rates for the
        translation stack and the discriminator respectively.
        """
        # The discriminator consumes encoder states, so its input width must
        # match the encoder output size (doubled for a bidirectional encoder).
        assert discriminator.hidden_size == (encoder_rnn.bidirectional +
                                             1) * encoder_rnn.hidden_size

        self.frozen_src2tgt = frozen_src2tgt
        self.frozen_tgt2src = frozen_tgt2src
        self.src_embedding = src_embedding
        self.tgt_embedding = tgt_embedding
        self.encoder_rnn = encoder_rnn
        self.decoder_rnn = decoder_rnn
        self.attention = attention
        self.src_hat = src_hat
        self.tgt_hat = tgt_hat
        # Group every trainable translation module so one optimizer and one
        # .to(device) call cover them all.
        self.core_model = nn.ModuleList([
            self.src_embedding, self.tgt_embedding, self.encoder_rnn,
            self.decoder_rnn, self.attention, self.src_hat, self.tgt_hat
        ])
        self.discriminator = discriminator
        self.src_sos_index = src_sos_index
        self.tgt_sos_index = tgt_sos_index
        self.src_eos_index = src_eos_index
        self.tgt_eos_index = tgt_eos_index
        self.src_pad_index = src_pad_index
        self.tgt_pad_index = tgt_pad_index
        self.device = device

        self.core_model.to(device)
        self.discriminator.to(device)

        use_cuda = device.type == 'cuda'
        # Four directions built from the SAME underlying modules: the encoder
        # is shared everywhere; embeddings/heads are picked per language.
        self.src2src = Seq2Seq(src_embedding, encoder_rnn, src_embedding,
                               attention, decoder_rnn, src_hat, use_cuda)
        self.src2tgt = Seq2Seq(src_embedding, encoder_rnn, tgt_embedding,
                               attention, decoder_rnn, tgt_hat, use_cuda)
        self.tgt2tgt = Seq2Seq(tgt_embedding, encoder_rnn, tgt_embedding,
                               attention, decoder_rnn, tgt_hat, use_cuda)
        self.tgt2src = Seq2Seq(tgt_embedding, encoder_rnn, src_embedding,
                               attention, decoder_rnn, src_hat, use_cuda)

        self.core_optimizer = SGD(self.core_model.parameters(), lr=lr_core)
        self.discriminator_optimizer = SGD(self.discriminator.parameters(),
                                           lr=lr_disc)

    def train_step(self,
                   batch,
                   weights=(1, 1, 1),
                   drop_probability=0.1,
                   permutation_constraint=3):
        """Run one optimization step on a {'src': ..., 'tgt': ...} batch.

        ``weights`` scales the three core-loss terms in order:
        (autoencoding, back-translation, adversarial). ``drop_probability``
        and ``permutation_constraint`` parameterize the ``noise`` corruption
        applied to every encoder input.
        """
        batch = {l: t.to(self.device) for l, t in batch.items()}

        # Denoising autoencoding: reconstruct each language from its own
        # corrupted input.
        src2src_dec, src2src_enc = self.src2src(
            noise(batch['src'], self.src_pad_index, drop_probability,
                  permutation_constraint), self.src_sos_index, batch['src'])
        tgt2tgt_dec, tgt2tgt_enc = self.tgt2tgt(
            noise(batch['tgt'], self.tgt_pad_index, drop_probability,
                  permutation_constraint), self.tgt_sos_index, batch['tgt'])
        # Back-translation: the frozen model produces a translation, which is
        # corrupted and translated back toward the original sentence.
        tgt2src_dec, tgt2src_enc = self.tgt2src(
            noise(self.frozen_src2tgt(batch['src']), self.tgt_pad_index,
                  drop_probability, permutation_constraint),
            self.src_sos_index, batch['src'])
        src2tgt_dec, src2tgt_enc = self.src2tgt(
            noise(self.frozen_tgt2src(batch['tgt']), self.src_pad_index,
                  drop_probability, permutation_constraint),
            self.tgt_sos_index, batch['tgt'])

        # autoencoding
        core_loss = weights[0] * (translation_loss(src2src_dec, batch['src']) +
                                  translation_loss(tgt2tgt_dec, batch['tgt']))

        # translating
        core_loss += weights[1] * (
            translation_loss(tgt2src_dec, batch['src']) +
            translation_loss(src2tgt_dec, batch['tgt']))

        # beating discriminator
        # Labels are deliberately FLIPPED here: the core model is rewarded
        # when the discriminator mistakes the encoding's language.
        core_loss += weights[2] * (
            classification_loss(self.discriminator(src2src_enc), 'tgt') +
            classification_loss(self.discriminator(tgt2tgt_enc), 'src') +
            classification_loss(self.discriminator(tgt2src_enc), 'src') +
            classification_loss(self.discriminator(src2tgt_enc), 'tgt'))

        # training discriminator
        # The discriminator itself is trained on the TRUE language of each
        # encoding (labels un-flipped relative to the block above).
        discriminator_loss = classification_loss(self.discriminator(src2src_enc), 'src') + \
                             classification_loss(self.discriminator(tgt2tgt_enc), 'tgt') + \
                             classification_loss(self.discriminator(tgt2src_enc), 'tgt') + \
                             classification_loss(self.discriminator(src2tgt_enc), 'src')

        # update core model's parameters
        # retain_graph=True: the discriminator loss re-backprops through the
        # same encoder activations, so the graph must survive the first pass.
        self.core_optimizer.zero_grad()
        core_loss.backward(retain_graph=True)
        self.core_optimizer.step()

        # update discriminator parameters
        self.discriminator_optimizer.zero_grad()
        discriminator_loss.backward()
        self.discriminator_optimizer.step()
# Beispiel #13
# 0
    def test_hybrid_batch_gradients(self, qnn_type: str):
        """Test gradient back-prop for batch input in a qnn.

        Verifies that summing per-sample losses/gradients matches a single
        batched forward/backward pass (up to 4 decimal places).
        """
        import torch
        from torch.nn import MSELoss
        from torch.optim import SGD

        qnn: Optional[Union[CircuitQNN, TwoLayerQNN]] = None
        if qnn_type == "opflow":
            qnn = self._create_opflow_qnn()
            out_dim = 1
        elif qnn_type == "circuit_qnn":
            qnn = self._create_circuit_qnn()
            out_dim = 2
        else:
            raise ValueError("Unsupported QNN type")

        model = self._create_network(qnn, output_size=out_dim)
        model.to(self._device)

        # Random data set shared by both passes.
        features = torch.rand((5, 4), device=self._device)
        targets = torch.rand((5, 2), device=self._device)

        # Optimizer (used only to clear gradients) and summed-MSE loss.
        optimizer = SGD(model.parameters(), lr=0.1)
        criterion = MSELoss(reduction="sum")

        def weight_grad_total():
            # Sum the gradients of every ".weight" parameter, asserting along
            # the way that each parameter actually received a gradient.
            total = 0.0
            for name, param in model.named_parameters():
                self.assertFalse(param.grad is None)
                if name.endswith(".weight"):
                    total += np.sum(param.grad.detach().cpu().numpy())
            return total

        # Pass 1: accumulate the loss sample by sample, then back-propagate.
        optimizer.zero_grad(set_to_none=True)
        per_sample_loss = 0.0
        for feat_row, tgt_row in zip(features, targets):
            per_sample_loss += criterion(model(feat_row), tgt_row)
        cast(torch.Tensor, per_sample_loss).backward()
        per_sample_grad_total = weight_grad_total()

        # Pass 2: one forward/backward over the full batch.
        optimizer.zero_grad(set_to_none=True)
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        batch_grad_total = weight_grad_total()

        # Gradient totals from both passes must agree.
        self.assertAlmostEqual(
            cast(float, np.linalg.norm(per_sample_grad_total - batch_grad_total)),
            0.0,
            places=4,
        )

        # ... and so must the losses themselves.
        self.assertAlmostEqual(
            cast(torch.Tensor, per_sample_loss).detach().cpu().numpy(),
            batch_loss.detach().cpu().numpy(),
            places=4,
        )
# Beispiel #14
# 0
def main(
    lsun_data_dir: ('Base directory for the LSUN data'),
    image_output_prefix: ('Prefix for image output', 'option', 'o') = 'glo',
    code_dim: ('Dimensionality of latent representation space', 'option', 'd',
               int) = 128,
    epochs: ('Number of epochs to train', 'option', 'e', int) = 25,
    use_cuda: ('Use GPU?', 'flag', 'gpu') = False,
    batch_size: ('Batch size', 'option', 'b', int) = 128,
    lr_g: ('Learning rate for generator', 'option', None, float) = 1.,
    lr_z: ('Learning rate for representation_space', 'option', None,
           float) = 10.,
    max_num_samples: ('Cap on the number of samples from the LSUN dataset',
                      'option', 'n', int) = -1,
    init: ('Initialization strategy for latent represetation vectors',
           'option', 'i', str, ['pca', 'random']) = 'pca',
    n_pca: ('Number of samples to take for PCA', 'option', None,
            int) = (64 * 64 * 3 * 2),
    loss: ('Loss type (Laplacian loss as in the paper, or L2 loss)', 'option',
           'l', str, ['lap_l1', 'l2']) = 'lap_l1',
):
    """Train a Generative Latent Optimization (GLO) model on LSUN bedrooms.

    Jointly optimizes the generator weights and one latent code per training
    image (codes are projected back onto the l2 ball after every step).
    Writes `target.png` plus one reconstruction grid per epoch to disk.

    The parameter annotations are plac-style CLI declarations and are part of
    the public interface; do not reorder or rename them.
    """
    def maybe_cuda(tensor):
        # Move to GPU only when requested; keeps the rest of the code
        # device-agnostic.
        return tensor.cuda() if use_cuda else tensor

    train_set = IndexedDataset(
        LSUN(lsun_data_dir,
             classes=['bedroom_train'],
             transform=transforms.Compose([
                 transforms.Resize(64),
                 transforms.CenterCrop(64),
                 transforms.ToTensor(),
                 # Scale images from [0, 1] to [-1, 1].
                 transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
             ])))
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=8,
        pin_memory=use_cuda,
    )
    # we don't really have a validation set here, but for visualization let us
    # just take the first couple images from the dataset
    val_loader = torch.utils.data.DataLoader(train_set,
                                             shuffle=False,
                                             batch_size=8 * 8)

    if max_num_samples > 0:
        # NOTE(review): assigning `indices = [max_num_samples]` (a one-element
        # list) alongside `length` looks inconsistent — presumably it should
        # be a range of indices; verify against IndexedDataset's contract.
        train_set.base.length = max_num_samples
        train_set.base.indices = [max_num_samples]

    # initialize representation space:
    if init == 'pca':
        from sklearn.decomposition import PCA

        # first, take a subset of train set to fit the PCA
        X_pca = np.vstack([
            X.cpu().numpy().reshape(len(X), -1) for i, (X, _, _) in zip(
                tqdm(range(n_pca // train_loader.batch_size),
                     'collect data for PCA'), train_loader)
        ])
        print("perform PCA...")
        pca = PCA(n_components=code_dim)
        pca.fit(X_pca)
        # then, initialize latent vectors to the pca projections of the complete dataset
        Z = np.empty((len(train_loader.dataset), code_dim))
        for X, _, idx in tqdm(train_loader, 'pca projection'):
            Z[idx] = pca.transform(X.cpu().numpy().reshape(len(X), -1))

    elif init == 'random':
        Z = np.random.randn(len(train_set), code_dim)

    # `init` is restricted to {'pca', 'random'} by the plac annotation, so Z
    # is always bound here.
    Z = project_l2_ball(Z)

    g = maybe_cuda(Generator(code_dim))  # initial a Generator g
    loss_fn = LapLoss(max_levels=3) if loss == 'lap_l1' else nn.MSELoss()
    zi = maybe_cuda(torch.zeros((batch_size, code_dim)))
    zi = Variable(zi, requires_grad=True)
    # One optimizer, two parameter groups: generator weights and the current
    # mini-batch of latent codes, each with its own learning rate.
    optimizer = SGD([{
        'params': g.parameters(),
        'lr': lr_g
    }, {
        'params': zi,
        'lr': lr_z
    }])

    Xi_val, _, idx_val = next(iter(val_loader))
    imsave(
        'target.png',
        make_grid(Xi_val.cpu() / 2. + 0.5, nrow=8).numpy().transpose(1, 2, 0))

    for epoch in range(epochs):
        losses = []
        progress = tqdm(total=len(train_loader), desc='epoch % 3d' % epoch)

        for i, (Xi, yi, idx) in enumerate(train_loader):
            Xi = Variable(maybe_cuda(Xi))
            # Load the latent codes of this batch into the shared `zi` buffer.
            zi.data = maybe_cuda(torch.FloatTensor(Z[idx.numpy()]))

            optimizer.zero_grad()
            rec = g(zi)
            loss = loss_fn(rec, Xi)
            loss.backward()
            optimizer.step()

            # Write the updated codes back, re-projected onto the l2 ball.
            Z[idx.numpy()] = project_l2_ball(zi.data.cpu().numpy())

            # Fix: `loss.data[0]` indexes a 0-dim tensor, which was removed in
            # PyTorch 0.4 (raises IndexError); `.item()` is the supported
            # scalar accessor.
            losses.append(loss.item())
            progress.set_postfix({'loss': np.mean(losses[-100:])})
            progress.update()

        progress.close()

        # visualize reconstructions
        rec = g(Variable(maybe_cuda(torch.FloatTensor(Z[idx_val.numpy()]))))
        imsave(
            '%s_rec_epoch_%03d.png' % (image_output_prefix, epoch),
            make_grid(rec.data.cpu() / 2. + 0.5,
                      nrow=8).numpy().transpose(1, 2, 0))
class AlexNet(nn.Module):
    """AlexNet (Krizhevsky, Sutskever & Hinton, 2012) image classifier.

    Layer names follow the paper's numbering: C1..C5 convolutions, RN local
    response norms, P max-pools, F6..F8 fully-connected layers. The quoted
    string literals throughout reproduce the relevant passages of the paper.
    Call `initialize()` once (criterion/optimizer/scheduler/weight init)
    before calling `train()`.
    """
    def __init__(self, num_classes, verbose=False):
        """Build all layers.

        Args:
            num_classes: size of the final linear layer F8.
            verbose: if True, `train()` prints running-loss updates.
        """
        super(AlexNet, self).__init__()
        self.verbose = verbose
        self.num_classes = num_classes
        # NOTE(review): this shadows nn.Module's own `training` flag (which
        # defaults to True) — confirm forcing it to False at construction is
        # intended.
        self.training = False

        self.convolution = None
        self.classifier = None

        # Filled in by initialize(); left as None until then.
        self.criterion = None
        self.optimizer = None
        self.scheduler = None
        """
        The first convolutional layer filters the 224x224x3 input image 
        with 96 kernels of size 11x11x3 with a stride of 4 pixels
        (Note: the actual image size is 227x227x3)
        """
        # Since the 2nd layer has an input of 55x55x48
        # (n_h - k_h + 2*p_h)/s_h + 1 = (227 - 11 + 0)/4 + 1 = 55
        self.C1 = nn.Conv2d(in_channels=3,
                            out_channels=96,
                            kernel_size=(11, 11),
                            stride=4)
        """
        The second convolutional layer takes as input the (response-normalized and pooled) 
        output of the first layer and filters it with 256 kernels of size 5x5x48.
        """
        # We used k = 2, n = 5, alpha = 10^-4, and beta = 0.75.
        # We applied this normalization after applying the ReLU non-linearity in certain layers
        self.RN2 = nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2)
        # (n_h - k_h + 2*p_h)/s_h + 1 = (55 - 3 + 0)/2 + 1 = 27
        self.P2 = nn.MaxPool2d(kernel_size=(3, 3), stride=2)
        # (n_h - k_h + 2*p_h)/s_h + 1  = (27 - 5 + 2*2)/1 + 1 = 27
        self.C2 = nn.Conv2d(in_channels=96,
                            out_channels=256,
                            kernel_size=(5, 5),
                            padding=(2, 2))
        """
        The third convolutional layer has 384 kernels of size 3x3x256 connected to 
        the (normalized, pooled) outputs of the second convolutional layer. 
        """
        self.RN3 = nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2)
        # (n_h - k_h + 2*p_h)/s_h + 1 = (27 - 3 + 0)/2 + 1 = 13
        self.P3 = nn.MaxPool2d(kernel_size=(3, 3), stride=2)
        # (n_h - k_h + 2*p_h)/s_h + 1 = (13 - 3 + 2*1)/1 + 1 = 13
        self.C3 = nn.Conv2d(in_channels=256,
                            out_channels=384,
                            kernel_size=(3, 3),
                            padding=(1, 1))
        """
        The fourth convolutional layer has 384 kernels of size 3x3x192
        """
        # (n_h - k_h + 2*p_h)/s_h + 1 = (13 - 3 + 2*1)/1 + 1 = 13
        self.C4 = nn.Conv2d(in_channels=384,
                            out_channels=384,
                            kernel_size=(3, 3),
                            padding=(1, 1))
        """
        The fifth convolutional layer has 256 kernels of size 3x3x192
        """
        # (n_h - k_h + 2*p_h)/s_h + 1 = (13 - 3 + 2*1)/1 + 1 = 13
        self.C5 = nn.Conv2d(in_channels=384,
                            out_channels=256,
                            kernel_size=(3, 3),
                            padding=(1, 1))
        # (n_h - k_h + 2*p_h)/s_h + 1 = (13 - 3 + 0)/2 + 1 = 6
        self.P5 = nn.MaxPool2d(kernel_size=(3, 3), stride=2)
        """
        The fully-connected layers have 4096 neurons each
        """
        self.F6 = nn.Linear(in_features=(256 * 6 * 6), out_features=4096)
        self.F7 = nn.Linear(in_features=4096, out_features=4096)
        self.F8 = nn.Linear(in_features=4096, out_features=self.num_classes)

        # NOTE(review): ReLU appears only after C2 and C3 here, although the
        # quoted paper text says ReLU follows *every* convolutional layer
        # (C1, C4, C5 have none) — confirm this ordering is intentional.
        self.convolution = nn.Sequential(self.C1, self.C2,
                                         nn.ReLU(inplace=True),
                                         self.RN2, self.P2, self.C3,
                                         nn.ReLU(inplace=True), self.RN3,
                                         self.P3, self.C4, self.C5, self.P5)
        """
        The ReLU non-linearity is applied to the output of every convolutional
            and fully-connected layer.
        Dropout is used in the first two fully-connected layers, consisting of 
            setting to zero the output of each hidden neuron with probability 0.5.
        """
        # No activation after F8: it emits raw logits for CrossEntropyLoss.
        self.classifier = nn.Sequential(nn.Dropout(p=0.5), self.F6,
                                        nn.ReLU(inplace=True),
                                        nn.Dropout(p=0.5), self.F7,
                                        nn.ReLU(inplace=True), self.F8)

    def initialize(self,
                   criterion=None,
                   optimizer=None,
                   scheduler=None,
                   learning_rate=0.01) -> None:
        """Set up loss/optimizer/scheduler and apply the paper's weight init.

        Any of `criterion`, `optimizer`, `scheduler` may be passed in to
        override the paper defaults (CrossEntropyLoss; SGD with momentum 0.9
        and weight decay 0.0005; ReduceLROnPlateau with factor 0.1).
        """
        if criterion is None:
            self.criterion = nn.CrossEntropyLoss()
        else:
            self.criterion = criterion
        """
        We trained our models using stochastic gradient descent with a batch size of 128 examples,
            momentum of 0.9, and weight decay of 0.0005.
        We used an equal learning rate for all layers, which we adjusted manually throughout training.
        The heuristic which we followed was to divide the learning rate by 10 when the validation 
            error rate stopped improving with the current learning rate. 
        The learning rate was initialized at 0.01 and reduced three times prior to termination.
        """
        if optimizer is None:
            self.optimizer = SGD(self.parameters(),
                                 lr=learning_rate,
                                 momentum=0.9,
                                 weight_decay=0.0005)
        else:
            self.optimizer = optimizer

        if scheduler is None:
            self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer=self.optimizer,
                mode='min',
                factor=0.1,
                patience=5,
                threshold=0.002)
        else:
            self.scheduler = scheduler
        """
        We initialized the weights in each layer from a zero-mean Gaussian distribution 
            with standard deviation 0.01.
        We initialized the neuron biases in the second, fourth, and fifth convolutional layers, 
            as well as in the fully-connected hidden layers, with the constant 1.
        We initialized the neuron biases in the remaining layers with the constant 0.
        """
        # Sequential children are keyed by position: '0' is C1, '2' (after
        # C2 at '1'? see NOTE below) — the names checked here are positional
        # strings, so bias-0 applies to the children registered at '0'/'2'.
        # NOTE(review): per the paper, only C1 and C3 get bias 0 — confirm
        # the positional names '0' and '2' actually map to those layers in
        # the Sequential above ('2' is a ReLU there).
        for name, module in self.convolution.named_children():
            if type(module) == nn.Conv2d:
                nn.init.normal_(tensor=module.weight, mean=0, std=0.01)
                if name in ['0', '2']:
                    nn.init.constant_(tensor=module.bias, val=0)
                else:
                    nn.init.constant_(tensor=module.bias, val=1)

        for name, module in self.classifier.named_children():
            if type(module) == nn.Linear:
                nn.init.normal_(tensor=module.weight, mean=0, std=0.01)
                nn.init.constant_(tensor=module.bias, val=1)

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        """Run the conv stack, flatten to (N, 256*6*6), and classify."""
        X = self.convolution(X)
        X = X.view(-1, 256 * 6 * 6)
        return self.classifier(X)

    def train(self, mode=True, data=None, epochs=10) -> 'AlexNet':
        """Train on `data` (a DataLoader) for `epochs` epochs.

        NOTE(review): this overrides nn.Module.train() with an incompatible
        contract — plain `model.train()` (the standard mode switch) will
        raise FileNotFoundError here because `data` is None. Confirm no
        caller relies on the nn.Module semantics.
        """
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.to(device)

        if data is None:
            # NOTE(review): FileNotFoundError is an odd type for a bad
            # argument (ValueError/TypeError would be conventional).
            raise FileNotFoundError(
                "\"data\" has to be a valid Dataloader object!")

        self.training = mode
        # Propagate training mode to children via the standard
        # nn.Module.train(mode) on each submodule.
        for module in self.convolution:
            module.train(mode)
        for module in self.classifier:
            module.train(mode)

        running_loss = 0.0
        for epoch in range(0, epochs):
            for i, datum in enumerate(data, 0):
                features, labels = datum[0].to(device), datum[1].to(device)
                loss = self.criterion(self(features), labels)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                running_loss += loss.item()
                # Report / step the scheduler roughly 5 times per epoch.
                batch_split = int(len(data.dataset) / data.batch_size / 5)
                batch_split = 1 if batch_split < 1 else batch_split
                if i % batch_split == batch_split - 1:
                    if self.verbose:
                        print(
                            f"[epoch {epoch + 1}, batch {i + 1}] loss: {running_loss / batch_split}"
                        )
                    self.scheduler.step(running_loss / batch_split)
                    running_loss = 0.0

        if self.verbose:
            print('Finished Training')
        return self
# Example #16 (score: 0)
def train():
    """Train a Faster R-CNN detector on Pascal VOC.

    All hyper-parameters come from the command line (`parse_args`) and the
    global `cfg`. Saves a checkpoint every `args.save_interval` epochs and
    optionally logs losses/accuracies to tensorboardX.
    """
    args = parse_args()
    args.decay_lrs = cfg.TRAIN.DECAY_LRS

    cfg.USE_GPU_NMS = True if args.use_cuda else False

    # The pipeline below assumes exactly one image per batch.
    assert args.batch_size == 1, 'Only support single batch'

    lr = cfg.TRAIN.LEARNING_RATE
    momentum = cfg.TRAIN.MOMENTUM
    weight_decay = cfg.TRAIN.WEIGHT_DECAY
    gamma = cfg.TRAIN.GAMMA

    # initial tensorboardX writer
    # `writer` is only bound when tfboard is enabled; every later use is
    # guarded by the same flag.
    if args.use_tfboard:
        if args.exp_name == 'default':
            writer = SummaryWriter()
        else:
            writer = SummaryWriter('runs/' + args.exp_name)

    if args.dataset == 'voc07trainval':
        args.imdb_name = 'voc_2007_trainval'
        args.imdbval_name = 'voc_2007_test'

    elif args.dataset == 'voc0712trainval':
        args.imdb_name = 'voc_2007_trainval+voc_2012_trainval'
        args.imdbval_name = 'voc_2007_test'
    else:
        raise NotImplementedError

    if args.net == 'res50':
        fname = 'resnet50-caffe.pth'
    elif args.net == 'res101':
        fname = 'resnet101-caffe.pth'
    else:
        raise NotImplementedError

    args.pretrained_model = os.path.join('data', 'pretrained', fname)

    output_dir = args.output_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # dataset_cachefile = os.path.join(output_dir, 'dataset.pickle')
    # if not os.path.exists(dataset_cachefile):
    #     imdb, roidb = combined_roidb(args.imdb_name)
    #     cache = [imdb, roidb]
    #     with open(dataset_cachefile, 'wb') as f:
    #         pickle.dump(cache, f)
    #     print('save dataset cache')
    # else:
    #     with open(dataset_cachefile, 'rb') as f:
    #         cache = pickle.load(f)
    #         imdb, roidb = cache[0], cache[1]
    #         print('loaded dataset from cache')

    imdb, roidb = combined_roidb(args.imdb_name)

    train_dataset = RoiDataset(roidb)
    train_dataloader = DataLoader(train_dataset, args.batch_size, shuffle=True)

    model = FasterRCNN(backbone=args.net, pretrained=args.pretrained_model)
    print('model loaded')

    # if cfg.PRETRAINED_RPN:
    #     rpn_model_path = 'output/rpn.pth'
    #     model.load_state_dict(torch.load(rpn_model_path)['model'])
    #     print('loaded rpn!')

    # optimizer
    # Per-parameter groups: bias terms optionally get a doubled learning rate
    # (cfg.TRAIN.DOUBLE_BIAS) and take weight decay only when
    # cfg.TRAIN.BIAS_DECAY is truthy (the `and/or` is an old-style
    # conditional expression).
    params = []
    for key, value in dict(model.named_parameters()).items():
        if value.requires_grad:
            if 'bias' in key:
                params += [{'params': [value], 'lr': lr * (cfg.TRAIN.DOUBLE_BIAS + 1), \
                            'weight_decay': cfg.TRAIN.BIAS_DECAY and weight_decay or 0}]
            else:
                params += [{
                    'params': [value],
                    'lr': lr,
                    'weight_decay': weight_decay
                }]

    optimizer = SGD(params, momentum=momentum)

    if args.use_cuda:
        model = model.cuda()

    model.train()

    iters_per_epoch = int(len(train_dataset) / args.batch_size)

    # start training
    for epoch in range(args.start_epoch, args.max_epochs + 1):
        # Running accumulators, reset every display interval.
        loss_temp = 0
        rpn_tp, rpn_tn, rpn_fg, rpn_bg = 0, 0, 0, 0
        rcnn_tp, rcnn_tn, rcnn_fg, rcnn_bg = 0, 0, 0, 0
        tic = time.time()
        train_data_iter = iter(train_dataloader)

        # Step-decay the learning rate at the configured epochs.
        if epoch in args.decay_lrs:
            lr = lr * gamma
            adjust_learning_rate(optimizer, lr)
            print('adjust learning rate to {}'.format(lr))

        for step in range(iters_per_epoch):
            im_data, gt_boxes, im_info = next(train_data_iter)
            if args.use_cuda:
                im_data = im_data.cuda()
                gt_boxes = gt_boxes.cuda()
                im_info = im_info.cuda()

            im_data_variable = Variable(im_data)

            output = model(im_data_variable, gt_boxes, im_info)
            rois, _, _, \
            rcnn_cls_loss, rcnn_box_loss, \
            rpn_cls_loss, rpn_box_loss, _train_info = output

            # Total loss = RPN (cls + box) + RCNN head (cls + box).
            loss = rcnn_cls_loss.mean() + rcnn_box_loss.mean() +\
                   rpn_cls_loss.mean() + rpn_box_loss.mean()

            optimizer.zero_grad()

            loss.backward()
            optimizer.step()

            loss_temp += loss.item()

            if cfg.VERBOSE:
                rpn_tp += _train_info['rpn_tp']
                rpn_tn += _train_info['rpn_tn']
                rpn_fg += _train_info['rpn_num_fg']
                rpn_bg += _train_info['rpn_num_bg']
                rcnn_tp += _train_info['rcnn_tp']
                rcnn_tn += _train_info['rcnn_tn']
                rcnn_fg += _train_info['rcnn_num_fg']
                rcnn_bg += _train_info['rcnn_num_bg']

            if (step + 1) % args.display_interval == 0:
                toc = time.time()
                loss_temp /= args.display_interval
                rpn_cls_loss_v = rpn_cls_loss.mean().item()
                rpn_box_loss_v = rpn_box_loss.mean().item()
                rcnn_cls_loss_v = rcnn_cls_loss.mean().item()
                rcnn_box_loss_v = rcnn_box_loss.mean().item()

                print("[epoch %2d][step %4d/%4d] loss: %.4f, lr: %.2e, time cost %.1fs" \
                      % (epoch, step+1, iters_per_epoch, loss_temp, lr, toc - tic))
                print("\t\t\t rpn_cls_loss_v: %.4f, rpn_box_loss_v: %.4f\n\t\t\t "
                      "rcnn_cls_loss_v: %.4f, rcnn_box_loss_v: %.4f" \
                      % (rpn_cls_loss_v, rpn_box_loss_v, rcnn_cls_loss_v, rcnn_box_loss_v))
                if cfg.VERBOSE:
                    # NOTE(review): divides by rpn_fg/rpn_bg/rcnn_fg/rcnn_bg —
                    # raises ZeroDivisionError if no fg/bg samples accumulated
                    # in this interval; confirm that cannot happen in practice.
                    print('\t\t\t RPN : [FG/BG] [%d/%d], FG: %.4f, BG: %.4f' %
                          (rpn_fg, rpn_bg, float(rpn_tp) / rpn_fg,
                           float(rpn_tn) / rpn_bg))
                    print('\t\t\t RCNN: [FG/BG] [%d/%d], FG: %.4f, BG: %.4f' %
                          (rcnn_fg, rcnn_bg, float(rcnn_tp) / rcnn_fg,
                           float(rcnn_tn) / rcnn_bg))

                if args.use_tfboard:
                    n_iter = (epoch - 1) * iters_per_epoch + step + 1
                    writer.add_scalar('losses/loss', loss_temp, n_iter)
                    writer.add_scalar('losses/rpn_cls_loss_v', rpn_cls_loss_v,
                                      n_iter)
                    writer.add_scalar('losses/rpn_box_loss_v', rpn_box_loss_v,
                                      n_iter)
                    writer.add_scalar('losses/rcnn_cls_loss_v',
                                      rcnn_cls_loss_v, n_iter)
                    writer.add_scalar('losses/rcnn_box_loss_v',
                                      rcnn_box_loss_v, n_iter)

                    if cfg.VERBOSE:
                        writer.add_scalar('rpn/fg_acc',
                                          float(rpn_tp) / rpn_fg, n_iter)
                        writer.add_scalar('rpn/bg_acc',
                                          float(rpn_tn) / rpn_bg, n_iter)
                        writer.add_scalar('rcnn/fg_acc',
                                          float(rcnn_tp) / rcnn_fg, n_iter)
                        writer.add_scalar('rcnn/bg_acc',
                                          float(rcnn_tn) / rcnn_bg, n_iter)

                # Reset interval accumulators and timer.
                loss_temp = 0
                rpn_tp, rpn_tn, rpn_fg, rpn_bg = 0, 0, 0, 0
                rcnn_tp, rcnn_tn, rcnn_fg, rcnn_bg = 0, 0, 0, 0
                tic = time.time()

        if epoch % args.save_interval == 0:
            save_name = os.path.join(
                output_dir, 'faster_{}_epoch_{}.pth'.format(args.net, epoch))
            torch.save({
                'model': model.state_dict(),
                'epoch': epoch,
                'lr': lr
            }, save_name)
# Example #17 (score: 0)
class TD3Agent(AgentType):
    """
    Twin Delayed Deep Deterministic (TD3) Policy Gradient.

    Instead of popular Ornstein-Uhlenbeck (OU) process for noise this agent uses Gaussian noise.
    """

    name = "TD3"

    def __init__(self,
                 state_size: int,
                 action_size: int,
                 hidden_layers: Sequence[int] = (128, 128),
                 actor_lr: float = 1e-3,
                 critic_lr: float = 1e-3,
                 noise_scale: float = 0.2,
                 noise_sigma: float = 0.1,
                 clip: Tuple[int, int] = (-1, 1),
                 config=None,
                 device=None,
                 **kwargs):
        """Create actor/critic networks, their targets, and the replay buffer.

        Args:
            state_size: dimensionality of the observation vector.
            action_size: dimensionality of the action vector.
            hidden_layers: hidden-layer sizes for all four networks; may be
                overridden by ``config['hidden_layers']``.
            actor_lr: SGD learning rate for the actor.
            critic_lr: SGD learning rate for the (double) critic.
            noise_scale: scale of the Gaussian exploration noise.
            noise_sigma: std-dev of the Gaussian exploration noise.
            clip: (min, max) bounds applied to actions.
            config: optional dict of hyper-parameter overrides.
            device: torch device; defaults to module-level DEVICE.
        """
        config = config if config is not None else dict()
        self.device = device if device is not None else DEVICE

        # Reason sequence initiation.
        # Fix: build all four networks from the resolved `self.hidden_layers`.
        # Previously the config override was read into `self.hidden_layers`
        # but the networks were constructed from the raw keyword argument,
        # silently ignoring `config['hidden_layers']`.
        self.hidden_layers = config.get('hidden_layers', hidden_layers)
        self.actor = ActorBody(
            state_size, action_size,
            hidden_layers=self.hidden_layers).to(self.device)
        self.critic = DoubleCritic(
            state_size, action_size,
            hidden_layers=self.hidden_layers).to(self.device)
        self.target_actor = ActorBody(
            state_size, action_size,
            hidden_layers=self.hidden_layers).to(self.device)
        self.target_critic = DoubleCritic(
            state_size, action_size,
            hidden_layers=self.hidden_layers).to(self.device)

        # Noise sequence initiation
        # NOTE(review): mu=1e-8 (not 0) — presumably to avoid an exactly-zero
        # mean; confirm against GaussianNoise's contract.
        self.noise = GaussianNoise(shape=(action_size, ),
                                   mu=1e-8,
                                   sigma=noise_sigma,
                                   scale=noise_scale,
                                   device=device)

        # Target sequence initiation: targets start as exact copies.
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Optimization sequence initiation.
        self.actor_optimizer = SGD(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = SGD(self.critic.parameters(), lr=critic_lr)
        self.action_min = clip[0]
        self.action_max = clip[1]
        self.action_scale = config.get('action_scale', 1)

        self.gamma: float = float(config.get('gamma', 0.99))
        self.tau: float = float(config.get('tau', 0.02))
        self.batch_size: int = int(config.get('batch_size', 64))
        self.buffer_size: int = int(config.get('buffer_size', int(1e6)))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.warm_up: int = int(config.get('warm_up', 0))
        self.update_freq: int = int(config.get('update_freq', 1))
        self.update_policy_freq: int = int(config.get('update_policy_freq', 1))
        self.number_updates: int = int(config.get('number_updates', 1))

        # Breath, my child.
        self.reset_agent()
        self.iteration = 0

    def reset_agent(self) -> None:
        """Re-initialize the parameters of all four networks."""
        self.actor.reset_parameters()
        self.critic.reset_parameters()
        self.target_actor.reset_parameters()
        self.target_critic.reset_parameters()

    def act(self, obs, noise: float = 0.0):
        """Return a clipped, scaled action for `obs`, with optional noise."""
        with torch.no_grad():
            obs = torch.tensor(obs.astype(np.float32)).to(self.device)
            action = self.actor(obs)
            action += noise * self.noise.sample()
            return self.action_scale * torch.clamp(
                action, self.action_min, self.action_max).cpu().numpy().astype(
                    np.float32)

    def target_act(self, obs, noise: float = 0.0):
        """Like act(), but from the target actor and without action_scale."""
        with torch.no_grad():
            obs = torch.tensor(obs).to(self.device)
            action = self.target_actor(obs) + noise * self.noise.sample()
            return torch.clamp(action, self.action_min,
                               self.action_max).cpu().numpy().astype(
                                   np.float32)

    def step(self, state, action, reward, next_state, done):
        """Record one transition and learn when warm-up/frequency allow."""
        self.iteration += 1
        self.buffer.add(state=state,
                        action=action,
                        reward=reward,
                        next_state=next_state,
                        done=done)

        if self.iteration < self.warm_up:
            return

        if len(self.buffer) > self.batch_size and (self.iteration %
                                                   self.update_freq) == 0:
            for _ in range(self.number_updates):
                # Note: Inside this there's a delayed policy update.
                #       Every `update_policy_freq` it will learn `number_updates` times.
                self.learn(self.buffer.sample_sars())

    def learn(self, samples):
        """update the critics and actors of all the agents """

        states, actions, rewards, next_states, dones = samples
        rewards = rewards.to(self.device)
        dones = dones.type(torch.int).to(self.device)
        states = states.to(self.device)
        next_states = next_states.to(self.device)
        actions = actions.to(self.device)

        self._update_value_function(states, actions, rewards, next_states,
                                    dones)

        # Delayed (less frequent) policy update, per the TD3 algorithm.
        if (self.iteration % self.update_policy_freq) == 0:
            self._update_policy(states)

            soft_update(self.target_actor, self.actor, self.tau)
            soft_update(self.target_critic, self.critic, self.tau)

    def _update_value_function(self, states, actions, rewards, next_states,
                               dones):
        # critic loss: bootstrap from the *minimum* of the twin target
        # critics (clipped double-Q) to curb overestimation.
        next_actions = self.target_actor.act(next_states)
        Q_target_next = torch.min(
            *self.target_critic.act(next_states, next_actions))
        Q_target = rewards + (self.gamma * Q_target_next * (1 - dones))
        Q1_expected, Q2_expected = self.critic(states, actions)
        critic_loss = mse_loss(Q1_expected, Q_target) + mse_loss(
            Q2_expected, Q_target)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.gradient_clip)
        self.critic_optimizer.step()
        self.critic_loss = critic_loss.item()

    def _update_policy(self, states):
        # Compute actor loss: ascend the first critic's value of the actor's
        # own actions (hence the negation).
        pred_actions = self.actor(states)
        actor_loss = -self.critic(states, pred_actions)[0].mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.actor_loss = actor_loss.item()

    def describe_agent(self) -> Tuple[Any, Any, Any, Any]:
        """
        Returns network's weights in order:
        Actor, TargetActor, Critic, TargetCritic
        """
        # Fix: return the target critic's state_dict. Previously this called
        # `self.target_critic()` — a forward pass with no arguments — which
        # raised at runtime and contradicted the documented contract.
        return (self.actor.state_dict(), self.target_actor.state_dict(),
                self.critic.state_dict(), self.target_critic.state_dict())

    def log_writer(self, writer, episode):
        """Log the most recent actor/critic losses to a summary writer."""
        writer.add_scalar("loss/actor", self.actor_loss, episode)
        writer.add_scalar("loss/critic", self.critic_loss, episode)

    def save_state(self, path: str):
        """Serialize all four networks' weights to `path`."""
        agent_state = dict(
            actor=self.actor.state_dict(),
            target_actor=self.target_actor.state_dict(),
            critic=self.critic.state_dict(),
            target_critic=self.target_critic.state_dict(),
        )
        torch.save(agent_state, path)

    def load_state(self, path: str):
        """Restore all four networks' weights from `path`."""
        agent_state = torch.load(path)
        self.actor.load_state_dict(agent_state['actor'])
        self.critic.load_state_dict(agent_state['critic'])
        self.target_actor.load_state_dict(agent_state['target_actor'])
        self.target_critic.load_state_dict(agent_state['target_critic'])
        allcd = (neg_dist - pos_dist < margin).cpu().numpy().flatten()
        hard_triplets = np.where(allcd == 1)
        anc_hard_embedding = anc_embedding[hard_triplets].cuda()
        pos_hard_embedding = pos_embedding[hard_triplets].cuda()
        neg_hard_embedding = neg_embedding[hard_triplets].cuda()

        triplet_loss = TripletLoss(margin=margin).forward(
            anchor=anc_hard_embedding,
            positive=pos_hard_embedding,
            negative=neg_hard_embedding).cuda()

        triplet_loss_sum += triplet_loss.item()
        num_valid_training_triplets += len(anc_hard_embedding)

        optimizer_model.zero_grad()
        triplet_loss.backward()
        optimizer_model.step()

    avg_triplet_loss = 0 if (
        num_valid_training_triplets
        == 0) else triplet_loss_sum / num_valid_training_triplets

    print(
        'Epoch {}:\tAverage Triplet Loss: {:.4f}\tNumber of valid training triplets in epoch: {}'
        .format(epoch + 1, avg_triplet_loss, num_valid_training_triplets))

torch.save(
    {
        'epoch': epoch,
        'model_state_dict': net.state_dict(),
# Example #19 (score: 0)
def train(cont=False):
    """Train the configured segmentation network on the Cityscapes data.

    The architecture is picked via the module-level ``net`` string
    (SETR-PUP / SETR-MLA / TransUNet-Base / TransUNet-Large / UNet).
    Training runs SGD with a per-iteration polynomial LR adjustment plus a
    MultiStepLR drop at ``fine_tune_ratio`` of the epochs, validates after
    every epoch, checkpoints whenever the validation loss improves, and
    early-stops after ``early_stop_tolerance`` epochs without improvement.

    BUGFIX: in the validation loop, the per-batch status print, TensorBoard
    logging and ``valid_batch_bar.update(1)`` were indented outside the batch
    loop (unlike the identical code in the training loop), so they executed
    only once per epoch. They are now inside the loop, matching training.

    :param cont: if True, resume model/optimizer/epoch/best-loss from
        ``best_ckpt_src`` instead of starting from scratch.
    """

    # for tensorboard tracking
    logger = get_logger()
    logger.info("(1) Initiating Training ... ")
    logger.info("Training on device: {}".format(device))
    writer = SummaryWriter()

    # init model; SETR variants also return auxiliary heads
    aux_layers = None
    if net == "SETR-PUP":
        aux_layers, model = get_SETR_PUP()
    elif net == "SETR-MLA":
        aux_layers, model = get_SETR_MLA()
    elif net == "TransUNet-Base":
        model = get_TransUNet_base()
    elif net == "TransUNet-Large":
        model = get_TransUNet_large()
    elif net == "UNet":
        model = UNet(CLASS_NUM)

    # prepare dataset
    cluster_model = get_clustering_model(logger)
    train_dataset = CityscapeDataset(img_dir=data_dir,
                                     img_dim=IMG_DIM,
                                     mode="train",
                                     cluster_model=cluster_model)
    valid_dataset = CityscapeDataset(img_dir=data_dir,
                                     img_dim=IMG_DIM,
                                     mode="val",
                                     cluster_model=cluster_model)
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=batch_size,
                              shuffle=False)

    logger.info("(2) Dataset Initiated. ")

    # optimizer; when no explicit epoch count is given, derive it from the
    # iteration budget
    epochs = epoch_num if epoch_num > 0 else iteration_num // len(
        train_loader) + 1
    optim = SGD(model.parameters(),
                lr=lrate,
                momentum=momentum,
                weight_decay=wdecay)
    # optim = Adam(model.parameters(), lr=lrate)
    scheduler = lr_scheduler.MultiStepLR(
        optim, milestones=[int(epochs * fine_tune_ratio)], gamma=0.1)

    cur_epoch = 0
    best_loss = float('inf')
    epochs_since_improvement = 0

    # for continue training: restore state and fast-forward the scheduler
    if cont:
        model, optim, cur_epoch, best_loss = load_ckpt_continue_training(
            best_ckpt_src, model, optim, logger)
        logger.info("Current best loss: {0}".format(best_loss))
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            for i in range(cur_epoch):
                scheduler.step()
    else:
        model = nn.DataParallel(model)
        model = model.to(device)

    logger.info("(3) Model Initiated ... ")
    logger.info("Training model: {}".format(net) + ". Training Started.")

    # loss
    ce_loss = CrossEntropyLoss()
    if use_dice_loss:
        dice_loss = DiceLoss(CLASS_NUM)

    # loop over epochs
    iter_count = 0
    epoch_bar = tqdm.tqdm(total=epochs,
                          desc="Epoch",
                          position=cur_epoch,
                          leave=True)
    logger.info("Total epochs: {0}. Starting from epoch {1}.".format(
        epochs, cur_epoch + 1))

    for e in range(epochs - cur_epoch):
        epoch = e + cur_epoch

        # Training.
        model.train()
        trainLossMeter = LossMeter()
        train_batch_bar = tqdm.tqdm(total=len(train_loader),
                                    desc="TrainBatch",
                                    position=0,
                                    leave=True)

        for batch_num, (orig_img, mask_img) in enumerate(train_loader):
            orig_img, mask_img = orig_img.float().to(
                device), mask_img.float().to(device)

            if net == "TransUNet-Base" or net == "TransUNet-Large":
                pred = model(orig_img)
            elif net == "SETR-PUP" or net == "SETR-MLA":
                if aux_layers is not None:
                    pred, _ = model(orig_img)
                else:
                    pred = model(orig_img)
            elif net == "UNet":
                pred = model(orig_img)

            loss_ce = ce_loss(pred, mask_img[:].long())
            if use_dice_loss:
                # equal-weight blend of cross-entropy and Dice
                loss_dice = dice_loss(pred, mask_img, softmax=True)
                loss = 0.5 * (loss_ce + loss_dice)
            else:
                loss = loss_ce

            # Backward Propagation, Update weight and metrics
            optim.zero_grad()
            loss.backward()
            optim.step()

            # update learning rate
            # NOTE(review): this multiplies the *current* lr by the poly
            # factor every iteration (compounding decay) rather than scaling
            # the base lr; preserved as-is since training behavior depends
            # on it — confirm intent before changing.
            for param_group in optim.param_groups:
                orig_lr = param_group['lr']
                param_group['lr'] = orig_lr * (1.0 -
                                               iter_count / iteration_num)**0.9
            iter_count += 1

            # Update loss
            trainLossMeter.update(loss.item())

            # print status
            if (batch_num + 1) % print_freq == 0:
                status = 'Epoch: [{0}][{1}/{2}]\t' \
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(epoch+1, batch_num+1, len(train_loader), loss=trainLossMeter)
                logger.info(status)

            # log loss to tensorboard
            if (batch_num + 1) % tensorboard_freq == 0:
                writer.add_scalar(
                    'Train_Loss_{0}'.format(tensorboard_freq),
                    trainLossMeter.avg,
                    epoch * (len(train_loader) / tensorboard_freq) +
                    (batch_num + 1) / tensorboard_freq)
            train_batch_bar.update(1)

        writer.add_scalar('Train_Loss_epoch', trainLossMeter.avg, epoch)

        # Validation.
        model.eval()
        validLossMeter = LossMeter()
        valid_batch_bar = tqdm.tqdm(total=len(valid_loader),
                                    desc="ValidBatch",
                                    position=0,
                                    leave=True)
        with torch.no_grad():
            for batch_num, (orig_img, mask_img) in enumerate(valid_loader):
                orig_img, mask_img = orig_img.float().to(
                    device), mask_img.float().to(device)

                if net == "TransUNet-Base" or net == "TransUNet-Large":
                    pred = model(orig_img)
                elif net == "SETR-PUP" or net == "SETR-MLA":
                    if aux_layers is not None:
                        pred, _ = model(orig_img)
                    else:
                        pred = model(orig_img)
                elif net == "UNet":
                    pred = model(orig_img)

                loss_ce = ce_loss(pred, mask_img[:].long())
                if use_dice_loss:
                    loss_dice = dice_loss(pred, mask_img, softmax=True)
                    loss = 0.5 * (loss_ce + loss_dice)
                else:
                    loss = loss_ce

                # Update loss
                validLossMeter.update(loss.item())

                # print status (BUGFIX: moved inside the batch loop so it
                # runs per batch, mirroring the training loop)
                if (batch_num + 1) % print_freq == 0:
                    status = 'Validation: [{0}][{1}/{2}]\t' \
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(epoch+1, batch_num+1, len(valid_loader), loss=validLossMeter)
                    logger.info(status)

                # log loss to tensorboard (BUGFIX: moved inside the batch loop)
                if (batch_num + 1) % tensorboard_freq == 0:
                    writer.add_scalar(
                        'Valid_Loss_{0}'.format(tensorboard_freq),
                        validLossMeter.avg,
                        epoch * (len(valid_loader) / tensorboard_freq) +
                        (batch_num + 1) / tensorboard_freq)
                # BUGFIX: progress bar now advances once per validation batch
                valid_batch_bar.update(1)

        valid_loss = validLossMeter.avg
        writer.add_scalar('Valid_Loss_epoch', valid_loss, epoch)
        logger.info("Validation Loss of epoch [{0}/{1}]: {2}\n".format(
            epoch + 1, epochs, valid_loss))

        # update optim scheduler
        scheduler.step()

        # save checkpoint on improvement; otherwise count toward early stop
        is_best = valid_loss < best_loss
        best_loss_tmp = min(valid_loss, best_loss)
        if not is_best:
            epochs_since_improvement += 1
            logger.info("Epochs since last improvement: %d\n" %
                        (epochs_since_improvement, ))
            if epochs_since_improvement == early_stop_tolerance:
                break  # early stopping.
        else:
            epochs_since_improvement = 0
            state = {
                'epoch': epoch,
                'loss': best_loss_tmp,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optim.state_dict(),
            }
            torch.save(state, ckpt_src)
            logger.info("Checkpoint updated.")
            best_loss = best_loss_tmp
        epoch_bar.update(1)
    writer.close()
Beispiel #20
0
def main():
    """Learn, apply and save a sparsity mask for a pretrained classifier.

    For each of ``args.round`` rounds:
      1. build ImageNet-style train/val loaders and load a pretrained
         ``args.arch`` model as the baseline,
      2. attach a trainable ``weight_mask`` to every Linear/Conv2d layer of a
         deep copy and monkey-patch their ``forward`` so the effective weight
         is ``weight * weight_mask``,
      3. optimize only the masks with cross-entropy plus an L1 penalty
         (scaled by ``args.alpha``) until the count of mask entries above
         1e-3 falls to ``args.keep_ratio`` of the total weight count,
      4. binarize the masks at the top-k threshold, fold them into the
         weights, copy the result back into the baseline model, and save the
         finetuned model and masks under ``args.outdir``.

    Relies on module-level ``args`` and helpers ``model_inference``, ``log``,
    ``init_logfile``, ``mask_forward_linear`` and ``mask_forward_conv2d``.
    """
    if not os.path.exists(args.outdir):
        os.mkdir(args.outdir)

    # pin all CUDA work to the requested GPU
    device = torch.device("cuda")
    torch.cuda.set_device(args.gpu)

    logfilename = os.path.join(args.outdir, args.logname)

    init_logfile(logfilename, "epoch\ttime\tlr\ttrain loss\ttrain acc\ttestloss\ttest acc")
    log(logfilename, "Hyperparameter List")
    log(logfilename, "Epochs: {:}".format(args.epochs))
    log(logfilename, "Learning Rate: {:}".format(args.lr))
    log(logfilename, "Alpha: {:}".format(args.alpha))
    log(logfilename, "Keep ratio: {:}".format(args.keep_ratio))

    # NOTE(review): nothing is ever appended to this list in the visible code.
    test_acc_list = []
    for _ in range(args.round):
        traindir = os.path.join(args.data_train, 'train')
        valdir = os.path.join(args.data_val, 'val')
        # standard ImageNet channel statistics
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

        train_dataset = datasets.ImageFolder(
            traindir,
            transforms.Compose([
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]))

        train_sampler = None

        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=args.batch, shuffle=(train_sampler is None),
            num_workers=args.workers, pin_memory=True, sampler=train_sampler)

        test_loader = torch.utils.data.DataLoader(
            datasets.ImageFolder(valdir, transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
            batch_size=args.batch, shuffle=False,
            num_workers=args.workers, pin_memory=True)


        base_classifier = models.__dict__[args.arch](pretrained=True).cuda()
        print("Loaded the base_classifier")

        # baseline accuracy before any pruning, for comparison in the log
        original_acc = model_inference(base_classifier, test_loader,
                                       device, display=True)
        log(logfilename, "Original Model Test Accuracy: {:.5}".format(original_acc))
        print("Original Model Test Accuracy, ", original_acc)

        # Creating a fresh copy of network not affecting the original network.
        net = copy.deepcopy(base_classifier)
        net = net.to(device)


        # Generating the mask 'm': one all-ones trainable mask per prunable layer
        for layer in net.modules():
            if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
                layer.weight_mask = nn.Parameter(torch.ones_like(layer.weight))

                layer.weight.requires_grad = True
                layer.weight_mask.requires_grad = True

            # This is the monkey-patch overriding layer.forward to custom function.
            # layer.forward will pass nn.Linear with weights: 'w' and 'm' elementwised
            if isinstance(layer, nn.Linear):
                layer.forward = types.MethodType(mask_forward_linear, layer)

            if isinstance(layer, nn.Conv2d):
                layer.forward = types.MethodType(mask_forward_conv2d, layer)


        criterion = nn.CrossEntropyLoss().to(device)    # I added Log Softmax layer to all architecture.
        optimizer = SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                        weight_decay=0) # weight_decay = 0 for training the mask.
 
        sparsity, total = 0, 0
        breakFlag = False
        net.train()
        # Training the mask with the training set.
        # The epoch cap is a large sentinel; the real stop is the sparsity
        # target check inside the batch loop.
        for epoch in range(100000):
#             if epoch % 5 == 0:
            print("Current epochs: ", epoch)
            print("Sparsity: {:}".format(sparsity))
            log(logfilename, "Current epochs: {}".format(epoch))
            log(logfilename, "Sparsity: {:}".format(sparsity))
            
                
            for i, (inputs, targets) in enumerate(train_loader):
                inputs = inputs.cuda()
                targets = targets.cuda()

                # L1 penalty over all masks pushes mask entries toward zero
                reg_loss = 0
                for layer in net.modules():
                    if isinstance(layer, nn.Conv2d) or isinstance(layer, nn.Linear):
                        reg_loss += torch.norm(layer.weight_mask, p=1)
                outputs = net(inputs)
                loss = criterion(outputs, targets) + args.alpha * reg_loss
                
                # Computing gradient and do SGD
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
#                 if i % 50000 == 0:
#                     print("Entered 50000 loop")
#                     log(logfilename, "Entered 50000 loop")

                # count mask entries still "alive" (magnitude above 1e-3)
                sparsity, total = 0, 0
                for layer in net.modules():
                    if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
                        boolean_list = layer.weight_mask.data > 1e-3
                        sparsity += (boolean_list == 1).sum()
                        total += layer.weight.numel()
                
                if i % 50 == 0:
                    print("Current Epochs: {}, Current i: {}, Current Sparsity: {}".format(epoch, i, sparsity))
                
                # stop once enough mask entries have been driven to ~zero
                if sparsity <= total*args.keep_ratio:
                    print("Current epochs breaking loop at {:}".format(epoch))
                    log(logfilename, "Current epochs breaking loop at {:}".format(epoch))
                    breakFlag = True
                    break
#                 if breakFlag == True:
#                     break
            if breakFlag == True:
                break
            

        # This line allows to calculate the threshold to satisfy the keep_ratio.
        c_abs = []
        for layer in net.modules():
            if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
                c_abs.append(torch.abs(layer.weight_mask))
        
        all_scores = torch.cat([torch.flatten(x) for x in c_abs])
        num_params_to_keep = int(len(all_scores) * args.keep_ratio)
        # threshold = smallest magnitude among the kept top-k entries
        threshold, _ = torch.topk(all_scores, num_params_to_keep, sorted=True)
        threshold = threshold[-1]
        
        print("Threshold found: ", threshold)
        
        keep_masks = []
        for c in c_abs:
            keep_masks.append((c >= threshold).float())
        print("Number of ones.", torch.sum(torch.cat([torch.flatten(x == 1) for x in keep_masks])))
        
        # Updating the weight with elementwise product of update c.
        for layer in net.modules():
            if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
                # We update the weight by elementwise multiplication between
                # weight 'w' and mask 'm'.
                layer.weight.data = layer.weight.data * layer.weight_mask.data
                layer.zeros = nn.Parameter(torch.zeros_like(layer.weight))    # Dummy parameter.
                layer.ones = nn.Parameter(torch.ones_like(layer.weight))      # Dummy parameter.
                layer.weight_mask.data = torch.where(torch.abs(layer.weight_mask) <= threshold,
                                                layer.zeros,
                                                layer.ones)    # Updated weight_mask becomes the mask with element
                                                               # 0 and 1 again.

                # Temporarily disabling the backprop for both 'w' and 'm'.
                layer.weight.requires_grad = False
                layer.weight_mask.requires_grad = False

            if isinstance(layer, nn.Linear):
                layer.forward = types.MethodType(mask_forward_linear, layer)

            if isinstance(layer, nn.Conv2d):
                layer.forward = types.MethodType(mask_forward_conv2d, layer)

#        --------------------------------
        # We need to transfer the weight we learned from "net" to "base_classifier".
        # NOTE(review): `layer1.bias != None` happens to work for Parameters,
        # but the idiomatic (and PyTorch-recommended) check is
        # `layer1.bias is not None`.
        for (layer1, layer2) in zip(base_classifier.modules(), net.modules()):
            if isinstance(layer1, (nn.Linear, nn.Conv2d)) or isinstance(layer2, (nn.Linear, nn.Conv2d)):
                layer1.weight.data = layer2.weight.data
                if layer1.bias != None:
                    layer1.bias.data = layer2.bias.data
                    layer1.bias.requires_grad = True

                layer1.weight.requires_grad = True
                
        

        torch.save(base_classifier.state_dict(), os.path.join(args.outdir, args.save_model))
        base_classifier_acc = model_inference(base_classifier, test_loader, device, display=True)
        log(logfilename, "Weight Update Test Accuracy: {:.5}".format(base_classifier_acc))
        print("Saved the finetune model.")
        # NOTE(review): this loop only rebinds the loop variable; it does not
        # modify `keep_masks` in place — confirm the intent was a no-op.
        for masks in keep_masks:
            masks = masks.data
            
        torch.save(keep_masks, os.path.join(args.outdir, args.keep_mask))
        print("Saved the masking function.")
        log(logfilename, "Finished finding the mask. (FINETUNE)")
Beispiel #21
0
def train(train_source_iter: ForeverDataIterator,
          train_target_iter: ForeverDataIterator, model: ImageClassifier,
          adaptive_feature_norm: AdaptiveFeatureNorm, optimizer: SGD,
          epoch: int, args: argparse.Namespace):
    """Run one epoch of Adaptive Feature Norm (AFN) domain-adaptation training.

    Each iteration draws one source and one target batch, combines the source
    cross-entropy with the adaptive feature-norm loss on both domains
    (weighted by ``args.trade_off_norm``) and, optionally, a target-entropy
    minimization term (``args.trade_off_entropy``), then takes one SGD step.
    Running statistics are accumulated in meters and printed every
    ``args.print_freq`` iterations.
    """
    # running statistics shown by the progress meter
    batch_time = AverageMeter('Time', ':3.1f')
    data_time = AverageMeter('Data', ':3.1f')
    cls_losses = AverageMeter('Cls Loss', ':3.2f')
    norm_losses = AverageMeter('Norm Loss', ':3.2f')
    src_feature_norm = AverageMeter('Source Feature Norm', ':3.2f')
    tgt_feature_norm = AverageMeter('Target Feature Norm', ':3.2f')
    cls_accs = AverageMeter('Cls Acc', ':3.1f')
    tgt_accs = AverageMeter('Tgt Acc', ':3.1f')
    progress = ProgressMeter(
        args.iters_per_epoch,
        [batch_time, data_time, cls_losses, norm_losses, src_feature_norm,
         tgt_feature_norm, cls_accs, tgt_accs],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    tick = time.time()
    for step in range(args.iters_per_epoch):
        src_images, src_labels = next(train_source_iter)
        tgt_images, tgt_labels = next(train_target_iter)

        src_images, src_labels = src_images.to(device), src_labels.to(device)
        tgt_images, tgt_labels = tgt_images.to(device), tgt_labels.to(device)

        # time spent fetching / moving data
        data_time.update(time.time() - tick)

        # forward both domains through the shared model
        src_logits, src_feats = model(src_images)
        tgt_logits, tgt_feats = model(tgt_images)

        # supervised loss on source + feature-norm loss on both domains
        cls_loss = F.cross_entropy(src_logits, src_labels)
        norm_loss = adaptive_feature_norm(src_feats) \
            + adaptive_feature_norm(tgt_feats)
        loss = cls_loss + norm_loss * args.trade_off_norm

        # optional entropy minimization on target predictions
        if args.trade_off_entropy:
            tgt_logits = F.softmax(tgt_logits, dim=1)
            entropy_loss = entropy(tgt_logits, reduction='mean')
            loss += entropy_loss * args.trade_off_entropy

        # one SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # bookkeeping (all weighted by the source batch size, as before)
        batch = src_images.size(0)
        cls_acc = accuracy(src_logits, src_labels)[0]
        tgt_acc = accuracy(tgt_logits, tgt_labels)[0]
        cls_losses.update(cls_loss.item(), batch)
        norm_losses.update(norm_loss.item(), batch)
        src_feature_norm.update(src_feats.norm(p=2, dim=1).mean().item(), batch)
        tgt_feature_norm.update(tgt_feats.norm(p=2, dim=1).mean().item(), batch)
        cls_accs.update(cls_acc.item(), batch)
        tgt_accs.update(tgt_acc.item(), batch)

        # wall-clock time for the whole iteration
        batch_time.update(time.time() - tick)
        tick = time.time()

        if step % args.print_freq == 0:
            progress.display(step)
Beispiel #22
0
def train():
    """Fit linear regressions predicting bug counts from code-entropy metrics.

    Loads the Eclipse data set (column 0 = bug count, columns 1-5 = five
    entropy variants), shuffles it, splits roughly 80/10/10 into
    train/val/test, and for every entropy type sweeps a per-type range of
    learning rates, training a fresh 1-in/1-out ``LinearRegressionModel``
    for 2001 epochs per rate.  A model is saved (together with a JSON file
    of its loss histories) whenever its test loss beats the best previously
    saved model for that entropy type.  Finally, the train/validation loss
    curves of the LAST run are plotted.
    """
    # Load data
    data_set_path = path.join(path.abspath(path.dirname(__file__)), '../resources/eclipse-data-set.csv')
    data = np.genfromtxt(data_set_path, delimiter=';', skip_header=1, usecols=[1, 2, 3, 4, 5, 6])
    np.random.shuffle(data)

    # 80% train, and the remainder split evenly between val and test
    n_rows = data.shape[0]
    train_rows = int(n_rows*0.8)
    test_rows = int((n_rows - train_rows)/2)

    # x_input = Variable(Tensor(data[:, 3].reshape((-1, 1))))  # third column is linear entropy
    # x_input = Variable(Tensor([[1.0], [2.0], [3.0]]))
    # y_truth = Variable(Tensor([[2.0], [4.0], [6.0]]))

    epochs = 2001
    criterion = MSELoss()

    # column index (after usecols remapping) -> entropy variant name
    entropy_map = {
        1: 'full_not_decayed',
        2: 'weighted_not_decayed',
        3: 'full_linear_decayed',
        4: 'full_log_decayed',
        5: 'full_exp_decayed'
    }

    # per-variant (start, stop) bounds for the learning-rate sweep
    learing_rate_map = {
        1: (0.0000005, 0.000001),
        2: (0.0001, 0.005),
        3: (0.0001, 0.005),
        4: (0.0001, 0.005),
        5: (0.0001, 0.005)
    }
    for entropy_col, entropy_type in entropy_map.items():
        # NOTE(review): the `train_rows+1` / `+test_rows+1` offsets below skip
        # one row at each split boundary — confirm that is intentional.
        y_train = Variable(Tensor(data[:train_rows, 0].reshape((-1, 1))))  # first column is number of bugs
        x_train = Variable(Tensor(data[:train_rows, entropy_col].reshape((-1, 1))))  # third column is weighted entropy
        y_val = Variable(Tensor(data[train_rows+1:train_rows+test_rows, 0].reshape((-1, 1))))  # first column is number of bugs
        x_val = Variable(Tensor(data[train_rows+1:train_rows+test_rows, entropy_col].reshape((-1, 1))))  # third column is weighted entropy
        y_test = Variable(Tensor(data[train_rows+test_rows+1:, 0].reshape((-1, 1))))  # first column is number of bugs
        x_test = Variable(Tensor(data[train_rows+test_rows+1:, entropy_col].reshape((-1, 1))))  # third column is weighted entropy

        model_file_name = entropy_type + '_hcm'
        model_dir = path.normpath(path.join(path.abspath(path.dirname(__file__)), '../resources/models'))
        model_file_path = path.join(model_dir, model_file_name + '.pt')

        # sweep of candidate learning rates for this entropy variant
        learing_rates = np.linspace(*learing_rate_map[entropy_col])

        # Try to load model; its test loss is the bar a new model must beat
        old_model = LinearRegressionModel(1, 1)
        try:
            old_model.load_state_dict(load(model_file_path))
        except FileNotFoundError:
            print('File="{}" was not found. Create new model.'.format(model_file_path))
        y_test_pred_old = old_model(x_test)
        y_test_loss_old = criterion(y_test_pred_old, y_test)

        for learing_rate in learing_rates:
            train_loss_list = []
            val_loss_list = []
            test_loss_list = []

            print('Train for entropy type="{0}" and learning rate="{1}"'.format(entropy_type, learing_rate))
            model = LinearRegressionModel(1, 1)

            optimizer = SGD(model.parameters(), lr=learing_rate)
            for epoch in range(epochs):

                # Forward pass: Compute predicted y by passing
                # x to the model
                y_predicted = model(x_train)

                # Compute and print loss
                train_loss = criterion(y_predicted, y_train)
                train_loss_list.append(float(train_loss.data))

                # Val loss
                y_val_pred = model(x_val)
                val_loss = criterion(y_val_pred, y_val)
                val_loss_list.append(float(val_loss.data))

                # Zero
                optimizer.zero_grad()
                train_loss.backward()
                optimizer.step()

                # track held-out test loss every epoch
                y_test_pred = model(x_test)
                test_loss = criterion(y_test_pred, y_test)
                test_loss_list.append(float(test_loss.data))

                # every 50 epochs: persist the model if it beats the best
                # previously saved one for this entropy type
                if epoch % 50 == 0:
                    print('epoch {0}, loss {1}'.format(epoch, train_loss.data))
                    if test_loss.data < y_test_loss_old.data:
                        print('Save model with prediction error {}.'.format(test_loss))
                        save(model.state_dict(), model_file_path)
                        meta_data = {
                            'history_complexity_metric': entropy_type,
                            'learning_rate': learing_rate,
                            'val_loss': val_loss_list,
                            'train_loss': train_loss_list,
                            'test_loss': test_loss_list
                        }

                        with open(path.join(model_dir, model_file_name + '_meta.json'), 'w') as write_file:
                            json.dump(meta_data, write_file, indent=4)

                        y_test_loss_old = test_loss

    # Compute and print loss
    # NOTE(review): everything below uses the loop variables from the LAST
    # entropy type / learning rate only — confirm that is the intent.
    test_loss = criterion(y_test_pred, y_test)
    print('Test loss {0}'.format(test_loss.data))

    train_plt, = plt.plot(range(epochs), train_loss_list, label='Train Loss')
    val_plt, = plt.plot(range(epochs), val_loss_list, label='Validation Loss')
    plt.xlabel('Number of Epochs')
    plt.ylabel('Loss')
    plt.legend(handles=[train_plt, val_plt])
    plt.show()
Beispiel #23
0
def train(start_path, beta):
    """Continue a multi-task MultiMNIST run from a saved Pareto solution.

    Loads network weights from ``start_path``, then performs ``num_steps``
    optimizer steps whose gradients are produced by a MINRES-based KKT solver
    steered by the preference vector ``beta``, evaluating and checkpointing
    after every step into ``cpmtl/<checkpoint name>``.

    :param start_path: ``pathlib.Path`` to a checkpoint containing a
        ``state_dict`` entry (raises RuntimeError if missing).
    :param beta: preference/direction tensor; moved to the training device.
    """

    # prepare hyper-parameters

    seed = 42

    cuda_enabled = True
    cuda_deterministic = False

    batch_size = 2048
    num_workers = 2

    shared = False

    # KKT / MINRES solver settings
    stochastic = False
    kkt_momentum = 0.0
    create_graph = False
    grad_correction = False
    shift = 0.0
    tol = 1e-5
    damping = 0.1
    maxiter = 50

    # SGD settings
    lr = 0.1
    momentum = 0.0
    weight_decay = 0.0

    num_steps = 10

    verbose = False

    # prepare path

    ckpt_name = start_path.name.split('.')[0]
    root_path = Path(__file__).resolve().parent
    dataset_path = root_path / 'MultiMNIST'
    ckpt_path = root_path / 'cpmtl' / ckpt_name

    if not start_path.is_file():
        raise RuntimeError('Pareto solutions not found.')

    root_path.mkdir(parents=True, exist_ok=True)
    dataset_path.mkdir(parents=True, exist_ok=True)
    ckpt_path.mkdir(parents=True, exist_ok=True)

    # fix random seed

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda_enabled and torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # prepare device; cudnn.deterministic trades speed for reproducibility

    if cuda_enabled and torch.cuda.is_available():
        import torch.backends.cudnn as cudnn
        device = torch.device('cuda')
        if cuda_deterministic:
            cudnn.benchmark = False
            cudnn.deterministic = True
        else:
            cudnn.benchmark = True
    else:
        device = torch.device('cpu')

    # prepare dataset

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    trainset = MultiMNIST(dataset_path,
                          train=True,
                          download=True,
                          transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=num_workers)

    testset = MultiMNIST(dataset_path,
                         train=False,
                         download=True,
                         transform=transform)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=num_workers)

    # prepare network

    network = MultiLeNet()
    network.to(device)

    # initialize network from the provided Pareto solution

    start_ckpt = torch.load(start_path, map_location='cpu')
    network.load_state_dict(start_ckpt['state_dict'])

    # prepare losses: one closure per task (left / right digit)

    criterion = F.cross_entropy
    closures = [
        lambda n, l, t: criterion(l[0], t[:, 0]),
        lambda n, l, t: criterion(l[1], t[:, 1])
    ]

    # prepare HVP solver (Hessian-vector products for the KKT system)

    hvp_solver = VisionHVPSolver(network,
                                 device,
                                 trainloader,
                                 closures,
                                 shared=shared)
    hvp_solver.set_grad(batch=False)
    hvp_solver.set_hess(batch=True)

    # prepare KKT solver

    kkt_solver = MINRESKKTSolver(network,
                                 hvp_solver,
                                 device,
                                 stochastic=stochastic,
                                 kkt_momentum=kkt_momentum,
                                 create_graph=create_graph,
                                 grad_correction=grad_correction,
                                 shift=shift,
                                 tol=tol,
                                 damping=damping,
                                 maxiter=maxiter)

    # prepare optimizer

    optimizer = SGD(network.parameters(),
                    lr=lr,
                    momentum=momentum,
                    weight_decay=weight_decay)

    # first evaluation (baseline before any update step)

    losses, tops = evaluate(network, testloader, device, closures,
                            f'{ckpt_name}')

    # prepare utilities
    top_trace = TopTrace(len(closures))
    top_trace.print(tops, show=False)

    beta = beta.to(device)

    # training: each step's gradients come from the KKT solver, not autograd

    for step in range(1, num_steps + 1):

        network.train(True)
        optimizer.zero_grad()
        kkt_solver.backward(beta, verbose=verbose)
        optimizer.step()

        losses, tops = evaluate(network, testloader, device, closures,
                                f'{ckpt_name}: {step}/{num_steps}')

        top_trace.print(tops)

        # checkpoint after every step
        ckpt = {
            'state_dict': network.state_dict(),
            'optimizer': optimizer.state_dict(),
            'beta': beta,
        }
        record = {'losses': losses, 'tops': tops}
        ckpt['record'] = record
        torch.save(ckpt, ckpt_path / f'{step:d}.pth')

    hvp_solver.close()
Beispiel #24
0
def train(train_source_iter: ForeverDataIterator,
          train_target_iter: ForeverDataIterator, model: ImageClassifier,
          domain_adv_D: DomainAdversarialLoss,
          domain_adv_D_0: DomainAdversarialLoss, importance_weight_module,
          optimizer: SGD, lr_scheduler: LambdaLR, epoch: int,
          args: argparse.Namespace):
    """Run one epoch of partial-domain adversarial adaptation training.

    Each iteration combines: source cross-entropy, an auxiliary adversarial
    loss on *detached* features (discriminator D, used to estimate importance
    weights), an importance-weighted adversarial loss (discriminator D_0),
    and a target-entropy term; then takes one optimizer + LR-scheduler step.
    Running metrics are printed every ``args.print_freq`` iterations.
    """
    batch_time = AverageMeter('Time', ':5.2f')
    data_time = AverageMeter('Data', ':5.2f')
    losses = AverageMeter('Loss', ':6.2f')
    cls_accs = AverageMeter('Cls Acc', ':3.1f')
    tgt_accs = AverageMeter('Tgt Acc', ':3.1f')
    domain_accs_D = AverageMeter('Domain Acc for D', ':3.1f')
    domain_accs_D_0 = AverageMeter('Domain Acc for D_0', ':3.1f')
    partial_classes_weights = AverageMeter('Partial Weight', ':3.2f')
    non_partial_classes_weights = AverageMeter('Non-Partial Weight', ':3.2f')

    progress = ProgressMeter(args.iters_per_epoch, [
        batch_time, data_time, losses, cls_accs, tgt_accs, domain_accs_D,
        domain_accs_D_0, partial_classes_weights, non_partial_classes_weights
    ],
                             prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()
    domain_adv_D.train()
    domain_adv_D_0.train()

    end = time.time()
    for i in range(args.iters_per_epoch):
        x_s, labels_s = next(train_source_iter)
        x_t, labels_t = next(train_target_iter)

        x_s = x_s.to(device)
        x_t = x_t.to(device)
        labels_s = labels_s.to(device)
        labels_t = labels_t.to(device)

        # measure data loading time
        data_time.update(time.time() - end)

        # compute output: one forward pass over the concatenated batch,
        # then split back into source/target halves
        x = torch.cat((x_s, x_t), dim=0)
        y, f = model(x)
        y_s, y_t = y.chunk(2, dim=0)
        f_s, f_t = f.chunk(2, dim=0)

        # classification loss
        cls_loss = F.cross_entropy(y_s, labels_s)

        # domain adversarial loss for D (detached: D does not backprop
        # into the feature extractor)
        adv_loss_D = domain_adv_D(f_s.detach(), f_t.detach())

        # get importance weights
        w_s = importance_weight_module.get_importance_weight(f_s)
        # domain adversarial loss for D_0
        adv_loss_D_0 = domain_adv_D_0(f_s, f_t, w_s=w_s)

        # entropy loss on target predictions
        y_t = F.softmax(y_t, dim=1)
        entropy_loss = entropy(y_t, reduction='mean')

        loss = cls_loss + 1.5 * args.trade_off * adv_loss_D + \
               args.trade_off * adv_loss_D_0 + args.gamma * entropy_loss

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        cls_acc = accuracy(y_s, labels_s)[0]
        tgt_acc = accuracy(y_t, labels_t)[0]

        losses.update(loss.item(), x_s.size(0))
        cls_accs.update(cls_acc.item(), x_s.size(0))
        tgt_accs.update(tgt_acc.item(), x_s.size(0))
        domain_accs_D.update(domain_adv_D.domain_discriminator_accuracy,
                             x_s.size(0))
        domain_accs_D_0.update(domain_adv_D_0.domain_discriminator_accuracy,
                               x_s.size(0))

        # debug: output class weight averaged on the partial classes and non-partial classes respectively
        partial_class_weight, non_partial_classes_weight = \
            importance_weight_module.get_partial_classes_weight(w_s, labels_s)
        partial_classes_weights.update(partial_class_weight.item(),
                                       x_s.size(0))
        non_partial_classes_weights.update(non_partial_classes_weight.item(),
                                           x_s.size(0))

        # measure elapsed wall-clock time for the whole iteration
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
class ELECTRATrainer:
    """
    Trains an ELECTRA-style discriminator model.

    Wraps an :class:`AttentionModel`, moves it to the available device
    (optionally under ``nn.DataParallel`` on multi-GPU machines), optimizes
    it with SGD + momentum against per-token labels using an MSE loss, and
    reports accuracy / AUC / AUPR to stdout and an optional CSV log file.
    """

    def __init__(self,
                 electra: AttentionModel,
                 vocab_size: int,
                 train_dataloader: DataLoader,
                 train_orig_dataloader: DataLoader,
                 test_dataloader: DataLoader = None,
                 lr: float = 1e-4,
                 betas=(0.9, 0.999),
                 weight_decay: float = 0.01,
                 warmup_steps=10000,
                 with_cuda: bool = True,
                 cuda_devices=None,
                 log_freq: int = 100,
                 log_file=None,
                 freeze_embed=0,
                 class_weights=None):
        """
        :param electra: ELECTRA model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param train_orig_dataloader: loader over the original training
            distribution, evaluated by :meth:`train_orig_dist`
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam betas — kept for API compatibility; the active
            optimizer is SGD and ignores them
        :param weight_decay: Adam weight decay — unused by the SGD setup here
        :param warmup_steps: warmup steps for the (commented-out)
            ScheduledOptim path; currently unused
        :param with_cuda: train with cuda if available
        :param cuda_devices: explicit device ids for DataParallel
        :param log_freq: logging frequency of the batch iteration
        :param log_file: optional CSV file; truncated and given a header here,
            appended to once per epoch by :meth:`iteration`
        :param freeze_embed: 0 = train embeddings; 1 = freeze the whole
            embedding layer; 2 = zero the gradients of the first 26726
            embedding rows every step
        :param class_weights: unused; kept for API compatibility
        """
        # NOTE(review): Softmax() without dim= is deprecated and the attribute
        # is unused in this class; kept so external callers do not break.
        self.softmax = torch.nn.Softmax()
        # Setup cuda device for ELECTRA training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")
        self.hardware = "cuda" if cuda_condition else "cpu"

        self.freeze_embed = freeze_embed

        self.electra = electra.to(self.device).float()
        # Regression-style loss over the per-token discriminator scores.
        # Alternative (see commented variants in iteration()): CrossEntropyLoss.
        self.loss = nn.MSELoss()
        self.loss.to(self.device)

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for ELECTRA" % torch.cuda.device_count())
            self.electra = nn.DataParallel(self.electra,
                                           device_ids=cuda_devices)
            self.hardware = "parallel"

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.train_orig_data = train_orig_dataloader
        self.test_data = test_dataloader

        # Configure which parameters train before building the optimizer.
        if freeze_embed == 1:
            if self.hardware == "parallel":
                self.electra.module.embed_layer.weight.requires_grad = False
            else:
                self.electra.embed_layer.weight.requires_grad = False
        elif freeze_embed == 2:
            # Rows of the embedding matrix whose gradient is zeroed each step.
            self.freeze_embed_idx = torch.arange(26726, dtype=torch.long).to(
                self.device)
        self.optim = SGD([
            param for param in self.electra.parameters()
            if param.requires_grad
        ],
                         lr=lr,
                         momentum=0.9)

        self.log_freq = log_freq

        # BUGFIX: always define self.log_file so iteration() can safely test
        # it — previously the attribute only existed when log_file was truthy,
        # making iteration() raise AttributeError without a log file.
        self.log_file = log_file
        if self.log_file:
            # clear log file and write the CSV header
            with open(self.log_file, "w+") as f:
                f.write(
                    "EPOCH,MODE, AVG LOSS, TOTAL CORRECT, TOTAL ELEMENTS, ACCURACY, AUC, AUPR, TOTAL POSITIVE CORRECT, TOTAL POSITIVE, ACCURACY\n"
                )
        print("Total Parameters:",
              sum([p.nelement() for p in self.electra.parameters()]))

    @staticmethod
    def calc_auc(y_true, y_probas, show_plot=False):
        """Return ROC-AUC for binary labels vs. raw scores; optionally plot."""
        fpr, tpr, thresholds = metrics.roc_curve(y_true, y_probas, pos_label=1)
        auc_score = metrics.auc(fpr, tpr)
        if show_plot:
            plt.figure()
            plt.plot(fpr, tpr)
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver operating characteristic')
            plt.show()
        return auc_score

    @staticmethod
    def calc_aupr(y_true, y_probas, show_plot=False):
        """Return area under the precision-recall curve; optionally plot."""
        precision, recall, thresholds = metrics.precision_recall_curve(
            y_true, y_probas, pos_label=1)
        aupr_score = metrics.auc(recall, precision)
        if show_plot:
            plt.figure()
            plt.plot(recall, precision)
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            # BUGFIX: title previously copy-pasted from the ROC plot.
            plt.title('Precision-Recall curve')
            plt.show()
        return aupr_score

    def train(self, epoch):
        """Run one training epoch over the training loader."""
        self.electra.train()
        self.iteration(epoch, self.train_data, True, "train")

    def train_orig_dist(self, epoch):
        """Evaluate (no backprop) on the original training distribution."""
        self.electra.eval()
        self.iteration(epoch, self.train_orig_data, False, "train_orig")

    def test(self, epoch):
        """Evaluate (no backprop) on the test loader."""
        self.electra.eval()
        self.iteration(epoch, self.test_data, False, "test")

    def iteration(self, epoch, data_loader, train, str_code):
        """
        Loop over data_loader for one epoch of training or evaluation.
        If train is True, the backward pass and optimizer step are run.

        :param epoch: current epoch index
        :param data_loader: torch.utils.data.DataLoader for iteration
        :param train: boolean value of is train or test
        :param str_code: tag for progress/log output
            ("train", "train_orig" or "test")
        :return: None
        """

        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        cumulative_loss = 0.0

        total_correct = 0
        total_samples = 0
        total_positive_correct = 0
        total_positive = 0
        all_scores = []
        all_labels = []
        for i, data in data_iter:
            # Keep a CPU copy of the labels for epoch-level AUC/AUPR.
            all_labels.append(data["electra_label"])
            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            # Create the attention mask: positions with zero species frequency
            # are treated as padding (mask 0), everything else is attended to.
            zero_boolean = torch.eq(data["species_frequencies"], 0)
            mask = torch.ones(zero_boolean.shape,
                              dtype=torch.float).to(self.device)
            mask = mask.masked_fill(zero_boolean, 0)

            # 1. forward pass of the discriminator
            scores = self.electra.forward(data["electra_input"], mask)
            # 2. MSE loss against the binary per-token labels
            # (cross-entropy variant: self.loss(scores, data["electra_label"].squeeze()))
            loss = self.loss(scores, data["electra_label"].float())
            # 3. backward and optimization only in train
            if train:
                self.optim.zero_grad()
                loss.backward()
                if self.freeze_embed == 2:
                    # Zero the gradient on the frozen embedding rows so the
                    # optimizer step leaves them untouched.
                    if self.hardware == "parallel":
                        self.electra.module.embed_layer.weight.grad[
                            self.freeze_embed_idx] = 0
                    else:
                        self.electra.embed_layer.weight.grad[
                            self.freeze_embed_idx] = 0
                self.optim.step()

            all_scores.append(scores.detach().cpu())
            # Threshold MSE scores at 0.5 for binary predictions
            # (cross-entropy variant: scores.max(1).indices).
            predictions = scores >= 0.5

            # Accuracy over all tokens.
            total_correct += torch.sum(
                predictions == data["electra_label"]).item()
            total_samples += data["electra_input"].shape[0]

            # Accuracy restricted to positive (nonzero-label) tokens.
            positive_inds = data["electra_label"].nonzero(as_tuple=True)
            total_positive_correct += torch.sum(
                predictions[positive_inds] == data["electra_label"]
                [positive_inds]).item()
            total_positive += data["electra_label"].nonzero().shape[0]

            if self.hardware == "parallel":
                # DataParallel can return one loss per replica — reduce first.
                log_loss = loss.sum().item()
            else:
                log_loss = loss.item()
            cumulative_loss += log_loss
            if i % self.log_freq == 0:
                if total_positive > 0:
                    data_iter.write(
                        "epoch: {}, iter: {}, avg loss: {},accuracy: {}/{}={:.2f}%,pos accuracy: {}/{}={:.2f}%, loss: {}"
                        .format(epoch, i, cumulative_loss / (i + 1),
                                total_correct, total_samples,
                                total_correct / total_samples * 100,
                                total_positive_correct, total_positive,
                                total_positive_correct / total_positive * 100,
                                log_loss))
                else:
                    data_iter.write(
                        "epoch: {}, iter: {}, avg loss: {},accuracy: {}/{}={:.2f}%,pos accuracy: 0/0, loss: {}"
                        .format(epoch, i, cumulative_loss / (i + 1),
                                total_correct, total_samples,
                                total_correct / total_samples * 100, log_loss))

            # Free per-batch tensors eagerly to reduce GPU memory pressure.
            del data
            del mask
            del loss
            del scores
            del predictions
            del positive_inds

        # Epoch-level AUC / AUPR over the raw (pre-threshold) scores.
        # (Cross-entropy variant would use the positive-class column:
        #  torch.cat(all_scores)[:, 1].numpy())
        y_true = torch.cat(all_labels).flatten().numpy()
        y_score = torch.cat(all_scores).flatten().numpy()
        auc_score = ELECTRATrainer.calc_auc(y_true, y_score)
        aupr_score = ELECTRATrainer.calc_aupr(y_true, y_score)

        print("EP{}_{}, avg_loss={}, accuracy={:.2f}%".format(
            epoch, str_code, cumulative_loss / len(data_iter),
            total_correct / total_samples * 100))
        if self.log_file:
            # BUGFIX: guard against division by zero when the epoch contained
            # no positive labels at all.
            positive_accuracy = (total_positive_correct / total_positive * 100
                                 if total_positive > 0 else 0.0)
            with open(self.log_file, "a") as f:
                f.write("{},{},{},{},{},{},{},{},{},{},{}\n".format(
                    epoch, str_code, cumulative_loss / len(data_iter),
                    total_correct, total_samples,
                    total_correct / total_samples * 100, auc_score, aupr_score,
                    total_positive_correct, total_positive,
                    positive_accuracy))

    def save(self, epoch, file_path):
        """
        Saving the current ELECTRA model on file_path

        :param epoch: current epoch number
        :param file_path: model output directory
        """
        output_file_path = file_path + "_epoch{}".format(epoch)
        # Unwrap DataParallel before saving so checkpoints are wrapper-free.
        if self.hardware == "parallel":
            self.electra.module.discriminator.save_pretrained(
                output_file_path + "_disc")
            torch.save(self.electra.module.embed_layer.state_dict(),
                       output_file_path + "_embed")
        else:
            self.electra.discriminator.save_pretrained(output_file_path +
                                                       "_disc")
            torch.save(self.electra.embed_layer.state_dict(),
                       output_file_path + "_embed")
def train(class_num,
          epoch_num,
          config,
          x_train,
          y_train,
          x_val,
          y_val,
          seed=32):
    """Train a ResNet-18 classifier on in-memory arrays; return val error.

    :param class_num: number of target classes.
    :param epoch_num: number of training epochs (coerced to int).
    :param config: dict with keys 'train_batch_size', 'init_lr',
        'lr_decay_factor', 'weight_decay', 'momentum' and 'nesterov'
        (the latter stored as the string 'True'/'False').
    :param x_train: training images, NHWC numpy array.
    :param y_train: training labels, numpy array.
    :param x_val: validation images, NHWC numpy array.
    :param y_val: validation labels, numpy array.
    :param seed: unused; kept for interface compatibility.
    :return: 1 - final-epoch validation accuracy (an error rate), suitable
        as a minimization objective for hyper-parameter search.
    """
    epoch_num = int(epoch_num)
    print(epoch_num, config)

    train_batch_size = config['train_batch_size']
    init_lr = config['init_lr']
    lr_decay_factor = config['lr_decay_factor']
    weight_decay = config['weight_decay']
    momentum = config['momentum']
    # The config serializes booleans as strings.
    nesterov = config['nesterov'] == 'True'

    from torchvision.models.resnet import resnet18
    model = resnet18(num_classes=class_num).to(gpu_device)

    # Convert images from NHWC to the NCHW layout torchvision models expect.
    x_train = np.transpose(x_train, (0, 3, 1, 2))
    x_val = np.transpose(x_val, (0, 3, 1, 2))

    train_dataset = TensorDataset(torch.from_numpy(x_train),
                                  torch.from_numpy(y_train))
    val_dataset = TensorDataset(torch.from_numpy(x_val),
                                torch.from_numpy(y_val))

    trainloader = DataLoader(train_dataset,
                             batch_size=train_batch_size,
                             num_workers=5,
                             shuffle=True)
    validloader = DataLoader(val_dataset,
                             batch_size=100,
                             num_workers=5,
                             shuffle=False)

    optimizer = SGD(params=model.parameters(),
                    lr=init_lr,
                    momentum=momentum,
                    weight_decay=weight_decay,
                    nesterov=nesterov)

    # Decay the LR at 1/2 and 3/4 of the training run.
    scheduler = MultiStepLR(
        optimizer,
        milestones=[int(epoch_num / 2),
                    int(epoch_num * 3 / 4)],
        gamma=lr_decay_factor)
    loss_func = nn.CrossEntropyLoss()

    # BUGFIX: pre-initialize so the return below cannot raise NameError
    # when epoch_num == 0.
    val_avg_acc = 0
    for epoch_id in range(epoch_num):
        model.train()
        epoch_avg_loss = 0
        epoch_avg_acc = 0
        val_avg_loss = 0
        val_avg_acc = 0
        num_train_samples = 0
        num_val_samples = 0
        for batch_x, batch_y in trainloader:
            # Cast once here (the previous version converted images twice).
            batch_x, batch_y = batch_x.float(), batch_y.long()
            num_train_samples += len(batch_x)
            logits = model(batch_x.to(gpu_device))
            loss = loss_func(logits, batch_y.to(gpu_device))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate sample-weighted sums; divided by counts below.
            epoch_avg_loss += loss.to('cpu').detach() * len(batch_x)
            prediction = np.argmax(logits.to('cpu').detach().numpy(), axis=-1)
            # sklearn convention is (y_true, y_pred); plain accuracy is
            # symmetric, so this matches the previous behavior.
            epoch_avg_acc += accuracy_score(
                batch_y.to('cpu').detach().numpy(),
                prediction) * len(batch_x)

        epoch_avg_loss /= num_train_samples
        epoch_avg_acc /= num_train_samples

        print('Epoch %d: Train loss %.4f, train acc %.4f' %
              (epoch_id, epoch_avg_loss, epoch_avg_acc))

        # Validation pass (no gradients).
        model.eval()
        with torch.no_grad():
            for batch_x, batch_y in validloader:
                batch_x, batch_y = batch_x.float(), batch_y.long()
                logits = model(batch_x.to(gpu_device))
                val_loss = loss_func(logits, batch_y.to(gpu_device))
                num_val_samples += len(batch_x)
                val_avg_loss += val_loss.to('cpu').detach() * len(batch_x)

                prediction = np.argmax(logits.to('cpu').detach().numpy(),
                                       axis=-1)
                val_avg_acc += accuracy_score(
                    batch_y.to('cpu').detach().numpy(),
                    prediction) * len(batch_x)

            val_avg_loss /= num_val_samples
            val_avg_acc /= num_val_samples
            print('Epoch %d: Val loss %.4f, val acc %.4f' %
                  (epoch_id, val_avg_loss, val_avg_acc))

        scheduler.step()
    # Error rate: minimization objective for hyper-parameter search.
    return 1 - val_avg_acc
Beispiel #27
0
def train(model,
          state,
          path,
          annotations,
          val_path,
          val_annotations,
          resize,
          max_size,
          jitter,
          batch_size,
          iterations,
          val_iterations,
          mixed_precision,
          lr,
          warmup,
          milestones,
          gamma,
          rank=0,
          world=1,
          no_apex=False,
          use_dali=True,
          verbose=True,
          metrics_url=None,
          logdir=None,
          rotate_augment=False,
          augment_brightness=0.0,
          augment_contrast=0.0,
          augment_hue=0.0,
          augment_saturation=0.0,
          regularization_l2=0.0001,
          rotated_bbox=False,
          absolute_angle=False):
    """Train the model on the given dataset.

    Runs the main detection training loop: SGD with linear warmup and
    milestone LR decay, mixed precision via apex AMP (when ``no_apex`` is
    False) or native ``torch.cuda.amp`` otherwise, optional distributed
    training across ``world`` ranks, periodic checkpointing/logging driven by
    a profiler timer, and periodic validation via ``infer()``.

    Resumes iteration count, optimizer and scheduler state from ``state``
    when present, and writes updated state back through ``nn_model.save``.
    Only rank 0 (``is_master``) logs, checkpoints and validates to disk.
    """

    # Prepare model
    # Keep a handle on the unwrapped model: AMP/DDP wrap `model` below, but
    # saving goes through the original object.
    nn_model = model
    stride = model.stride

    # NOTE(review): presumably freezes BatchNorm layers for fine-tuning —
    # confirm against the convert_fixedbn_model helper.
    model = convert_fixedbn_model(model)
    if torch.cuda.is_available():
        model = model.to(memory_format=torch.channels_last).cuda()

    # Setup optimizer and schedule
    optimizer = SGD(model.parameters(),
                    lr=lr,
                    weight_decay=regularization_l2,
                    momentum=0.9)

    is_master = rank == 0
    if not no_apex:
        # apex accepts loss_scale as a string: "dynamic" or a fixed value.
        loss_scale = "dynamic" if use_dali else "128.0"
        model, optimizer = amp.initialize(
            model,
            optimizer,
            opt_level='O2' if mixed_precision else 'O0',
            keep_batchnorm_fp32=True,
            loss_scale=loss_scale,
            verbosity=is_master)

    if world > 1:
        # Native DDP on the no-apex path, apex DDP otherwise.
        model = DDP(model, device_ids=[rank]) if no_apex else ADDP(model)
    model.train()

    # Resume optimizer state if checkpointed.
    if 'optimizer' in state:
        optimizer.load_state_dict(state['optimizer'])

    def schedule(train_iter):
        # LR multiplier: linear ramp 0.1 -> 1.0 over `warmup` iterations,
        # then gamma**(number of milestones already passed).
        if warmup and train_iter <= warmup:
            return 0.9 * train_iter / warmup + 0.1
        return gamma**len([m for m in milestones if m <= train_iter])

    scheduler = LambdaLR(optimizer, schedule)
    if 'scheduler' in state:
        scheduler.load_state_dict(state['scheduler'])

    # Prepare dataset
    if verbose: print('Preparing dataset...')
    if rotated_bbox:
        if use_dali:
            raise NotImplementedError(
                "This repo does not currently support DALI for rotated bbox detections."
            )
        data_iterator = RotatedDataIterator(
            path,
            jitter,
            max_size,
            batch_size,
            stride,
            world,
            annotations,
            training=True,
            rotate_augment=rotate_augment,
            augment_brightness=augment_brightness,
            augment_contrast=augment_contrast,
            augment_hue=augment_hue,
            augment_saturation=augment_saturation,
            absolute_angle=absolute_angle)
    else:
        # Same constructor signature for the DALI and plain iterators.
        data_iterator = (DaliDataIterator if use_dali else DataIterator)(
            path,
            jitter,
            max_size,
            batch_size,
            stride,
            world,
            annotations,
            training=True,
            rotate_augment=rotate_augment,
            augment_brightness=augment_brightness,
            augment_contrast=augment_contrast,
            augment_hue=augment_hue,
            augment_saturation=augment_saturation)
    if verbose: print(data_iterator)

    if verbose:
        print('    device: {} {}'.format(
            world, 'cpu' if not torch.cuda.is_available() else
            'GPU' if world == 1 else 'GPUs'))
        print('     batch: {}, precision: {}'.format(
            batch_size, 'mixed' if mixed_precision else 'full'))
        print(' BBOX type:', 'rotated' if rotated_bbox else 'axis aligned')
        print('Training model for {} iterations...'.format(iterations))

    # Create TensorBoard writer
    if is_master and logdir is not None:
        from torch.utils.tensorboard import SummaryWriter
        if verbose:
            print('Writing TensorBoard logs to: {}'.format(logdir))
        writer = SummaryWriter(log_dir=logdir)

    # GradScaler is only exercised on the no-apex (native AMP) path below.
    scaler = GradScaler()
    profiler = Profiler(['train', 'fw', 'bw'])
    iteration = state.get('iteration', 0)
    # Outer loop restarts the data iterator (a new epoch) until the global
    # iteration budget is spent.
    while iteration < iterations:
        cls_losses, box_losses = [], []
        for i, (data, target) in enumerate(data_iterator):
            if iteration >= iterations:
                break

            # Forward pass
            profiler.start('fw')

            optimizer.zero_grad()
            if not no_apex:
                cls_loss, box_loss = model([
                    data.contiguous(memory_format=torch.channels_last), target
                ])
            else:
                # Native mixed precision: autocast the forward only.
                with autocast():
                    cls_loss, box_loss = model([
                        data.contiguous(memory_format=torch.channels_last),
                        target
                    ])
            del data
            profiler.stop('fw')

            # Backward pass
            profiler.start('bw')
            if not no_apex:
                # apex AMP: scale the combined loss before backward.
                with amp.scale_loss(cls_loss + box_loss,
                                    optimizer) as scaled_loss:
                    scaled_loss.backward()
                optimizer.step()
            else:
                scaler.scale(cls_loss + box_loss).backward()
                scaler.step(optimizer)
                scaler.update()

            scheduler.step()

            # Reduce all losses
            cls_loss, box_loss = cls_loss.mean().clone(), box_loss.mean(
            ).clone()
            if world > 1:
                # Average losses across ranks for consistent logging.
                torch.distributed.all_reduce(cls_loss)
                torch.distributed.all_reduce(box_loss)
                cls_loss /= world
                box_loss /= world
            if is_master:
                cls_losses.append(cls_loss)
                box_losses.append(box_loss)

            # Abort early on NaN/inf loss rather than training to garbage.
            if is_master and not isfinite(cls_loss + box_loss):
                raise RuntimeError('Loss is diverging!\n{}'.format(
                    'Try lowering the learning rate.'))

            del cls_loss, box_loss
            profiler.stop('bw')

            iteration += 1
            profiler.bump('train')
            # Log + checkpoint roughly once a minute of accumulated train
            # time, and always at the final iteration.
            if is_master and (profiler.totals['train'] > 60
                              or iteration == iterations):
                focal_loss = torch.stack(list(cls_losses)).mean().item()
                box_loss = torch.stack(list(box_losses)).mean().item()
                learning_rate = optimizer.param_groups[0]['lr']
                if verbose:
                    msg = '[{:{len}}/{}]'.format(iteration,
                                                 iterations,
                                                 len=len(str(iterations)))
                    msg += ' focal loss: {:.3f}'.format(focal_loss)
                    msg += ', box loss: {:.3f}'.format(box_loss)
                    msg += ', {:.3f}s/{}-batch'.format(profiler.means['train'],
                                                       batch_size)
                    msg += ' (fw: {:.3f}s, bw: {:.3f}s)'.format(
                        profiler.means['fw'], profiler.means['bw'])
                    msg += ', {:.1f} im/s'.format(batch_size /
                                                  profiler.means['train'])
                    msg += ', lr: {:.2g}'.format(learning_rate)
                    print(msg, flush=True)

                if is_master and logdir is not None:
                    writer.add_scalar('focal_loss', focal_loss, iteration)
                    writer.add_scalar('box_loss', box_loss, iteration)
                    writer.add_scalar('learning_rate', learning_rate,
                                      iteration)
                    del box_loss, focal_loss

                if metrics_url:
                    post_metrics(
                        metrics_url, {
                            'focal loss': mean(cls_losses),
                            'box loss': mean(box_losses),
                            'im_s': batch_size / profiler.means['train'],
                            'lr': learning_rate
                        })

                # Save model weights
                state.update({
                    'iteration': iteration,
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict(),
                })
                # Block SIGINT so a Ctrl-C cannot corrupt the checkpoint.
                with ignore_sigint():
                    nn_model.save(state)

                profiler.reset()
                del cls_losses[:], box_losses[:]

            # Periodic validation (and always at the final iteration).
            if val_annotations and (iteration == iterations
                                    or iteration % val_iterations == 0):
                stats = infer(model,
                              val_path,
                              None,
                              resize,
                              max_size,
                              batch_size,
                              annotations=val_annotations,
                              mixed_precision=mixed_precision,
                              is_master=is_master,
                              world=world,
                              use_dali=use_dali,
                              no_apex=no_apex,
                              is_validation=True,
                              verbose=False,
                              rotated_bbox=rotated_bbox)
                # infer() switches to eval mode; restore training mode.
                model.train()
                if is_master and logdir is not None and stats is not None:
                    # NOTE(review): several tag names below ("[email protected]")
                    # look mangled (likely originally mAP@0.5IoU etc.) —
                    # confirm against the upstream source.
                    writer.add_scalar('Validation_Precision/mAP', stats[0],
                                      iteration)
                    writer.add_scalar('Validation_Precision/[email protected]',
                                      stats[1], iteration)
                    writer.add_scalar('Validation_Precision/[email protected]',
                                      stats[2], iteration)
                    writer.add_scalar('Validation_Precision/mAP (small)',
                                      stats[3], iteration)
                    writer.add_scalar('Validation_Precision/mAP (medium)',
                                      stats[4], iteration)
                    writer.add_scalar('Validation_Precision/mAP (large)',
                                      stats[5], iteration)
                    writer.add_scalar('Validation_Recall/mAR (max 1 Dets)',
                                      stats[6], iteration)
                    writer.add_scalar('Validation_Recall/mAR (max 10 Dets)',
                                      stats[7], iteration)
                    writer.add_scalar('Validation_Recall/mAR (max 100 Dets)',
                                      stats[8], iteration)
                    writer.add_scalar('Validation_Recall/mAR (small)',
                                      stats[9], iteration)
                    writer.add_scalar('Validation_Recall/mAR (medium)',
                                      stats[10], iteration)
                    writer.add_scalar('Validation_Recall/mAR (large)',
                                      stats[11], iteration)

            # NOTE(review): the exit condition differs between bbox modes
            # (== for axis-aligned, > for rotated) — looks intentional but
            # asymmetric; confirm before changing.
            if (iteration == iterations
                    and not rotated_bbox) or (iteration > iterations
                                              and rotated_bbox):
                break

    if is_master and logdir is not None:
        writer.close()
Beispiel #28
0
class Trainer(object):
    """
    Trainer encapsulates all the logic necessary for
    training the Recurrent Attention Model.

    All hyperparameters are provided by the user in the
    config file.
    """
    def __init__(self, config, data_loader):
        """
        Construct a new Trainer instance.

        Args
        ----
        - config: object containing command line arguments.
        - data_loader: data iterator
        """
        self.config = config

        # glimpse network params
        self.patch_size = config.patch_size
        self.glimpse_scale = config.glimpse_scale
        self.num_patches = config.num_patches
        self.loc_hidden = config.loc_hidden
        self.glimpse_hidden = config.glimpse_hidden

        # core network params
        self.num_glimpses = config.num_glimpses
        self.hidden_size = config.hidden_size

        # reinforce params
        self.std = config.std
        self.M = config.M

        # data params
        if config.is_train:
            self.train_loader = data_loader[0]
            self.valid_loader = data_loader[1]
            self.num_train = len(self.train_loader.sampler.indices)
            self.num_valid = len(self.valid_loader.sampler.indices)
        else:
            self.test_loader = data_loader
            self.num_test = len(self.test_loader.dataset)
        self.num_classes = 10
        self.num_channels = 1

        # training params
        self.epochs = config.epochs
        self.start_epoch = 0
        self.momentum = config.momentum
        self.lr = config.init_lr

        # misc params
        self.use_gpu = config.use_gpu
        self.best = config.best
        self.ckpt_dir = config.ckpt_dir
        self.logs_dir = config.logs_dir
        self.best_valid_acc = 0.
        self.counter = 0
        self.patience = config.patience
        self.use_tensorboard = config.use_tensorboard
        self.resume = config.resume
        self.print_freq = config.print_freq
        self.plot_freq = config.plot_freq
        self.model_name = 'ram_{}_{}x{}_{}'.format(config.num_glimpses,
                                                   config.patch_size,
                                                   config.patch_size,
                                                   config.glimpse_scale)

        self.plot_dir = './plots/' + self.model_name + '/'
        if not os.path.exists(self.plot_dir):
            os.makedirs(self.plot_dir)

        # configure tensorboard logging
        if self.use_tensorboard:
            tensorboard_dir = self.logs_dir + self.model_name
            print('[*] Saving tensorboard logs to {}'.format(tensorboard_dir))
            if not os.path.exists(tensorboard_dir):
                os.makedirs(tensorboard_dir)
            configure(tensorboard_dir)

        # build RAM model
        self.model = RecurrentAttention(
            self.patch_size,
            self.num_patches,
            self.glimpse_scale,
            self.num_channels,
            self.loc_hidden,
            self.glimpse_hidden,
            self.std,
            self.hidden_size,
            self.num_classes,
        )
        if self.use_gpu:
            self.model.cuda()

        print('[*] Number of model parameters: {:,}'.format(
            sum([p.data.nelement() for p in self.model.parameters()])))

        # initialize optimizer and scheduler
        self.optimizer = SGD(
            self.model.parameters(),
            lr=self.lr,
            momentum=self.momentum,
        )
        self.scheduler = ReduceLROnPlateau(self.optimizer,
                                           'min',
                                           patience=self.patience)

    def reset(self):
        """
        Initialize the hidden state of the core network
        and the location vector.

        This is called once every time a new minibatch
        `x` is introduced.
        """
        dtype = torch.cuda.FloatTensor if self.use_gpu else torch.FloatTensor

        h_t = torch.zeros(self.batch_size, self.hidden_size)
        h_t = Variable(h_t).type(dtype)

        l_t = torch.Tensor(self.batch_size, 2).uniform_(-1, 1)
        l_t = Variable(l_t).type(dtype)

        return h_t, l_t

    def train(self):
        """
        Train the model on the training set.

        A checkpoint of the model is saved after each epoch
        and if the validation accuracy is improved upon,
        a separate ckpt is created for use on the test set.
        """
        # load the most recent checkpoint
        if self.resume:
            self.load_checkpoint(best=False)

        print("\n[*] Train on {} samples, validate on {} samples".format(
            self.num_train, self.num_valid))

        for epoch in range(self.start_epoch, self.epochs):

            print('\nEpoch: {}/{} - LR: {:.6f}'.format(epoch + 1, self.epochs,
                                                       self.lr))

            # train for 1 epoch
            train_loss, train_acc = self.train_one_epoch(epoch)

            # evaluate on validation set
            valid_loss, valid_acc = self.validate(epoch)

            # reduce lr if validation loss plateaus
            self.scheduler.step(valid_loss)

            is_best = valid_acc > self.best_valid_acc
            msg1 = "train loss: {:.3f} - train acc: {:.3f} "
            msg2 = "- val loss: {:.3f} - val acc: {:.3f}"
            if is_best:
                msg2 += " [*]"
            msg = msg1 + msg2
            print(msg.format(train_loss, train_acc, valid_loss, valid_acc))

            # check for improvement
            if not is_best:
                self.counter += 1
            if self.counter > self.patience:
                print("[!] No improvement in a while, stopping training.")
                return
            self.best_valid_acc = max(valid_acc, self.best_valid_acc)
            self.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model_state': self.model.state_dict(),
                    'optim_state': self.optimizer.state_dict(),
                    'best_valid_acc': self.best_valid_acc,
                }, is_best)

    def train_one_epoch(self, epoch):
        """
        Train the model for 1 epoch of the training set.

        An epoch corresponds to one full pass through the entire
        training set in successive mini-batches.

        This is used by train() and should not be called manually.
        """
        batch_time = AverageMeter()
        losses = AverageMeter()
        accs = AverageMeter()

        tic = time.time()
        with tqdm(total=self.num_train) as pbar:
            for i, (x, y) in enumerate(self.train_loader):
                if self.use_gpu:
                    x, y = x.cuda(), y.cuda()
                x, y = Variable(x), Variable(y)

                plot = False
                if (epoch % self.plot_freq == 0) and (i == 0):
                    plot = True

                # initialize location vector and hidden state
                self.batch_size = x.shape[0]
                h_t, l_t = self.reset()

                # save images
                imgs = []
                imgs.append(x[0:9])

                # extract the glimpses
                locs = []
                log_pi = []
                baselines = []
                for t in range(self.num_glimpses - 1):
                    # forward pass through model
                    h_t, l_t, b_t, p = self.model(x, l_t, h_t)

                    # store
                    locs.append(l_t[0:9])
                    baselines.append(b_t)
                    log_pi.append(p)

                # last iteration
                h_t, l_t, b_t, log_probas, p = self.model(x,
                                                          l_t,
                                                          h_t,
                                                          last=True)
                log_pi.append(p)
                baselines.append(b_t)
                locs.append(l_t[0:9])

                # convert list to tensors and reshape
                baselines = torch.stack(baselines).transpose(1, 0)
                log_pi = torch.stack(log_pi).transpose(1, 0)

                # calculate reward
                predicted = torch.max(log_probas, 1)[1]
                R = (predicted.detach() == y).float()
                R = R.unsqueeze(1).repeat(1, self.num_glimpses)

                # compute losses for differentiable modules
                loss_action = F.nll_loss(log_probas, y)
                loss_baseline = F.mse_loss(baselines, R)

                # compute reinforce loss
                adjusted_reward = R - baselines.detach()
                loss_reinforce = torch.mean(-log_pi * adjusted_reward)

                # sum up into a hybrid loss
                loss = loss_action + loss_baseline + loss_reinforce

                # compute accuracy
                correct = (predicted == y).float()
                acc = 100 * (correct.sum() / len(y))

                # store
                losses.update(loss.data[0], x.size()[0])
                accs.update(acc.data[0], x.size()[0])

                # compute gradients and update SGD
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                # measure elapsed time
                toc = time.time()
                batch_time.update(toc - tic)

                pbar.set_description(
                    ("{:.1f}s - loss: {:.3f} - acc: {:.3f}".format(
                        (toc - tic), loss.data[0], acc.data[0])))
                pbar.update(self.batch_size)

                # dump the glimpses and locs
                if plot:
                    if self.use_gpu:
                        imgs = [g.cpu().data.numpy().squeeze() for g in imgs]
                        locs = [l.cpu().data.numpy() for l in locs]
                    else:
                        imgs = [g.data.numpy().squeeze() for g in imgs]
                        locs = [l.data.numpy() for l in locs]
                    pickle.dump(
                        imgs,
                        open(self.plot_dir + "g_{}.p".format(epoch + 1), "wb"))
                    pickle.dump(
                        locs,
                        open(self.plot_dir + "l_{}.p".format(epoch + 1), "wb"))

                # log to tensorboard
                if self.use_tensorboard:
                    iteration = epoch * len(self.train_loader) + i
                    log_value('train_loss', losses.avg, iteration)
                    log_value('train_acc', accs.avg, iteration)

            return losses.avg, accs.avg

    def validate(self, epoch):
        """
        Evaluate the model on the validation set.
        """
        losses = AverageMeter()
        accs = AverageMeter()

        for i, (x, y) in enumerate(self.valid_loader):
            if self.use_gpu:
                x, y = x.cuda(), y.cuda()
            x, y = Variable(x), Variable(y)

            # duplicate 10 times
            x = x.repeat(self.M, 1, 1, 1)

            # initialize location vector and hidden state
            self.batch_size = x.shape[0]
            h_t, l_t = self.reset()

            # extract the glimpses
            log_pi = []
            baselines = []
            for t in range(self.num_glimpses - 1):
                # forward pass through model
                h_t, l_t, b_t, p = self.model(x, l_t, h_t)

                # store
                baselines.append(b_t)
                log_pi.append(p)

            # last iteration
            h_t, l_t, b_t, log_probas, p = self.model(x, l_t, h_t, last=True)
            log_pi.append(p)
            baselines.append(b_t)

            # convert list to tensors and reshape
            baselines = torch.stack(baselines).transpose(1, 0)
            log_pi = torch.stack(log_pi).transpose(1, 0)

            # average
            log_probas = log_probas.view(self.M, -1, log_probas.shape[-1])
            log_probas = torch.mean(log_probas, dim=0)

            baselines = baselines.contiguous().view(self.M, -1,
                                                    baselines.shape[-1])
            baselines = torch.mean(baselines, dim=0)

            log_pi = log_pi.contiguous().view(self.M, -1, log_pi.shape[-1])
            log_pi = torch.mean(log_pi, dim=0)

            # calculate reward
            predicted = torch.max(log_probas, 1)[1]
            R = (predicted.detach() == y).float()
            R = R.unsqueeze(1).repeat(1, self.num_glimpses)

            # compute losses for differentiable modules
            loss_action = F.nll_loss(log_probas, y)
            loss_baseline = F.mse_loss(baselines, R)

            # compute reinforce loss
            adjusted_reward = R - baselines.detach()
            loss_reinforce = torch.mean(-log_pi * adjusted_reward)

            # sum up into a hybrid loss
            loss = loss_action + loss_baseline + loss_reinforce

            # compute accuracy
            correct = (predicted == y).float()
            acc = 100 * (correct.sum() / len(y))

            # store
            losses.update(loss.data[0], x.size()[0])
            accs.update(acc.data[0], x.size()[0])

            # log to tensorboard
            if self.use_tensorboard:
                iteration = epoch * len(self.valid_loader) + i
                log_value('valid_loss', losses.avg, iteration)
                log_value('valid_acc', accs.avg, iteration)

        return losses.avg, accs.avg

    def test(self):
        """
        Test the model on the held-out test data.
        This function should only be called at the very
        end once the model has finished training.
        """
        correct = 0

        # load the best checkpoint
        self.load_checkpoint(best=self.best)

        for i, (x, y) in enumerate(self.test_loader):
            if self.use_gpu:
                x, y = x.cuda(), y.cuda()
            x, y = Variable(x, volatile=True), Variable(y)

            # duplicate 10 times
            x = x.repeat(self.M, 1, 1, 1)

            # initialize location vector and hidden state
            self.batch_size = x.shape[0]
            h_t, l_t = self.reset()

            # extract the glimpses
            for t in range(self.num_glimpses - 1):
                # forward pass through model
                h_t, l_t, b_t, p = self.model(x, l_t, h_t)

            # last iteration
            h_t, l_t, b_t, log_probas, p = self.model(x, l_t, h_t, last=True)

            log_probas = log_probas.view(self.M, -1, log_probas.shape[-1])
            log_probas = torch.mean(log_probas, dim=0)

            pred = log_probas.data.max(1, keepdim=True)[1]
            correct += pred.eq(y.data.view_as(pred)).cpu().sum()

        perc = (100. * correct) / (self.num_test)
        print('[*] Test Acc: {}/{} ({:.2f}%)'.format(correct, self.num_test,
                                                     perc))

    def save_checkpoint(self, state, is_best):
        """
        Save a copy of the model so that it can be loaded at a future
        date. This function is used when the model is being evaluated
        on the test data.

        If this model has reached the best validation accuracy thus
        far, a seperate file with the suffix `best` is created.
        """
        # print("[*] Saving model to {}".format(self.ckpt_dir))

        filename = self.model_name + '_ckpt.pth.tar'
        ckpt_path = os.path.join(self.ckpt_dir, filename)
        torch.save(state, ckpt_path)

        if is_best:
            filename = self.model_name + '_model_best.pth.tar'
            shutil.copyfile(ckpt_path, os.path.join(self.ckpt_dir, filename))

    def load_checkpoint(self, best=False):
        """
        Load the best copy of a model. This is useful for 2 cases:

        - Resuming training with the most recent model checkpoint.
        - Loading the best validation model to evaluate on the test data.

        Params
        ------
        - best: if set to True, loads the best model. Use this if you want
          to evaluate your model on the test data. Else, set to False in
          which case the most recent version of the checkpoint is used.
        """
        print("[*] Loading model from {}".format(self.ckpt_dir))

        filename = self.model_name + '_ckpt.pth.tar'
        if best:
            filename = self.model_name + '_model_best.pth.tar'
        ckpt_path = os.path.join(self.ckpt_dir, filename)
        ckpt = torch.load(ckpt_path)

        # load variables from checkpoint
        self.start_epoch = ckpt['epoch']
        self.best_valid_acc = ckpt['best_valid_acc']
        self.model.load_state_dict(ckpt['model_state'])
        self.optimizer.load_state_dict(ckpt['optim_state'])

        if best:
            print("[*] Loaded {} checkpoint @ epoch {} "
                  "with best valid acc of {:.3f}".format(
                      filename, ckpt['epoch'] + 1, ckpt['best_valid_acc']))
        else:
            print("[*] Loaded {} checkpoint @ epoch {}".format(
                filename, ckpt['epoch'] + 1))
Beispiel #29
0
def main():
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional RNN-CNN')
    parser.add_argument('--mode',
                        choices=['RNN', 'LSTM', 'GRU'],
                        help='architecture of rnn',
                        required=True)
    parser.add_argument('--num_epochs',
                        type=int,
                        default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=16,
                        help='Number of sentences in each batch')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=128,
                        help='Number of hidden units in RNN')
    parser.add_argument('--tag_space',
                        type=int,
                        default=0,
                        help='Dimension of tag space')
    parser.add_argument('--num_layers',
                        type=int,
                        default=1,
                        help='Number of layers of RNN')
    parser.add_argument('--num_filters',
                        type=int,
                        default=30,
                        help='Number of filters in CNN')
    parser.add_argument('--char_dim',
                        type=int,
                        default=30,
                        help='Dimension of Character embeddings')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help='Learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.0,
                        help='weight for regularization')
    parser.add_argument('--dropout',
                        choices=['std', 'variational'],
                        help='type of dropout',
                        required=True)
    parser.add_argument('--p_rnn',
                        nargs=2,
                        type=float,
                        required=True,
                        help='dropout rate for RNN')
    parser.add_argument('--p_in',
                        type=float,
                        default=0.33,
                        help='dropout rate for input embeddings')
    parser.add_argument('--p_out',
                        type=float,
                        default=0.33,
                        help='dropout rate for output layer')
    parser.add_argument('--schedule',
                        type=int,
                        help='schedule for learning rate decay')
    parser.add_argument('--unk_replace',
                        type=float,
                        default=0.,
                        help='The rate to replace a singleton word with UNK')
    parser.add_argument('--embedding',
                        choices=['glove', 'senna', 'sskip', 'polyglot'],
                        help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument(
        '--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument(
        '--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument(
        '--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    logger = get_logger("NER")

    mode = args.mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    gamma = args.gamma
    schedule = args.schedule
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_out = args.p_out
    unk_replace = args.unk_replace
    embedding = args.embedding
    embedding_path = args.embedding_dict

    embedd_dict, embedd_dim = utils.load_embedding_dict(
        embedding, embedding_path)

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, pos_alphabet, \
    chunk_alphabet, ner_alphabet = conll03_data.create_alphabets("data/alphabets/ner/", train_path, data_paths=[dev_path, test_path],
                                                                 embedd_dict=embedd_dict, max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    logger.info("Reading Data")
    use_gpu = torch.cuda.is_available()

    data_train = conll03_data.read_data_to_variable(train_path,
                                                    word_alphabet,
                                                    char_alphabet,
                                                    pos_alphabet,
                                                    chunk_alphabet,
                                                    ner_alphabet,
                                                    use_gpu=use_gpu)
    num_data = sum(data_train[1])
    num_labels = ner_alphabet.size()

    data_dev = conll03_data.read_data_to_variable(dev_path,
                                                  word_alphabet,
                                                  char_alphabet,
                                                  pos_alphabet,
                                                  chunk_alphabet,
                                                  ner_alphabet,
                                                  use_gpu=use_gpu,
                                                  volatile=True)
    data_test = conll03_data.read_data_to_variable(test_path,
                                                   word_alphabet,
                                                   char_alphabet,
                                                   pos_alphabet,
                                                   chunk_alphabet,
                                                   ner_alphabet,
                                                   use_gpu=use_gpu,
                                                   volatile=True)

    writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet,
                           chunk_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conll03_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in list(word_alphabet.items()):
            if word in embedd_dict:
                embedding = embedd_dict[word]
            elif word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(
                    -scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    char_dim = args.char_dim
    window = 3
    num_layers = args.num_layers
    tag_space = args.tag_space
    initializer = nn.init.xavier_uniform
    if args.dropout == 'std':
        network = BiRecurrentConv(embedd_dim,
                                  word_alphabet.size(),
                                  char_dim,
                                  char_alphabet.size(),
                                  num_filters,
                                  window,
                                  mode,
                                  hidden_size,
                                  num_layers,
                                  num_labels,
                                  tag_space=tag_space,
                                  embedd_word=word_table,
                                  p_in=p_in,
                                  p_out=p_out,
                                  p_rnn=p_rnn,
                                  initializer=initializer)
    else:
        network = BiVarRecurrentConv(embedd_dim,
                                     word_alphabet.size(),
                                     char_dim,
                                     char_alphabet.size(),
                                     num_filters,
                                     window,
                                     mode,
                                     hidden_size,
                                     num_layers,
                                     num_labels,
                                     tag_space=tag_space,
                                     embedd_word=word_table,
                                     p_in=p_in,
                                     p_out=p_out,
                                     p_rnn=p_rnn,
                                     initializer=initializer)
    if use_gpu:
        network.cuda()

    lr = learning_rate
    # optim = Adam(network.parameters(), lr=lr, betas=(0.9, 0.9), weight_decay=gamma)
    optim = SGD(network.parameters(),
                lr=lr,
                momentum=momentum,
                weight_decay=gamma,
                nesterov=True)
    logger.info(
        "Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d" %
        (mode, num_layers, hidden_size, num_filters, tag_space))
    logger.info(
        "training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)"
        % (gamma, num_data, batch_size, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" %
                (p_in, p_out, p_rnn))

    num_batches = num_data / batch_size + 1
    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    for epoch in range(1, num_epochs + 1):
        print(
            'Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): '
            % (epoch, mode, args.dropout, lr, decay_rate, schedule))
        train_err = 0.
        train_corr = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            word, char, _, _, labels, masks, lengths = conll03_data.get_batch_variable(
                data_train, batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss, corr, _ = network.loss(
                word,
                char,
                labels,
                mask=masks,
                length=lengths,
                leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
            loss.backward()
            optim.step()

            num_tokens = masks.data.sum()
            train_err += loss.data[0] * num_tokens
            train_corr += corr.data[0]
            train_total += num_tokens

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            if batch % 100 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                    batch, num_batches, train_err / train_total,
                    train_corr * 100 / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, acc: %.2f%%, time: %.2fs' %
              (num_batches, train_err / train_total,
               train_corr * 100 / train_total, time.time() - start_time))

        # evaluate performance on dev data
        network.eval()
        tmp_filename = 'tmp/%s_dev%d' % (str(uid), epoch)
        writer.start(tmp_filename)

        for batch in conll03_data.iterate_batch_variable(data_dev, batch_size):
            word, char, pos, chunk, labels, masks, lengths = batch
            _, _, preds = network.loss(
                word,
                char,
                labels,
                mask=masks,
                length=lengths,
                leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
            writer.write(word.data.cpu().numpy(),
                         pos.data.cpu().numpy(),
                         chunk.data.cpu().numpy(),
                         preds.data.cpu().numpy(),
                         labels.data.cpu().numpy(),
                         lengths.cpu().numpy())
        writer.close()
        acc, precision, recall, f1 = evaluate(tmp_filename)
        print(
            'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' %
            (acc, precision, recall, f1))

        if dev_f1 < f1:
            dev_f1 = f1
            dev_acc = acc
            dev_precision = precision
            dev_recall = recall
            best_epoch = epoch

            # evaluate on test data when better performance detected
            tmp_filename = 'tmp/%s_test%d' % (str(uid), epoch)
            writer.start(tmp_filename)

            for batch in conll03_data.iterate_batch_variable(
                    data_test, batch_size):
                word, char, pos, chunk, labels, masks, lengths = batch
                _, _, preds = network.loss(
                    word,
                    char,
                    labels,
                    mask=masks,
                    length=lengths,
                    leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
                writer.write(word.data.cpu().numpy(),
                             pos.data.cpu().numpy(),
                             chunk.data.cpu().numpy(),
                             preds.data.cpu().numpy(),
                             labels.data.cpu().numpy(),
                             lengths.cpu().numpy())
            writer.close()
            test_acc, test_precision, test_recall, test_f1 = evaluate(
                tmp_filename)

        print(
            "best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
            % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
        print(
            "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
            % (test_acc, test_precision, test_recall, test_f1, best_epoch))

        if epoch % schedule == 0:
            lr = learning_rate / (1.0 + epoch * decay_rate)
            optim = SGD(network.parameters(),
                        lr=lr,
                        momentum=momentum,
                        weight_decay=gamma,
                        nesterov=True)
Beispiel #30
0
def train(args):
    # srun -p gpu --gres=gpu:1 python main_dsh.py
    sketch_folder, imsk_folder, im_folder, path_semantic, train_class, test_class = _parse_args_paths(
        args)
    logger = make_logger(join(mkdir(args.save_dir), curr_time_str() + '.log'))
    if DEBUG:
        train_class = train_class[:2]
        test_class = test_class[:2]
        args.print_every = 2
        args.save_every = 8
        args.steps = 20
        args.batch_size = 2
        args.npy_dir = NPY_FOLDER_SKETCHY

    # logger.info("try loading data_train")
    data_train = DSH_dataloader(folder_sk=sketch_folder,
                                folder_im=im_folder,
                                clss=train_class,
                                folder_nps=args.npy_dir,
                                folder_imsk=imsk_folder,
                                normalize01=False,
                                doaug=False,
                                m=args.m,
                                path_semantic=path_semantic,
                                folder_saving=join(mkdir(args.save_dir),
                                                   'train_saving'),
                                logger=logger)
    dataloader_train = DataLoader(dataset=data_train,
                                  batch_size=args.batch_size,
                                  shuffle=False)
    # logger.info("try loading data_test")
    data_test = DSH_dataloader(folder_sk=sketch_folder,
                               clss=test_class,
                               folder_nps=args.npy_dir,
                               path_semantic=path_semantic,
                               folder_imsk=imsk_folder,
                               normalize01=False,
                               doaug=False,
                               m=args.m,
                               folder_saving=join(mkdir(args.save_dir),
                                                  'test_saving'),
                               logger=logger)

    model = DSH(m=args.m, config=args.config)
    model.cuda()

    optimizer = SGD(params=model.parameters(), lr=args.lr, momentum=0.9)

    # logger.info("optimizer inited")
    steps = _try_load(args, logger, model, optimizer)
    logger.info(str(args))
    args.steps += steps
    dsh_loss = _DSH_loss(gamma=args.gamma)
    model.train()
    l2_regularization = _Regularization(model, args.l2_reg, p=2, logger=None)
    loss_sum = []
    # logger.info("iterations")
    # iterations
    while True:
        # logger.info("update D")
        # 1. update D
        data_train.D = update_D(bi=data_train.BI,
                                bs=data_train.BS,
                                vec_bi=data_train.vec_bi,
                                vec_bs=data_train.vec_bs)
        # logger.info("update BI/BS")
        # 2. update BI/BS
        feats_labels_sk, feats_labels_im = _extract_feats_sk_im(
            data=data_train, model=model, batch_size=args.batch_size)

        data_train.BI, data_train.BS = update_B(bi=data_train.BI,
                                                bs=data_train.BS,
                                                vec_bi=data_train.vec_bi,
                                                vec_bs=data_train.vec_bs,
                                                W=data_train.W,
                                                D=data_train.D,
                                                Fi=feats_labels_im[0],
                                                Fs=feats_labels_sk[0],
                                                lamb=args.lamb,
                                                gamma=args.gamma)
        # logger.info("update network parameters")
        # 3. update network parameters
        for _, (sketch, code_of_sketch, image, sketch_token,
                code_of_image) in enumerate(dataloader_train):

            sketch_feats, im_feats = model(sketch.cuda(), sketch_token.cuda(),
                                           image.cuda())
            loss = dsh_loss(sketch_feats, im_feats, code_of_sketch.cuda(), code_of_image.cuda()) \
                    + l2_regularization()
            loss = loss / args.update_every
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            if (steps + 1) % args.update_every == 0:
                optimizer.step()
                optimizer.zero_grad()
            loss_sum.append(float(loss.item() * args.update_every))
            if (steps + 1) % args.save_every == 0:
                _test_and_save(steps=steps,
                               optimizer=optimizer,
                               data_test=data_test,
                               model=model,
                               logger=logger,
                               args=args,
                               loss_sum=loss_sum)
                data_train.save_params()

            if (steps + 1) % args.print_every == 0:
                loss_sum = [np.mean(loss_sum)]
                logger.info('step: {},  loss: {}'.format(steps, loss_sum[0]))

            steps += 1
            if steps >= args.steps: break
        dr_dec(optimizer=optimizer, args=args)
        if steps >= args.steps: break
Beispiel #31
0
                       nesterov=True)
    psi_optimizer = PsiSGD(psis,
                           lr=0.1,
                           momentum=0.9,
                           weight_decay=2e-4,
                           nesterov=True,
                           num_data=50000)

    for epoch in range(args.epochs):
        bayesian_net.train()
        for i, (input, target) in enumerate(train_loader):
            input = input.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            output = bayesian_net(input)
            loss = torch.nn.functional.cross_entropy(output, target)

            mu_optimizer.zero_grad()
            psi_optimizer.zero_grad()
            loss.backward()
            mu_optimizer.step()
            psi_optimizer.step()

            if i % 100 == 0:
                print("Epoch {}, ite {}/{}, loss {}".format(
                    epoch, i, len(train_loader), loss.item()))

        eval_loss, eval_acc = Bayes_ensemble(test_loader, bayesian_net)
        print("Epoch {}, eval loss {}, eval acc {}".format(
            epoch, eval_loss, eval_acc))
                        batch_loader.chars_vocab_size)

    neg_loss = NEG_loss(params.word_vocab_size, params.word_embed_size)
    if args.use_cuda:
        neg_loss = neg_loss.cuda()

    # NEG_loss is defined over two embedding matrixes with shape of [params.word_vocab_size, params.word_embed_size]
    optimizer = SGD(neg_loss.parameters(), 0.1)

    for iteration in range(args.num_iterations):

        input_idx, target_idx = batch_loader.next_embedding_seq(args.batch_size)

        input = Variable(t.from_numpy(input_idx).long())
        target = Variable(t.from_numpy(target_idx).long())
        if args.use_cuda:
            input, target = input.cuda(), target.cuda()

        out = neg_loss(input, target, args.num_sample).mean()

        optimizer.zero_grad()
        out.backward()
        optimizer.step()

        if iteration % 500 == 0:
            out = out.cpu().data.numpy()[0]
            print('iteration = {}, loss = {}'.format(iteration, out))

    word_embeddings = neg_loss.input_embeddings()
    np.save('data/word_embeddings.npy', word_embeddings)