Example #1
def train_ensamble():
    """Train function for the weighted ensemble."""

    X_train, y_train, X_test, y_test = load_train_data(
        preds_list=p_list, mode='train')
    train_dataset = MyDataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
        sampler=None)
    val_dataset = MyDataset(X_test, y_test)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=True)

    print(f'[+] training with {len(train_dataset)} samples, '
          f'validation with {len(val_dataset)} samples')

    model = WeightedEnsambleModel(num_classes, len(p_list))
    criterion = torch.nn.CrossEntropyLoss().cuda()

    EPOCHS = 100
    min_loss = float("inf")
    lr = 0.001
    patience = 0

    for epoch in range(EPOCHS):
        print(f'[+] epoch {epoch}')

        if patience == 3:
            patience = 0
            model.load_state_dict(torch.load(best_checkpoint_file))
            lr /= 3
            print(f'[+] set lr={lr}')

        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

        # train for one epoch
        utils.train_one_epoch(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set after one epoch
        log_loss = utils.validate(val_loader, model, criterion)

        if log_loss < min_loss:
            torch.save(model.state_dict(), best_checkpoint_file)
            print(f'[+] lr = {lr}, val loss improved from {min_loss:.5f} to {log_loss:.5f}. Saved!')
            min_loss = log_loss
            patience = 0
        else:
            patience += 1

    print(f'[*] training done with {EPOCHS} epochs')
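
Example #1 only calls into utils.train_one_epoch and utils.validate; neither helper is shown. Below is a minimal sketch of what that pair might look like, assuming the loaders yield (inputs, targets) batches, the model outputs raw logits, and validate returns the mean cross-entropy (the log loss compared against min_loss above). The bodies are an assumption, not the original utils module.

import torch

def train_one_epoch(train_loader, model, criterion, optimizer, epoch):
    # one optimization pass over the training loader
    device = next(model.parameters()).device
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()

def validate(val_loader, model, criterion):
    # mean validation loss; with CrossEntropyLoss this is the log loss
    device = next(model.parameters()).device
    model.eval()
    total_loss, total_count = 0.0, 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            total_loss += criterion(model(inputs), targets).item() * targets.size(0)
            total_count += targets.size(0)
    return total_loss / total_count
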
Example #2
def main(args):
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    print("load dataset")
    num_classes = 2
    data = HandDataset(args.data_path, utils.get_transform(train=True))

    indices = torch.randperm(len(data)).tolist()
    test_cnt = int(len(data) / 10)
    dataset = torch.utils.data.Subset(data, indices[:-test_cnt])
    dataset_test = torch.utils.data.Subset(data, indices[-test_cnt:])

    # define the training and validation data loaders
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=2,
        shuffle=True,
        num_workers=4,
        collate_fn=lambda x: tuple(zip(*x)))

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test,
        batch_size=1,
        shuffle=False,
        num_workers=4,
        collate_fn=lambda x: tuple(zip(*x)))

    print("load model")
    model = MaskRcnn.get_pretrained_resnet50_model(num_classes)

    model.to(device)

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)
    print("begin train")
    num_epochs = 10

    for epoch in range(num_epochs):
        utils.train_one_epoch(model,
                              optimizer,
                              data_loader,
                              device,
                              epoch,
                              print_freq=10)
        lr_scheduler.step()
        # evaluate(model, data_loader_test, device=device)

    print("That's it!")
Example #3
def train(lr=0.1,
          batch_size=64,
          max_epoch=700,
          rs=7,
          save=False,
          title='0',
          outdir='fig3',
          resume_model=None,
          resume_epoch=0,
          half_dataset=False):

    #experiment_dir = outdir
    experiment_dir = os.path.join('exp', title,
                                  datetime.now().strftime('%b%d_%H-%M-%S'))
    os.makedirs(experiment_dir, exist_ok=True)
    # Set the seed
    torch.manual_seed(rs)
    np.random.seed(rs)

    if torch.cuda.is_available():
        device = torch.device('cuda:0')
        #print("CUDA Recognized")
    else:
        device = torch.device('cpu')

    model = models.get_model('resnet18').to(device)

    if resume_model is not None:
        model = resume_model

    loaders = datasets.get_dataset(
        'first_half_cifar10' if half_dataset else 'cifar10',
        batch_size=batch_size)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    start = time.time()

    for epoch in range(resume_epoch, max_epoch + 1):
        print(f"Epoch {epoch}")
        train_loss, train_accuracy = train_one_epoch(device, model, optimizer,
                                                     criterion,
                                                     loaders["train_loader"])
        print("Train accuracy: {} Train loss: {}".format(
            train_accuracy, train_loss))
        test_accuracy = 0

        if not half_dataset:
            test_loss, test_accuracy = eval_on_dataloader(
                device, criterion, model, loaders['test_loader'])
            print("Test accuracy: {} Test loss: {}".format(
                test_accuracy, test_loss))

        if train_accuracy > 0.99:
            cost = time.time() - start
            return train_accuracy, test_accuracy, cost, model

    return 0, 0, 0, None
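
eval_on_dataloader is assumed by this function but not defined here. A plausible sketch returning (mean loss, accuracy), which is how its return values are unpacked above; this is an assumed implementation, not the project's own helper.

import torch

def eval_on_dataloader(device, criterion, model, loader):
    # return (mean loss, accuracy) of the model over one dataloader
    model.eval()
    total_loss, correct, count = 0.0, 0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            logits = model(inputs)
            total_loss += criterion(logits, targets).item() * targets.size(0)
            correct += (logits.argmax(dim=1) == targets).sum().item()
            count += targets.size(0)
    return total_loss / count, correct / count
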
Example #4
def main():

    args = parse_args()
    path = args.path
    dataset = args.dataset
    layers = eval(args.layers)
    weight_decay = args.weight_decay
    num_negatives_train = args.num_neg_train
    num_negatives_test = args.num_neg_test
    dropout = args.dropout
    learner = args.learner
    learning_rate = args.lr
    batch_size = args.batch_size
    epochs = args.epochs
    verbose = args.verbose

    topK = 10
    print("MLP arguments: %s " % (args))
    # model_out_file = 'Pretrain/%s_MLP_%s_%d.h5' %(args.dataset, args.layers, time())

    # Load data

    t1 = time()
    full_dataset = MovieDataset(
        path + dataset, num_negatives_train=num_negatives_train, num_negatives_test=num_negatives_test)
    train, testRatings, testNegatives = full_dataset.trainMatrix, full_dataset.testRatings, full_dataset.testNegatives
    num_users, num_items = train.shape
    print("Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d"
          % (time()-t1, num_users, num_items, train.nnz, len(testRatings)))

    training_data_generator = DataLoader(
        full_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

    # Build model
    model = MLP(num_users, num_items, layers=layers, dropout=dropout)
    # Transfer the model to GPU, if one is available
    model.to(device)
    if verbose:
        print(model)

    loss_fn = torch.nn.BCELoss()
    # Use Adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=weight_decay)

    # Record performance
    hr_list = []
    ndcg_list = []
    BCE_loss_list = []

    # Check Init performance
    hr, ndcg = test(model, full_dataset, topK)
    hr_list.append(hr)
    ndcg_list.append(ndcg)
    BCE_loss_list.append(1)
    # do the epochs now

    for epoch in range(epochs):
        epoch_loss = train_one_epoch( model, training_data_generator, loss_fn, optimizer, epoch, device)

        if epoch % verbose == 0:
            hr, ndcg = test(model, full_dataset, topK)
            hr_list.append(hr)
            ndcg_list.append(ndcg)
            BCE_loss_list.append(epoch_loss)
            # if hr > best_hr:
            #     best_hr, best_ndcg, best_iter = hr, ndcg, epoch
            #     if args.out > 0:
            #         model.save(model_out_file, overwrite=True)
    print("hr for epochs: ", hr_list)
    print("ndcg for epochs: ", ndcg_list)
    print("loss for epochs: ", BCE_loss_list)
    

    best_iter = np.argmax(np.array(hr_list))
    best_hr = hr_list[best_iter]
    best_ndcg = ndcg_list[best_iter]
    print("End. Best Iteration %d:  HR = %.4f, NDCG = %.4f. " %
          (best_iter, best_hr, best_ndcg))
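
The test helper that produces the HR and NDCG numbers is not shown. For reference, the two metrics for a single test user (one held-out positive item ranked against sampled negatives) reduce to the sketch below; averaging them over all users gives the values appended to hr_list and ndcg_list. The ranking setup is an assumption about how test works, not its actual code.

import math

def hit_and_ndcg(ranked_items, positive_item, top_k=10):
    # HR@K: 1 if the positive item appears in the top-K ranking, else 0.
    # NDCG@K: the hit discounted by the log of its rank position
    # (the ideal DCG is 1 because there is a single relevant item).
    top = ranked_items[:top_k]
    if positive_item not in top:
        return 0.0, 0.0
    rank = top.index(positive_item)  # 0-based position in the ranking
    return 1.0, 1.0 / math.log2(rank + 2)
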
Example #5
def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")

    if os.path.exists("./weights") is False:
        os.makedirs("./weights")

    tb_writer = SummaryWriter()

    train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(
        args.data_path)

    img_size = 224
    data_transform = {
        "train":
        transforms.Compose([
            transforms.RandomResizedCrop(img_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        "val":
        transforms.Compose([
            transforms.Resize(int(img_size * 1.143)),
            transforms.CenterCrop(img_size),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    }

    # instantiate the training dataset
    train_dataset = MyDataSet(images_path=train_images_path,
                              images_class=train_images_label,
                              transform=data_transform["train"])

    # instantiate the validation dataset
    val_dataset = MyDataSet(images_path=val_images_path,
                            images_class=val_images_label,
                            transform=data_transform["val"])

    batch_size = args.batch_size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    print('Using {} dataloader workers per process'.format(nw))
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=nw,
        collate_fn=train_dataset.collate_fn)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=nw,
                                             collate_fn=val_dataset.collate_fn)

    model = create_model(num_classes=args.num_classes).to(device)

    if args.weights != "":
        assert os.path.exists(
            args.weights), "weights file: '{}' does not exist.".format(args.weights)
        weights_dict = torch.load(args.weights, map_location=device)["model"]
        # remove the classification-head weights
        for k in list(weights_dict.keys()):
            if "head" in k:
                del weights_dict[k]
        print(model.load_state_dict(weights_dict, strict=False))

    if args.freeze_layers:
        for name, para in model.named_parameters():
            # freeze all weights except the head
            if "head" not in name:
                para.requires_grad_(False)
            else:
                print("training {}".format(name))

    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.AdamW(pg, lr=args.lr, weight_decay=5E-2)

    for epoch in range(args.epochs):
        # train
        train_loss, train_acc = train_one_epoch(model=model,
                                                optimizer=optimizer,
                                                data_loader=train_loader,
                                                device=device,
                                                epoch=epoch)

        # validate
        val_loss, val_acc = evaluate(model=model,
                                     data_loader=val_loader,
                                     device=device,
                                     epoch=epoch)

        tags = [
            "train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"
        ]
        tb_writer.add_scalar(tags[0], train_loss, epoch)
        tb_writer.add_scalar(tags[1], train_acc, epoch)
        tb_writer.add_scalar(tags[2], val_loss, epoch)
        tb_writer.add_scalar(tags[3], val_acc, epoch)
        tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)

        torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))
Example #6
def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")

    print(args)
    print(
        'Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/'
    )
    tb_writer = SummaryWriter()
    if os.path.exists("./weights") is False:
        os.makedirs("./weights")

    train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(
        args.data_path)

    data_transform = {
        "train":
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        "val":
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    }

    # instantiate the training dataset
    train_dataset = MyDataSet(images_path=train_images_path,
                              images_class=train_images_label,
                              transform=data_transform["train"])

    # instantiate the validation dataset
    val_dataset = MyDataSet(images_path=val_images_path,
                            images_class=val_images_label,
                            transform=data_transform["val"])

    batch_size = args.batch_size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    print('Using {} dataloader workers per process'.format(nw))
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=nw,
        collate_fn=train_dataset.collate_fn)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=nw,
                                             collate_fn=val_dataset.collate_fn)

    # load pretrained weights if provided
    model = densenet121(num_classes=args.num_classes).to(device)
    if args.weights != "":
        if os.path.exists(args.weights):
            load_state_dict(model, args.weights)
        else:
            raise FileNotFoundError("weights file not found: {}".format(
                args.weights))

    # optionally freeze weights
    if args.freeze_layers:
        for name, para in model.named_parameters():
            # freeze all weights except the final fully connected layer
            if "classifier" not in name:
                para.requires_grad_(False)

    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg,
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=1E-4,
                          nesterov=True)
    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (
        1 - args.lrf) + args.lrf  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    for epoch in range(args.epochs):
        # train
        mean_loss = train_one_epoch(model=model,
                                    optimizer=optimizer,
                                    data_loader=train_loader,
                                    device=device,
                                    epoch=epoch)

        scheduler.step()

        # validate
        acc = evaluate(model=model, data_loader=val_loader, device=device)

        print("[epoch {}] accuracy: {}".format(epoch, round(acc, 3)))
        tags = ["loss", "accuracy", "learning_rate"]
        tb_writer.add_scalar(tags[0], mean_loss, epoch)
        tb_writer.add_scalar(tags[1], acc, epoch)
        tb_writer.add_scalar(tags[2], optimizer.param_groups[0]["lr"], epoch)

        torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))
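
The lf lambda above is the cosine learning-rate decay from the linked paper: it scales the base lr by a factor that starts at 1 and decays smoothly to args.lrf over args.epochs. A quick standalone check of the factor it produces, assuming for illustration 10 epochs and lrf = 0.1:

import math

epochs, lrf = 10, 0.1
lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - lrf) + lrf

for epoch in (0, 2, 5, 8, 10):
    print(epoch, round(lf(epoch), 3))
# 0 1.0, 2 0.914, 5 0.55, 8 0.186, 10 0.1: a smooth decay from 1.0 down to lrf
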
Example #7
def main(args):
    print("Running with arguments:")
    args_dict = {}
    for key in vars(args):
        if key == "default_function":
            continue
        args_dict[key] = getattr(args, key)
        print(key, ": ", args_dict[key])
    print("---")

    experiment_time = datetime.now().strftime('%b%d_%H-%M-%S')
    experiment_dir = os.path.join('exp', args.title, experiment_time)
    os.makedirs(experiment_dir)
    with open(os.path.join(experiment_dir, "config.json"), "w") as f:
        json.dump(args_dict, f, indent=4, sort_keys=True, default=lambda x: x.__name__)



    if torch.cuda.is_available():
        device = torch.device(f'cuda:{args.gpu_id}')
        print("CUDA Recognized")
    else:
        device = torch.device('cpu')

    # Training with Random Initialization 
    overal_result = {}
    init_type = "random"
    dataset_result = {}
    for (dataset_name, num_classes) in [("cifar10", 10), ("cifar100", 100), ("svhn", 10)]:
        model_args = {
            "resnet18": {"num_classes": num_classes},
            "mlp": {"input_dim": 32 * 32 * 3, "num_classes": num_classes, 'activation':'tanh', 'bias':True},
            "logistic": {"input_dim": 32 * 32 * 3, "num_classes": num_classes},
        }
        
        optimizer_result = {}
        for optimizer_name in ["adam", "sgd", "sgd-momentum"]:
            model_result = {}
            for model_name in ["mlp", "logistic", "resnet18"]:
                print(f"Training model {model_name} on {dataset_name} with {optimizer_name} optimizer.")
                torch.manual_seed(args.random_seed)
                np.random.seed(args.random_seed)
                model = models.get_model(model_name, **model_args[model_name]).to(device)
                loaders = datasets.get_dataset(dataset_name)
                criterion = torch.nn.CrossEntropyLoss()

                if optimizer_name == "adam":
                    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
                elif optimizer_name == "sgd-momentum":
                    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
                else:
                    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
            
                train_accuracies = []
                stop_indicator = False
                epoch = 0
                while(not stop_indicator):
                    if epoch % 5 == 0:
                        print(f"\t Training in epoch {epoch + 1} \t")
                    train_loss, train_accuracy = train_one_epoch(device, model, optimizer, criterion, loaders['train_loader'])
                    train_loss, train_accuracy = eval_on_dataloader(device, criterion, model, loaders['train_loader'])

                    train_accuracies.append(train_accuracy)
                    epoch += 1
                    if train_accuracy >= 0.99:
                        print("Convergence codition met. Training accuracy > 0.99")
                        stop_indicator = True
                    
                    if len(train_accuracies) >= args.convergence_epochs:
                        if np.std(train_accuracies[-args.convergence_epochs:]) < args.convergence_accuracy_change_threshold:
                            print(f"\tConvergence codition met. Training accuracy = {train_accuracy} stopped improving")
                            stop_indicator = True

                test_loss, test_accuracy =  eval_on_dataloader(device, criterion, model, loaders['test_loader'])
                print(f"\tTest accuracy = {test_accuracy}")
                model_result[model_name] = test_accuracy
                
            optimizer_result[optimizer_name] = model_result
        dataset_result[dataset_name] = optimizer_result
    overal_result[init_type] = dataset_result

    
    # Training with warm-start
    init_type = "warm-start"
    dataset_result = {}
    for (dataset_name, num_classes) in [("cifar10", 10), ("cifar100", 100), ("svhn", 10)]:
        model_args = {
            "resnet18": {"num_classes": num_classes},
            "mlp": {"input_dim": 32 * 32 * 3, "num_classes": num_classes},
            "logistic": {"input_dim": 32 * 32 * 3, "num_classes": num_classes},
        }
        
        optimizer_result = {}
        for optimizer_name in ["adam", "sgd", "sgd-momentum"]:
            model_result = {}
            for model_name in ["mlp", "logistic", "resnet18"]:
                print(f"Training model {model_name} on half of {dataset_name} with {optimizer_name} optimizer.")
                torch.manual_seed(args.random_seed)
                np.random.seed(args.random_seed)
                model = models.get_model(model_name, **model_args[model_name]).to(device)
                loaders = datasets.get_dataset(f"half_{dataset_name}")
                criterion = torch.nn.CrossEntropyLoss()

                if optimizer_name == "adam":
                    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
                elif optimizer_name == "sgd-momentum":
                    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
                else:
                    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
                    
                train_accuracies = []
                stop_indicator = False
                epoch = 0
                while(not stop_indicator):
                    if epoch % 5 == 0:
                        print(f"\tPre-training in epoch {epoch + 1}")
                    train_loss, train_accuracy = train_one_epoch(device, model, optimizer, criterion, loaders['train_loader'])
                    train_loss, train_accuracy = eval_on_dataloader(device, criterion, model, loaders['train_loader'])
                    
                    train_accuracies.append(train_accuracy)
                    epoch += 1
                    if train_accuracy >= 0.99:
                        print("Convergence codition met. Training accuracy > 0.99")
                        stop_indicator = True
                    
                    if len(train_accuracies) >= args.convergence_epochs:
                        if np.std(train_accuracies[-args.convergence_epochs:]) < args.convergence_accuracy_change_threshold:
                            print(f"\tConvergence codition met. Training accuracy = {train_accuracy} stopped improving")
                            stop_indicator = True
                            
                loaders = datasets.get_dataset(f"{dataset_name}")
                criterion = torch.nn.CrossEntropyLoss()
                train_accuracies = []
                stop_indicator = False
                epoch = 0
                while(not stop_indicator):
                    if epoch % 5 == 0:
                        print(f"\t Training in epoch {epoch + 1}")
                    train_loss, train_accuracy = train_one_epoch(device, model, optimizer, criterion, loaders['train_loader'])
                    train_loss, train_accuracy = eval_on_dataloader(device, criterion, model, loaders['train_loader'])
                    
                    train_accuracies.append(train_accuracy)
                    epoch += 1
                    if train_accuracy >= 0.99:
                        print("Convergence codition met. Training accuracy > 0.99")
                        stop_indicator = True
                    
                    if len(train_accuracies) >= args.convergence_epochs:
                        if np.std(train_accuracies[-args.convergence_epochs:]) < args.convergence_accuracy_change_threshold:
                            print(f"\tConvergence codition met. Training accuracy = {train_accuracy} stopped improving")
                            stop_indicator = True

                test_loss, test_accuracy =  eval_on_dataloader(device, criterion, model, loaders['test_loader'])
                print(f"\tTest accuracy = {test_accuracy}")
                model_result[model_name] = test_accuracy
                
            optimizer_result[optimizer_name] = model_result
        dataset_result[dataset_name] = optimizer_result
    overal_result[init_type] = dataset_result

                           
    np.save(f"tables/table1-svhn-seed{args.random_seed}.npy", overal_result)
Example #8
            if ("features.top" not in name) and ("classifier" not in name):
                para.requires_grad_(False)
            else:
                print("training {}".format(name))

    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg, lr=args.lr, momentum=0.9, weight_decay=1E-4)
    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (1 - args.lrf) + args.lrf  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    for epoch in range(args.epochs):
        # train
        mean_loss = train_one_epoch(model=model,
                                    optimizer=optimizer,
                                    data_loader=train_loader,
                                    device=device,
                                    epoch=epoch)

        scheduler.step()

        # validate
        sum_num = evaluate(model=model,
                           data_loader=val_loader,
                           device=device)
        acc = sum_num / len(val_data_set)
        print("[epoch {}] accuracy: {}".format(epoch, round(acc, 3)))
        tags = ["loss", "accuracy", "learning_rate"]
        tb_writer.add_scalar(tags[0], mean_loss, epoch)
        tb_writer.add_scalar(tags[1], acc, epoch)
        tb_writer.add_scalar(tags[2], optimizer.param_groups[0]["lr"], epoch)
Example #9
def train(train_data, use_asymm_gen_loss=True, use_gpu=False):
    """

    :param train_data: np.ndarray of shape (20000, 1)
    :param use_asymm_gen_loss: bool
    :param use_gpu: bool
    :return:
    """
    """ Build training configurations """

    hp = dict(n_epochs=20, batch_size=64, n_disc_updates=2)
    hp = EasyDict(hp)

    constant = dict(device=torch.device("cpu" if not use_gpu else "cuda:0"))
    constant = EasyDict(constant)
    if use_gpu:
        torch.cuda.set_device(constant.device)
    """ Build data loader and data processor function """

    train_loader = data.DataLoader(dataset=train_data,
                                   batch_size=hp.batch_size,
                                   shuffle=True)
    """ Build networks """

    gen = Generator().to(constant.device)
    disc = Discriminator().to(constant.device)
    """ Build optimizers """

    optimizer_g = torch.optim.Adam(gen.parameters(), lr=1e-4, betas=(0, 0.9))
    optimizer_d = torch.optim.Adam(disc.parameters(), lr=1e-4, betas=(0, 0.9))
    """ Build loss functions """
    def disc_loss_fn(real, fake):
        return (-disc(real.detach()).log().mean() -
                (1 - disc(fake.detach())).log().mean())

    if use_asymm_gen_loss:

        def gen_loss_fn(real, fake):
            return -disc(fake).log().mean()

    else:

        def gen_loss_fn(real, fake):
            return (1 - disc(fake)).log().mean()

    """ Traning loop """

    history = dict(losses=[])
    for epoch in range(hp.n_epochs):

        losses_one_epoch = train_one_epoch(
            n_disc_updates=hp.n_disc_updates,
            batch_iterator=train_loader,
            process_batch_fn=process_batch_fn,
            gen=gen,
            disc=disc,
            optimizer_g=optimizer_g,
            optimizer_d=optimizer_d,
            gen_loss_fn=gen_loss_fn,
            disc_loss_fn=disc_loss_fn,
            device=constant.device,
            # max_n_iterations=1,  # uncomment this line if trying to debug
        )
        history["losses"].extend(losses_one_epoch)

        print(f"Epoch {epoch}: loss = {np.mean(losses_one_epoch)}")

        if epoch == 0 or epoch == hp.n_epochs - 1:
            fake, disc_in, disc_out = eval_one_epoch(gen=gen,
                                                     disc=disc,
                                                     device=constant.device)
            plot_eval(
                real=train_data,
                fake=fake,
                disc_in=disc_in,
                disc_out=disc_out,
                epoch=epoch,
            )

    history["losses"] = torch.stack(history["losses"]).to("cpu").numpy()

    plot_loss_curve(
        history["losses"],
        title="train_minimal",
        save_to=os.path.join(DATA_DIR, "train_minimal_loss"),
    )
Example #10
def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")

    if os.path.exists("./weights") is False:
        os.makedirs("./weights")

    tb_writer = SummaryWriter()

    train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(
        args.data_path)

    data_transform = {
        "train":
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
        ]),
        "val":
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
        ])
    }

    # instantiate the training dataset
    train_dataset = MyDataSet(images_path=train_images_path,
                              images_class=train_images_label,
                              transform=data_transform["train"])

    # instantiate the validation dataset
    val_dataset = MyDataSet(images_path=val_images_path,
                            images_class=val_images_label,
                            transform=data_transform["val"])

    batch_size = args.batch_size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    print('Using {} dataloader workers per process'.format(nw))
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=nw,
        collate_fn=train_dataset.collate_fn)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=nw,
                                             collate_fn=val_dataset.collate_fn)

    model = create_model(num_classes=5, has_logits=False).to(device)

    if args.weights != "":
        assert os.path.exists(
            args.weights), "weights file: '{}' does not exist.".format(args.weights)
        weights_dict = torch.load(args.weights, map_location=device)
        # remove the weights that are not needed
        del_keys = ['head.weight', 'head.bias'] if model.has_logits \
            else ['pre_logits.fc.weight', 'pre_logits.fc.bias', 'head.weight', 'head.bias']
        for k in del_keys:
            del weights_dict[k]
        print(model.load_state_dict(weights_dict, strict=False))

    if args.freeze_layers:
        for name, para in model.named_parameters():
            # freeze all weights except head and pre_logits
            if "head" not in name and "pre_logits" not in name:
                para.requires_grad_(False)
            else:
                print("training {}".format(name))

    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg, lr=args.lr, momentum=0.9, weight_decay=5E-5)
    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (
        1 - args.lrf) + args.lrf  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    for epoch in range(args.epochs):
        # train
        train_loss, train_acc = train_one_epoch(model=model,
                                                optimizer=optimizer,
                                                data_loader=train_loader,
                                                device=device,
                                                epoch=epoch)

        scheduler.step()

        # validate
        val_loss, val_acc = evaluate(model=model,
                                     data_loader=val_loader,
                                     device=device,
                                     epoch=epoch)

        tags = [
            "train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"
        ]
        tb_writer.add_scalar(tags[0], train_loss, epoch)
        tb_writer.add_scalar(tags[1], train_acc, epoch)
        tb_writer.add_scalar(tags[2], val_loss, epoch)
        tb_writer.add_scalar(tags[3], val_acc, epoch)
        tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)

        torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))
Example #11
#model = model.to(device)
criterion = torch.nn.MSELoss()
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                    mode='min',
                                                    patience=lr_patience)
#train_losses, valid_losses = [], []
#%% create save path
savepath = checkpoint_path if (is_resuming and checkpoint_path
                               and not is_transfer) else create_savepath(
                                   rootpath=save_root_path, is_debug=is_debug)
#if is_resuming:
#    savepath = checkpoint_path
#else:
#    savepath = create_savepath(savepath=save_root_path,is_debug=is_debug)
for epoch in range(epoch_start, epoch_start + num_epochs):
    train_loss = train_one_epoch(model, optimizer, criterion, train_loader,
                                 epoch, device, epoch_start + num_epochs)
    train_losses.append(train_loss)
    valid_loss = validate(model, criterion, valid_loader, device)
    valid_losses.append(valid_loss)

    lr_scheduler.step(valid_loss)
    #print('epoch {} done!'.format(epoch))

    save_checkpoint(savepath=savepath,
                    epoch=epoch,
                    model=model,
                    optimizer=optimizer,
                    train_losses=train_losses,
                    valid_losses=valid_losses,
                    lr=lr,
                    lr_patience=lr_patience,
Example #12
def main(args):
    print("Running with arguments:")
    args_dict = {}
    for key in vars(args):
        if key == "default_function":
            continue
        args_dict[key] = getattr(args, key)
        print(key, ": ", args_dict[key])
    print("---")

    experiment_time = datetime.now().strftime('%b%d_%H-%M-%S')
    if args.exp_dir:
        experiment_dir = args.exp_dir
    else:
        experiment_dir = os.path.join('exp', args.title, experiment_time)
    os.makedirs(experiment_dir, exist_ok=True)
    with open(os.path.join(experiment_dir, "config.json"), "w") as f:
        json.dump(args_dict,
                  f,
                  indent=4,
                  sort_keys=True,
                  default=lambda x: x.__name__)

    if torch.cuda.is_available():
        device = torch.device('cuda:0')
        print("CUDA Recognized")
    else:
        device = torch.device('cpu')

    try:
        summary_writer = SummaryWriter(logdir=experiment_dir)
    except TypeError:
        summary_writer = SummaryWriter(experiment_dir)

    print("Starting Online Learning")
    #Online learning setup
    torch.manual_seed(args.random_seed)
    np.random.seed(args.random_seed)
    model = models.get_model(args.model).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    loaders = datasets.get_dataset("online_with_val_cifar10",
                                   split_size=args.split_size)
    number_of_samples_online = []
    test_accuracies_online = []
    training_times_online = []
    epoch = 0
    for i, train_loader in enumerate(loaders['train_loaders']):
        t_start = datetime.now()
        n_train = (i + 1) * args.split_size
        number_of_samples_online.append(n_train)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        random_model = models.get_model(args.model).to(device)
        with torch.no_grad():
            for real_parameter, random_parameter in zip(
                    model.parameters(), random_model.parameters()):
                real_parameter.mul_(args.checkpoint_shrink).add_(
                    random_parameter, alpha=args.checkpoint_perturb)

        train_accuracies = []
        while True:
            if epoch % 5 == 0:
                print(f"Starting training in epoch {epoch + 1}")
            train_loss, train_accuracy = train_one_epoch(
                device, model, optimizer, criterion, train_loader)
            val_loss, val_accuracy = eval_on_dataloader(
                device, criterion, model, loaders['val_loader'])
            test_loss, test_accuracy = eval_on_dataloader(
                device, criterion, model, loaders['test_loader'])
            train_accuracies.append(train_accuracy)
            epoch += 1
            summary_writer.add_scalar("test_accuracy", test_accuracy, epoch)
            summary_writer.add_scalar("test_loss", test_loss, epoch)
            summary_writer.add_scalar("train_accuracy", train_accuracy, epoch)
            summary_writer.add_scalar("train_loss", train_loss, epoch)
            summary_writer.add_scalar("val_accuracy", val_accuracy, epoch)
            summary_writer.add_scalar("val_loss", val_loss, epoch)
            #if len(train_accuracies) >= args.convergence_epochs and \
            #        max(train_accuracies) not in train_accuracies[-args.convergence_epochs:]:
            if train_accuracy >= 0.99:
                print("Convergence condition met")
                break

        val_loss, val_accuracy = eval_on_dataloader(device, criterion, model,
                                                    loaders['val_loader'])
        test_loss, test_accuracy = eval_on_dataloader(device, criterion, model,
                                                      loaders['test_loader'])
        summary_writer.add_scalar("online_val_accuracy", val_accuracy, n_train)
        summary_writer.add_scalar("online_val_loss", val_loss, n_train)
        summary_writer.add_scalar("online_test_accuracy", test_accuracy,
                                  n_train)
        summary_writer.add_scalar("online_test_loss", test_loss, n_train)
        t_end = datetime.now()
        training_time = (t_end - t_start).total_seconds()

        training_times_online.append(training_time)
        summary_writer.add_scalar("online_train_time", training_time, n_train)

    summary_writer.close()
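
The block that mixes the current weights with a freshly initialized random_model (real_parameter.mul_(args.checkpoint_shrink).add_(random_parameter, alpha=args.checkpoint_perturb)) is a shrink-and-perturb warm start: before training on the next, larger data split, each parameter is scaled down and nudged with random-initialization noise. In isolation it reduces to the sketch below; the shrink and perturb values are illustrative, not taken from the source.

import torch
import torch.nn as nn

def shrink_and_perturb(model, fresh_model, shrink=0.5, perturb=0.1):
    # in place: new_w = shrink * old_w + perturb * w_random_init
    with torch.no_grad():
        for p, p_rand in zip(model.parameters(), fresh_model.parameters()):
            p.mul_(shrink).add_(p_rand, alpha=perturb)

model = nn.Linear(10, 2)
fresh = nn.Linear(10, 2)   # same architecture, new random initialization
shrink_and_perturb(model, fresh)
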
Example #13
def train(train_data, use_gpu=False):
    """

    :param train_data: np.array of shape (None, 3, 32, 32) with values in [0, 1]
    :param use_gpu:
    :return:
    """

    """ Build training configurations """

    hp = dict(
        n_iterations=25000,
        batch_size=256,
        n_disc_updates=5,
        lmbda=10,
    )
    hp = EasyDict(hp)

    constant = dict(device=torch.device("cpu" if not use_gpu else "cuda:0"))
    constant = EasyDict(constant)
    if use_gpu:
        torch.cuda.set_device(constant.device)

    """ Build data loader and data processor function """

    train_loader = data.DataLoader(dataset=train_data, batch_size=hp.batch_size, shuffle=True)
    n_batches = len(train_loader)
    hp.n_epochs = hp.n_iterations // n_batches
    hp.n_iterations = hp.n_epochs * n_batches
    print('n_epochs', hp.n_epochs, 'n_iterations', hp.n_iterations)

    """ Build networks """

    gen = Generator().to(constant.device)
    disc = Discriminator().to(constant.device)

    """ Build optimizers """

    optimizer_g = torch.optim.Adam(gen.parameters(), lr=2e-4, betas=(0, 0.9))
    optimizer_d = torch.optim.Adam(disc.parameters(), lr=2e-4, betas=(0, 0.9))

    """ Build loss functions """

    def disc_loss_fn(real, fake):
        current_batch_size = real.shape[0]
        real, fake = real.detach(), fake.detach()
        eps = torch.randn(current_batch_size, 1, 1, 1).to(constant.device)
        x_hat = (eps * real + (1 - eps) * fake).requires_grad_()

        disc_out = disc(x_hat)
        original_disc_loss = disc_out.mean() - disc(real).mean()

        grad, = torch.autograd.grad(
            outputs=[disc_out.mean(), ],
            inputs=x_hat,
            create_graph=True,
            retain_graph=True,
        )
        grad_penalty = (grad.norm() - 1).square()
        return original_disc_loss + hp.lmbda * grad_penalty

    def gen_loss_fn(real, fake):
        return -disc(fake).log().mean()

    """ Build learning rate schedulers """

    max_n_iterations = max(hp.n_iterations, 25000)
    scheduler_g = torch.optim.lr_scheduler.LambdaLR(
        optimizer=optimizer_g,
        lr_lambda=lambda itr: (max_n_iterations - itr) / max_n_iterations,
        last_epoch=-1
    )
    scheduler_d = torch.optim.lr_scheduler.LambdaLR(
        optimizer=optimizer_d,
        lr_lambda=lambda itr: (max_n_iterations - itr) / max_n_iterations,
        last_epoch=-1,
    )

    """ Training loop """

    history = dict(
        losses=[],
    )

    for epoch in tqdm(range(hp.n_epochs)):
        losses_one_epoch = train_one_epoch(
            n_disc_updates=hp.n_disc_updates,
            batch_iterator=train_loader,
            process_batch_fn=process_batch_fn,
            gen=gen, disc=disc, optimizer_g=optimizer_g, optimizer_d=optimizer_d,
            gen_loss_fn=gen_loss_fn,
            disc_loss_fn=disc_loss_fn,
            device=constant.device,
            scheduler_g=scheduler_g, scheduler_d=scheduler_d,
            # max_n_iterations=1,  # debug
        )
        history['losses'].extend(losses_one_epoch)

        print(f"Epoch {epoch}: loss = {np.mean(losses_one_epoch)}")

        if epoch == hp.n_epochs - 1:
            fake = eval_one_epoch(gen=gen, disc=disc, device=constant.device)
            plot_eval(fake=fake, epoch=epoch)

    history['losses'] = torch.stack(history['losses']).to('cpu').numpy()

    plot_loss_curve(
        history["losses"],
        title="train_cifar10",
        save_to=os.path.join(DATA_DIR, "train_cifar10_loss"),
    )
Example #14
# for plotting
mvn = torch.distributions.Normal(0, 1)
z_norm = mvn.sample([num_samples, np.ceil(n_in).astype(int)])
val_batch = next(iter(val_loader)).float()

start = time.time()
# for early stopping
i = 0
max_loss = np.inf
epochs_list = []
train_losses = []
val_losses = []
for epoch in range(1, epochs):
    epochs_list.append(epoch)
    train_loss = train_one_epoch(model, epoch, optimizer, train_loader)
    val_loss = val(model, train, val_loader)
    train_losses.append(train_loss)
    # val_losses.append(val_loss)
    val_loss = 100
    if val_loss < max_loss:
        max_loss = val_loss
        i = 0
        torch.save(
            model,
            (path + "model.pt"),
        )
    else:
        i += 1
    if i >= 30:
        break
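
Note that the loop above hard-codes val_loss = 100 (the real validation losses are commented out), so the best-model check fires only once and the patience counter i simply runs out after 30 epochs. For reference, the patience logic this counter is aiming for can be isolated into a small helper; this is a generic sketch, not code from the source.

class EarlyStopping:
    # stop when the monitored value has not improved for `patience` checks
    def __init__(self, patience=30):
        self.patience = patience
        self.best = float("inf")
        self.counter = 0

    def step(self, value):
        # returns True when training should stop
        if value < self.best:
            self.best = value
            self.counter = 0
            return False
        self.counter += 1
        return self.counter >= self.patience

Used as stopper = EarlyStopping(patience=30) with if stopper.step(val_loss): break, it replaces the manual i / max_loss bookkeeping above.
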
Example #15
    testloader = DataLoader(testdataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.num_workers)

    best_acc1 = 0
    best_acc2 = 0
    best_loss = np.inf
    counter = 0

    for epoch in range(args.epochs):

        print(f'Epoch {epoch+1}/{args.epochs}')

        train_loss, _, _, _, _ = train_one_epoch(trainloader, model, optimizer,
                                                 criterion, device, scaler,
                                                 args, idx2ans)
        val_loss, predictions, val_acc, val_bleu = validate(
            valloader, model, criterion, device, scaler, args, val_df, idx2ans)
        test_loss, predictions, acc, bleu = test(testloader, model, criterion,
                                                 device, scaler, args, test_df,
                                                 idx2ans)

        scheduler.step(val_loss)

        if not args.category:

            log_dict = acc

            for k, v in bleu.items():
                log_dict[k] = v
Example #16
def main(args):
    print("Running with arguments:")
    args_dict = {}
    for key in vars(args):
        if key == "default_function":
            continue
        args_dict[key] = getattr(args, key)
        print(key, ": ", args_dict[key])
    print("---")

    experiment_time = datetime.now().strftime('%b%d_%H-%M-%S')
    experiment_dir = os.path.join('exp', args.title, experiment_time)
    os.makedirs(experiment_dir)
    with open(os.path.join(experiment_dir, "config.json"), "w") as f:
        json.dump(args_dict,
                  f,
                  indent=4,
                  sort_keys=True,
                  default=lambda x: x.__name__)

    if torch.cuda.is_available():
        device = torch.device('cuda:0')
        print("CUDA Recognized")
    else:
        device = torch.device('cpu')

    try:
        summary_writer = SummaryWriter(logdir=experiment_dir)
    except TypeError:
        summary_writer = SummaryWriter(experiment_dir)

    print("Starting Online Learning")
    #Online learning setup
    torch.manual_seed(args.random_seed)
    np.random.seed(args.random_seed)
    model = models.get_model(args.model).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    loaders = datasets.get_dataset(f"online_with_val_{args.dataset}",
                                   split_size=args.split_size)
    number_of_samples_online = []
    test_accuracies_online = []
    training_times_online = []
    for i, train_loader in enumerate(loaders['train_loaders']):
        t_start = datetime.now()
        n_train = (i + 1) * args.split_size
        number_of_samples_online.append(n_train)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        print(f"Warm-Start Training with {n_train} data.")

        train_accuracies = []
        stop_indicator = False
        epoch = 0
        while (not stop_indicator):
            if epoch % 5 == 0:
                print(f"Starting training in epoch {epoch + 1}")
            train_loss, train_accuracy = train_one_epoch(
                device, model, optimizer, criterion, train_loader)
            train_loss, train_accuracy = eval_on_dataloader(
                device, criterion, model, train_loader)
            train_accuracies.append(train_accuracy)
            epoch += 1

            if train_accuracy >= args.acc_threshold:
                print(
                    f"Convergence codition met. Training accuracy > {100 * args.acc_threshold}"
                )
                stop_indicator = True

        t_end = datetime.now()
        training_time = (t_end - t_start).total_seconds()
        test_loss, test_accuracy = eval_on_dataloader(device, criterion, model,
                                                      loaders['test_loader'])
        test_accuracies_online.append(test_accuracy)
        training_times_online.append(training_time)
        summary_writer.add_scalar("test_accuracy_online", test_accuracy,
                                  n_train)
        summary_writer.add_scalar("train_time_online", training_time, n_train)

    print("Starting Offline Learning")
    # Offline learning setup
    n_experiments = len(loaders['train_loaders'])
    number_of_samples_offline = []
    test_accuracies_offline = []
    training_times_offline = []
    for i in range(1, n_experiments + 1):
        t_start = datetime.now()
        n_train = i * args.split_size
        number_of_samples_offline.append(n_train)
        print(f"Running {i}_th experiment with Train size = {n_train}")

        # Set the seed
        torch.manual_seed(args.random_seed)
        np.random.seed(args.random_seed)
        loaders = datasets.get_dataset(f"partial_with_val_{args.dataset}",
                                       n_train)
        model = models.get_model(args.model).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        criterion = torch.nn.CrossEntropyLoss()

        train_accuracies = []
        stop_indicator = False
        epoch = 0
        while (not stop_indicator):
            if epoch % 5 == 0:
                print(f"Starting training in epoch {epoch + 1}")
            train_loss, train_accuracy = train_one_epoch(
                device, model, optimizer, criterion, loaders['train_loader'])
            #             val_loss, val_accuracy =  eval_on_dataloader(model, loaders['val_loader'])
            train_loss, train_accuracy = eval_on_dataloader(
                device, criterion, model,
                loaders['train_loader'])  # To get model's final accuracy
            train_accuracies.append(train_accuracy)
            epoch += 1

            if train_accuracy >= args.acc_threshold:
                print(
                    f"Convergence codition met. Training accuracy > {100 * args.acc_threshold}"
                )
                stop_indicator = True

        t_end = datetime.now()
        training_time = (t_end - t_start).total_seconds()
        test_loss, test_accuracy = eval_on_dataloader(device, criterion, model,
                                                      loaders['test_loader'])
        test_accuracies_offline.append(test_accuracy)
        training_times_offline.append(training_time)
        summary_writer.add_scalar("test_accuracy_offline", test_accuracy,
                                  n_train)
        summary_writer.add_scalar("train_time_offline", training_time, n_train)

    import matplotlib.pyplot as plt
    fig, axs = plt.subplots(1, 2, figsize=(10, 5))

    number_of_samples_online = np.array(number_of_samples_online) / 1000
    number_of_samples_offline = np.array(number_of_samples_offline) / 1000
    axs[0].plot(number_of_samples_online,
                test_accuracies_online,
                label='warm start',
                color='C0')
    axs[0].plot(number_of_samples_offline,
                test_accuracies_offline,
                label='random',
                color='C1')
    axs[0].set_ylabel("Tets Accuracy")
    axs[0].set_xlabel("Number of Samples (thousands)")
    axs[1].plot(number_of_samples_online,
                training_times_online,
                label='warm start',
                color='C0')
    axs[1].plot(number_of_samples_offline,
                training_times_offline,
                label='random',
                color='C1')
    axs[1].set_ylabel("Train Time (seconds)")
    axs[1].set_xlabel("Number of Samples (thousands)")
    plt.legend()
    plt.savefig(
        f"figures/figure2-{args.dataset}-{100 * args.acc_threshold}.pdf")
Example #17
def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")

    print(args)
    print(
        'Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/'
    )
    tb_writer = SummaryWriter()
    if os.path.exists("./weights") is False:
        os.makedirs("./weights")

    train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(
        args.data_path)

    img_size = {
        "B0": 224,
        "B1": 240,
        "B2": 260,
        "B3": 300,
        "B4": 380,
        "B5": 456,
        "B6": 528,
        "B7": 600
    }
    num_model = "B0"

    data_transform = {
        "train":
        transforms.Compose([
            transforms.RandomResizedCrop(img_size[num_model]),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        "val":
        transforms.Compose([
            transforms.Resize(img_size[num_model]),
            transforms.CenterCrop(img_size[num_model]),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    }

    # instantiate the training dataset
    train_data_set = MyDataSet(images_path=train_images_path,
                               images_class=train_images_label,
                               transform=data_transform["train"])

    # instantiate the validation dataset
    val_data_set = MyDataSet(images_path=val_images_path,
                             images_class=val_images_label,
                             transform=data_transform["val"])

    batch_size = args.batch_size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    print('Using {} dataloader workers per process'.format(nw))
    train_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=nw,
        collate_fn=train_data_set.collate_fn)

    val_loader = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=nw,
        collate_fn=val_data_set.collate_fn)

    # load pretrained weights if they exist
    model = create_model(num_classes=args.num_classes).to(device)
    if os.path.exists(args.weights):
        weights_dict = torch.load(args.weights, map_location=device)
        load_weights_dict = {
            k: v
            for k, v in weights_dict.items()
            if model.state_dict()[k].numel() == v.numel()
        }
        print(model.load_state_dict(load_weights_dict, strict=False))

    # optionally freeze weights
    if args.freeze_layers:
        for name, para in model.named_parameters():
            # freeze all weights except the last conv layer and the classifier
            if ("features.top" not in name) and ("classifier" not in name):
                para.requires_grad_(False)
            else:
                print("training {}".format(name))

    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg, lr=args.lr, momentum=0.9, weight_decay=1E-4)
    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (
        1 - args.lrf) + args.lrf  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    for epoch in range(args.epochs):
        # train
        mean_loss = train_one_epoch(model=model,
                                    optimizer=optimizer,
                                    data_loader=train_loader,
                                    device=device,
                                    epoch=epoch)

        scheduler.step()

        # validate
        sum_num = evaluate(model=model, data_loader=val_loader, device=device)
        acc = sum_num / len(val_data_set)
        print("[epoch {}] accuracy: {}".format(epoch, round(acc, 3)))
        tags = ["loss", "accuracy", "learning_rate"]
        tb_writer.add_scalar(tags[0], mean_loss, epoch)
        tb_writer.add_scalar(tags[1], acc, epoch)
        tb_writer.add_scalar(tags[2], optimizer.param_groups[0]["lr"], epoch)

        torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))
Example #18
    num_epoch = args.max_epoch
    loss_rec = {"train": [], "valid": []}
    acc_rec = {"train": [], "valid": []}
    best_acc, best_epoch = 0, 0

    LR = args.lr
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=LR,
                                momentum=0.9,
                                weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     gamma=0.1,
                                                     milestones=[92, 136])
    for epoch in range(num_epoch):
        loss_train, acc_train, mat_train = train_one_epoch(
            train_loader, model, criterion, optimizer, epoch, device)
        loss_valid, acc_valid, mat_valid = valid_one_epoch(
            test_loader, model, criterion, device)
        print(
            "Epoch[{:0>3}/{:0>3}] Train Acc: {:.2%} Valid Acc:{:.2%} Train loss:{:.4f} Valid loss:{:.4f} LR:{}"
            .format(epoch + 1, num_epoch, acc_train, acc_valid, loss_train,
                    loss_valid, optimizer.param_groups[0]["lr"]))

        if 'patience' in dir(scheduler):
            scheduler.step(acc_valid)  # ReduceLROnPlateau
        else:
            scheduler.step()  # StepLR

        loss_rec["train"].append(loss_train), loss_rec["valid"].append(
            loss_valid)
        acc_rec["train"].append(acc_train), acc_rec["valid"].append(acc_valid)