Example #1
0
def main():
    """Train and evaluate AlexNet on CIFAR-10/100 using Horovod.

    Configuration is read from the module-level ``args``. The function also
    relies on the module-level ``world_size``, ``local_rank``, ``use_cuda``
    and ``state`` names and on the ``train``, ``test`` and
    ``adjust_learning_rate`` helpers defined elsewhere in the file.
    """
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    # Data
    print('==> Preparing dataset %s' % args.dataset)
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    if args.dataset == 'cifar10':
        dataloader = datasets.CIFAR10
        num_classes = 10
    else:
        dataloader = datasets.CIFAR100
        num_classes = 100

    trainset = dataloader(root=args.dataroot, train=True, download=True, transform=transform_train)
    # Each Horovod rank reads its own shard of the training set.
    sampler = torch.utils.data.distributed.DistributedSampler(
        trainset, num_replicas=hvd.size(), rank=hvd.rank())
    # shuffle must stay False here: ordering is delegated to the sampler.
    trainloader = data.DataLoader(dataset=trainset, batch_size=args.train_batch * world_size,
                                  shuffle=False, sampler=sampler)

    testset = dataloader(root=args.dataroot, train=False, download=False, transform=transform_test)
    testloader = data.DataLoader(testset, batch_size=args.test_batch * world_size,
                                 shuffle=False, num_workers=args.workers)

    # Model
    print("==> creating model '{}'".format("Alexnet"))
    model = AlexNet(num_classes=num_classes)

    device = torch.device('cuda', local_rank)
    model = model.to(device)
    print('Model on cuda:%d' % local_rank)
    print('    Total params: %.2fM' % (sum(p.numel() for p in model.parameters())/1000000.0))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)
    # Wrap the optimizer with Horovod.
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
    # Broadcast the initial parameters from rank 0 so that all ranks start identically.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        # Without this call the sampler reuses the same permutation every
        # epoch, so the data would never be reshuffled between epochs.
        sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)
        train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, use_cuda)
        test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda)
        print('Rank:{} Epoch[{}/{}]: LR: {:.3f}, Train loss: {:.5f}, Test loss: {:.5f}, Train acc: {:.2f}, Test acc: {:.2f}.'.format(
            local_rank, epoch+1, args.epochs, state['lr'],
            train_loss, test_loss, train_acc, test_acc))
Example #2
0
def train(pertrained=False, resume_file=None):
    """Train AlexNet, writing per-epoch logs and periodic checkpoints.

    Args:
        pertrained: When true, build the network with
            ``alexnet(pretrained=True)`` from the project ``model`` module;
            otherwise instantiate ``AlexNet``. The misspelled parameter name
            is kept so existing callers keep working.
        resume_file: Optional checkpoint path. When it names an existing
            file, the model weights and the starting epoch are restored
            from it; otherwise training starts at epoch 0.

    Relies on module-level constants (``NUMBER_CLASSES``, ``LR``, ``EPOCHES``
    and friends) and on ``train_dataloader``, ``criterion``, ``valid``,
    ``adjust_learning_rate`` and ``AverageMeter`` defined elsewhere.
    """
    if pertrained:
        from model import alexnet
        net = alexnet(pretrained=True, num_classes=NUMBER_CLASSES)
    else:
        from model import AlexNet
        net = AlexNet(num_classes=NUMBER_CLASSES)
    valid_precision = 0

    optimizer = optim.SGD(net.parameters(),
                          lr=LR,
                          momentum=MOMENTUM,
                          weight_decay=WEIGHT_DECAY)

    timestamp = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
    # The context manager guarantees both logs are closed even if training
    # raises part-way through.
    with open("logs/train_logs_{}.log".format(timestamp), "w") as train_log, \
            open("logs/valid_logs_{}.log".format(timestamp), "w") as valid_log:
        train_log.write("{}\t{}\t{}\n".format("epoch", "losses ", "correct"))
        valid_log.write("{}\t{}\t{}\n".format("epoch", "losses ", "correct"))

        # Resume training. start_epoch is always bound, including the case
        # where resume_file was given but does not exist.
        start_epoch = 0
        if resume_file and os.path.isfile(resume_file):
            print(("=> loading checkpoint '{}'".format(resume_file)))
            checkpoint = torch.load(resume_file)
            start_epoch = checkpoint['epoch']
            net.load_state_dict(checkpoint['model_state_dict'])
            print(("=> loaded checkpoint '{}' (epoch {})".format(
                resume_file, checkpoint['epoch'])))
        else:
            print(("=> no checkpoint found at '{}'".format(resume_file)))

        for epoch in range(start_epoch, EPOCHES):
            batch_time = AverageMeter()
            data_time = AverageMeter()
            losses = AverageMeter()
            correct = AverageMeter()
            end = time.time()

            # Make sure layers such as dropout are in training mode again in
            # case the validation pass of the previous epoch switched them.
            net.train()

            optimizer = adjust_learning_rate(optimizer, epoch, LR, LR_steps,
                                             WEIGHT_DECAY)

            for i_batch, sample_batched in enumerate(train_dataloader):
                # measure data loading time
                data_time.update(time.time() - end)
                inputs, labels = sample_batched
                if CUDA_AVALIABLE:
                    outputs = net.forward(inputs.cuda())
                    labels = labels.long().flatten().cuda()
                else:
                    outputs = net.forward(inputs)
                    labels = labels.long().flatten()

                outputs = outputs.reshape([-1, NUMBER_CLASSES])
                loss = criterion(outputs, labels)
                # Update the running statistics.
                losses.update(loss.item(), inputs.size(0))
                _, predicted = torch.max(outputs.data, 1)
                # Batch accuracy, weighted by batch size in the meter.
                correct.update(
                    (predicted == labels.long()).sum().item() / len(labels),
                    inputs.size(0))

                optimizer.zero_grad()
                loss.backward()

                optimizer.step()
                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()
                if i_batch % 10 == 0:
                    print(('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t'
                           'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                           'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                           'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                           'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'.format(
                               epoch,
                               i_batch,
                               len(train_dataloader),
                               batch_time=batch_time,
                               data_time=data_time,
                               loss=losses,
                               top1=correct,
                               lr=optimizer.param_groups[-1]['lr'])))

            train_log.write("{:5d}\t{:.5f}\t{:.5f}\n".format(
                epoch, losses.avg, correct.avg))
            train_log.flush()

            valid_precision = valid(net, epoch, valid_log)
            # Save the network every 10 epochs and after the final epoch.
            if (epoch > 0 and epoch % 10 == 0) or epoch == EPOCHES - 1:
                save_path = os.path.join(
                    "models",
                    "{:d}_{}_{:d}_{:d}_{:.5f}.pt".format(int(time.time()),
                                                         "alexnet", epoch,
                                                         BATCHSIZE,
                                                         valid_precision))
                print("[INFO] Save weights to " + save_path)
                torch.save(
                    {
                        'epoch': epoch,
                        'model_state_dict': net.state_dict(),
                        # Store the state itself, not the bound method. The
                        # (misspelled) key is kept for existing checkpoints.
                        'optimizer_state_dir': optimizer.state_dict(),
                        'loss': loss
                    }, save_path)
Example #3
0
        loss = loss_function(outputs, labels)
        running_loss += loss
        loss.backward()
        optimizer.step()

        rate = index / train_data_loader.__len__()
        a = "*" * int(rate * 50)
        b = "." * int((1 - rate) * 50)
        print("\rtrain loss: {:^3.0f}%[{}->{}]{:.3f}".format(
            int(rate * 100), a, b, loss),
              end="")
    print("\n time.perf_counter()-t1")

    model.eval()
    acc = 0.0
    with torch.no_grad():
        for data in valid_data_loader:
            imgs, labels = data
            outputs = model(imgs)
            acc += (torch.max(outputs, dim=1)[1] == labels).sum().item()
        acc = acc / valid_data_loader.dataset.__len__()
        if acc > best_acc:
            best_acc = acc
            print("Saving Model")
            torch.save(model.state_dict(), 'AlexNet_weights.pth')
            torch.save(model, 'AlexNet.pth')
        print('[epoch %d] train_loss: %.3f  test_accuracy: %.3f' %
              (epoch + 1, running_loss, acc))

print('Finished Training')
Example #4
0
def main():
    """Train AlexNet on an ImageFolder dataset and keep the best weights.

    Expects ``train`` and ``val`` sub-folders under
    ``<cwd>/../../data_set/dog_data``, writes the index-to-class mapping to
    ``class_indices.json`` and saves the weights with the highest validation
    accuracy to ``./AlexNet.pth``.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # pick the device
    print("using {} device.".format(device))

    data_transform = {  # preprocessing pipelines
        "train": transforms.Compose([transforms.RandomResizedCrop(224),  # random crop to 224x224
                                     transforms.RandomHorizontalFlip(),  # random horizontal flip
                                     transforms.ToTensor(),
                                     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]),
        "val": transforms.Compose([transforms.Resize((224, 224)),  # cannot 224, must (224, 224)
                                   transforms.ToTensor(),
                                   transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])}

    data_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))  # get data root path
    image_path = os.path.join(data_root, "data_set", "dog_data")  # data set path
    assert os.path.exists(image_path), "{} path does not exist.".format(image_path)
    train_dataset = datasets.ImageFolder(root=os.path.join(image_path, "train"),
                                         transform=data_transform["train"])
    train_num = len(train_dataset)

    # class_to_idx maps class name -> index; invert it to index -> class name.
    flower_list = train_dataset.class_to_idx
    cla_dict = dict((val, key) for key, val in flower_list.items())
    # write dict into json file
    json_str = json.dumps(cla_dict, indent=4)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)

    batch_size = 32
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using {} dataloader workers every process'.format(nw))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size, shuffle=True,
                                               num_workers=nw)

    validate_dataset = datasets.ImageFolder(root=os.path.join(image_path, "val"),
                                            transform=data_transform["val"])
    val_num = len(validate_dataset)
    validate_loader = torch.utils.data.DataLoader(validate_dataset,
                                                  batch_size=4, shuffle=False,
                                                  num_workers=nw)

    print("using {} images for training, {} images fot validation.".format(train_num,
                                                                           val_num))

    # NOTE(review): the class count is hard-coded to 5; confirm it matches
    # the number of sub-folders in the training directory.
    net = AlexNet(num_classes=5, init_weights=True)

    net.to(device)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.0002)

    save_path = './AlexNet.pth'  # where the best weights are stored
    best_acc = 0.0
    train_steps = len(train_loader)  # number of batches per epoch
    for epoch in range(10):
        # train
        net.train()  # enable dropout
        running_loss = 0.0  # sum of per-batch losses for this epoch
        t1 = time.perf_counter()  # epoch start time
        for step, data in enumerate(train_loader, start=0):
            images, labels = data
            optimizer.zero_grad()  # clear accumulated gradients
            outputs = net(images.to(device))
            loss = loss_function(outputs, labels.to(device))
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            # print train process
            rate = (step + 1) / train_steps
            a = "*" * int(rate * 50)
            b = "." * int((1 - rate) * 50)
            print("\rtrain loss: {:^3.0f}%[{}->{}]{:.3f}".format(int(rate * 100), a, b, loss), end="")
        print()
        print(time.perf_counter()-t1)

        # validate
        net.eval()  # disable dropout
        acc = 0.0  # accumulate accurate number / epoch
        with torch.no_grad():
            for val_data in validate_loader:
                val_images, val_labels = val_data
                outputs = net(val_images.to(device))
                predict_y = torch.max(outputs, dim=1)[1]  # index of the largest score is the class
                acc += (predict_y == val_labels.to(device)).sum().item()
            val_accurate = acc / val_num
            if val_accurate > best_acc:  # new best: keep these weights
                best_acc = val_accurate
                torch.save(net.state_dict(), save_path)
            # Average over the number of batches. Dividing by the last loop
            # index (batches - 1) overstated the loss and raised
            # ZeroDivisionError when the epoch had a single batch.
            print('[epoch %d] train_loss: %.3f  test_accuracy: %.3f' %
                  (epoch + 1, running_loss / train_steps, val_accurate))

    print('Finished Training')
Example #5
0
def train(args):
    """Train AlexNet with Adam, checkpoint periodically and log the best result.

    Args:
        args: Namespace providing ``device_id``, ``useLRN``, ``useDropOut``,
            ``lr``, ``batch_size``, ``num_workers``, ``useAug`` and ``epoch``.

    Side effects: writes ``best_model.pth`` whenever the validation loss
    improves, ``model_checkpoint.pth`` every 10 epochs, and appends one
    summary line to ``exp_result.txt`` when training ends.
    """
    device = torch.device(f"cuda:{args.device_id}")
    model = AlexNet(n_cls=100, useLRN=args.useLRN, useDropOut=args.useDropOut)
    criterion = nn.CrossEntropyLoss()

    model.to(device)
    optimizer = Adam(model.parameters(), lr=args.lr)

    train_loader, valid_loader = getLoaders(split="train",
                                            batch_size=args.batch_size,
                                            num_workers=args.num_workers,
                                            aug=args.useAug)

    train_loss_arr = []   # loss of every training batch, all epochs
    valid_loss_arr = []   # mean validation loss per epoch
    valid_acc_arr = []    # mean top-1 accuracy per epoch
    valid_top5_arr = []   # mean top-5 accuracy per epoch
    n_iter = 0
    best_loss = float('inf')
    best_top1_acc = 0
    best_top5_acc = 0
    for ep in range(args.epoch):
        model.train()
        for img, label in tqdm(train_loader, total=len(train_loader)):
            img, label = img.to(device), label.to(device)
            optimizer.zero_grad()
            pred = model(img)
            loss = criterion(pred, label)
            loss.backward()
            optimizer.step()
            train_loss_arr.append(loss.item())
            n_iter += 1

        model.eval()
        ep_valid_loss_arr = []
        ep_acc_arr = []
        ep_top5_arr = []
        with torch.no_grad():
            for img, label in tqdm(valid_loader, total=len(valid_loader)):
                img, label = img.to(device), label.to(device)
                pred = model(img)
                loss = criterion(pred, label)
                # Convert once and reuse for both accuracy metrics.
                pred_np = pred.detach().cpu().numpy()
                label_np = label.detach().cpu().numpy()
                ep_acc_arr.append(
                    utils.top_k_acc(k=1, pred=pred_np, label=label_np))
                ep_top5_arr.append(
                    utils.top_k_acc(k=5, pred=pred_np, label=label_np))
                ep_valid_loss_arr.append(loss.item())

        valid_loss = np.mean(ep_valid_loss_arr)
        valid_acc = np.mean(ep_acc_arr)
        valid_top5 = np.mean(ep_top5_arr)
        # Mean over the batches of the epoch that just finished.
        train_loss = np.mean(train_loss_arr[-len(train_loader):])
        valid_loss_arr.append(valid_loss)
        # Record the accuracy histories too; previously these lists were
        # never filled, so checkpoints always stored them empty.
        valid_acc_arr.append(valid_acc)
        valid_top5_arr.append(valid_top5)

        if valid_loss < best_loss:
            best_loss = valid_loss
            best_top1_acc = valid_acc
            best_top5_acc = valid_top5
            # Save CPU tensors, then move the model back for further training.
            model.cpu()
            torch.save(model.state_dict(), "best_model.pth")
            model.to(device)
        if (ep + 1) % 10 == 0:
            model.cpu()
            torch.save(
                {
                    "model": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "train_loss": train_loss_arr,
                    "valid_loss": valid_loss_arr,
                    "valid_acc": valid_acc_arr,
                    "valid_top5": valid_top5_arr,
                    "best_loss": best_loss,
                    "ep": ep,
                    "n_iter": n_iter,
                }, "model_checkpoint.pth")
            model.to(device)
        print(
            f"[{ep}, {n_iter}] train: {train_loss:.4f}, valid: {valid_loss:.4f}, acc: {valid_acc:.4f}, top5: {valid_top5:.4f}"
        )
    with open("exp_result.txt", "a+") as f:
        f.write(
            f"{args}, loss: {best_loss:.4f}, top1: {best_top1_acc*100:.1f}, top5: {best_top5_acc*100:.1f}\n"
        )
Example #6
0
def main():
    """Train (or resume training) AlexNet on the ``./DATA/male`` image folders.

    Logs the training loss and validation accuracy to TensorBoard under
    ``./logs``, saves the best-scoring weights to ``./AlexNet.pth`` and, when
    training ends, stores the loss and accuracy histories under ``./save``.
    If ``./log360.pth`` exists it is loaded as the starting model.
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("using {} device.".format(device))

    tbwriter = SummaryWriter(log_dir="./logs")

    data_transform = {
        "train":
        transforms.Compose([
            transforms.RandomResizedCrop(360),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ]),
        "val":
        transforms.Compose([
            # The size must be a single (h, w) tuple: a second positional
            # argument is interpreted as the interpolation mode.
            transforms.Resize((360, 360)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
    }

    data_root = os.path.abspath(os.path.join(os.getcwd(),
                                             "./DATA"))  # get data root path
    image_path = os.path.join(data_root, "male")  # data set path
    assert os.path.exists(image_path), "{} path does not exist.".format(
        image_path)
    train_dataset = datasets.ImageFolder(root=os.path.join(
        image_path, "train"),
                                         transform=data_transform["train"])
    train_num = len(train_dataset)

    # Invert class name -> index into index -> class name.
    flower_list = train_dataset.class_to_idx
    cla_dict = dict((val, key) for key, val in flower_list.items())
    # write dict into json file
    json_str = json.dumps(cla_dict, indent=2)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)

    batch_size = 8
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    print('Using {} dataloader workers every process'.format(nw))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=nw)

    validate_dataset = datasets.ImageFolder(root=os.path.join(
        image_path, "val"),
                                            transform=data_transform["val"])
    val_num = len(validate_dataset)
    validate_loader = torch.utils.data.DataLoader(validate_dataset,
                                                  batch_size=8,
                                                  shuffle=True,
                                                  num_workers=nw)

    print("using {} images for training, {} images fot validation.".format(
        train_num, val_num))

    if os.path.exists("./log360.pth"):
        # NOTE(review): this assumes log360.pth holds a pickled model object
        # rather than a state_dict - confirm against whatever writes it.
        net = torch.load("./log360.pth", 'cpu')
        print("continue training")
    else:
        net = AlexNet(num_classes=3, init_weights=True)
        print("start training anew")
    # Move the model in both branches; the resumed model is loaded on the
    # CPU while the batches below are sent to `device`.
    net.to(device)

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.0001)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.98)

    epochs = 2000
    save_path = './AlexNet.pth'
    best_acc = 0.0
    train_steps = len(train_loader)

    trainLOSS = []  # mean training loss per epoch
    valACC = []  # validation accuracy per epoch

    for epoch in range(epochs):
        # Report the learning rate that this epoch actually trains with.
        print('LR:{}'.format(optimizer.param_groups[0]['lr']))
        # train
        net.train()
        running_loss = 0.0
        train_bar = tqdm(train_loader)
        for step, data in enumerate(train_bar):
            images, labels = data
            optimizer.zero_grad()
            outputs = net(images.to(device))
            loss = loss_function(outputs, labels.to(device))
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()

            train_bar.desc = "train epoch[{}/{}] loss:{:.3f}".format(
                epoch + 1, epochs, loss)

        # Advance the schedule only after this epoch's optimizer steps;
        # stepping first skips the initial value of the schedule.
        scheduler.step()

        # validate
        net.eval()
        acc = 0.0  # accumulate accurate number / epoch
        with torch.no_grad():
            val_bar = tqdm(validate_loader, colour='green')
            for val_data in val_bar:
                val_images, val_labels = val_data
                outputs = net(val_images.to(device))
                predict_y = torch.max(outputs, dim=1)[1]
                acc += torch.eq(predict_y, val_labels.to(device)).sum().item()

        val_accurate = acc / val_num

        tbwriter.add_scalar('train/loss', running_loss / train_steps, epoch)
        tbwriter.add_scalar('val/acc', val_accurate, epoch)

        trainLOSS.append(running_loss / train_steps)
        valACC.append(val_accurate)

        print('[epoch %d] train_loss: %.3f  val_accuracy: %.3f' %
              (epoch + 1, running_loss / train_steps, val_accurate))
        print(' ')

        if val_accurate > best_acc:
            best_acc = val_accurate
            torch.save(net.state_dict(), save_path)

    # Flush pending TensorBoard events to disk.
    tbwriter.close()

    # Make sure the output directory exists so a long run is not lost to a
    # missing folder at the very end.
    os.makedirs('./save', exist_ok=True)
    npLOSS = np.array(trainLOSS)
    npVALACC = np.array(valACC)
    np.save('./save/loss_epoch_{}'.format(epoch), npLOSS)
    np.save('./save/valacc_epoch_{}'.format(epoch), npVALACC)

    print('Finished Training')
Example #7
0
            # Move the validation batch to the GPU when `cuda` is set.
            if cuda:
                xv, yv = xv.cuda(), yv.cuda()
            v_feature, v_score, v_pred = model.forward(xv)
            # Predicted label = index of the largest score along dim 1.
            v_pred_label = torch.max(v_score, 1)[1]
            # 1.0 where the prediction matches the label, 0.0 otherwise.
            v_equal = torch.eq(v_pred_label, yv).float()
            # Add each sample's hit/miss into the slot indexed by its label.
            zeros = zeros.scatter_add(0, yv, v_equal)
            # Add 1 per sample into the slot indexed by its label.
            zeros_classes = zeros_classes.scatter_add(
                0, yv, torch.ones_like(yv, dtype=torch.float))
            v_correct += torch.sum(v_equal).item()
            v_sum += len(yv)
        v_acc = v_correct / v_sum
        # NOTE(review): the format string has two placeholders, so the third
        # argument (`zeros`) is silently ignored here.
        output('validation: {}, {}'.format(v_correct, v_acc, zeros))
        # Both lines below use the label 'class:'; the first prints `zeros`,
        # the second prints `zeros_classes`.
        output('class: {}'.format(zeros.tolist()))
        output('class: {}'.format(zeros_classes.tolist()))
        output('source: {}, target: {}, batch_size: {}, init_lr: {}'.format(
            s_name, t_name, batch_size, init_lr))
        output('lr_mult: {}, lr_mult_D: {}'.format(lr_mult, lr_mult_D))
        output('    =======    START TRAINING    =======    ')

    # save model
    # Checkpoint every 1000th value of `epoch`, skipping 0; the stored
    # 'epoch' is one past the current value.
    if epoch % 1000 == 0 and epoch != 0:
        torch.save(
            {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'opt': opt.state_dict(),
                'opt_D': opt_D.state_dict()
            }, checkpoint_save_path)

    epoch += 1
def main():
    """Train AlexNet on the flower dataset and keep the best-scoring weights.

    Reads ``train`` and ``val`` image folders below a fixed dataset root,
    writes the index-to-class mapping to ``class_idices.json`` and saves the
    weights with the highest validation accuracy to ``./AlexNet.pth``.
    """
    # Select the device to run on.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("using {} device.".format(device))

    # Preprocessing pipelines for the two splits.
    train_pipeline = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    val_pipeline = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    data_transform = {"train": train_pipeline, "val": val_pipeline}

    # Root directory holding the train and val folders.
    image_path = '/home/xulei/数据集大本营/5_flower_data/flower_data'  # flower data root path
    # Abort when the directory is missing.
    assert os.path.exists(image_path), "{} path does not exist.".format(
        image_path)

    # Training split and its size.
    train_root = os.path.join(image_path, "train")
    train_dataset = datasets.ImageFolder(root=train_root,
                                         transform=data_transform["train"])
    train_num = len(train_dataset)

    # class_to_idx maps class name -> index, e.g. {'daisy': 0, ...};
    # invert it to index -> class name, e.g. {0: 'daisy', ...}.
    flower_list = train_dataset.class_to_idx
    cla_dict = {index: name for name, index in flower_list.items()}
    # Serialize with indent=4 so every key sits on its own line, then write
    # the mapping to disk.
    json_str = json.dumps(cla_dict, indent=4)
    with open('class_idices.json', 'w') as json_file:
        json_file.write(json_str)

    batch_size = 128
    # number of workers: capped by the CPU count, the batch size and 8
    batch_cap = batch_size if batch_size > 1 else 0
    nw = min(os.cpu_count(), batch_cap, 8)
    print("using {} dataloader workers every process".format(nw))
    train_loader = datas.DataLoader(train_dataset,
                                    batch_size,
                                    shuffle=True,
                                    num_workers=nw)

    val_root = os.path.join(image_path, "val")
    validate_dataset = datasets.ImageFolder(root=val_root,
                                            transform=data_transform["val"])
    val_num = len(validate_dataset)
    validate_loader = datas.DataLoader(validate_dataset,
                                       batch_size,
                                       shuffle=False,
                                       num_workers=nw)
    print("using {} images for trainning, {} images for validation.".format(
        train_num, val_num))

    net = AlexNet(num_classes=5).to(device)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.00004)

    epoches = 20
    save_path = './AlexNet.pth'
    best_acc = 0.0
    # Number of batches in one pass over the training loader.
    train_steps = len(train_loader)
    for epoch in range(epoches):
        # ---- training pass ----
        net.train()
        running_loss = 0.0
        train_bar = tqdm(train_loader)  # progress bar
        for images, labels in train_bar:
            optimizer.zero_grad()
            outputs = net(images.to(device))
            loss = loss_function(outputs, labels.to(device))
            loss.backward()
            optimizer.step()

            # running statistics and progress-bar caption
            running_loss += loss.item()
            train_bar.desc = "train epoch[{}/{}] loss:{:.3f}".format(
                epoch + 1, epoches, loss)

        # ---- validation pass ----
        net.eval()
        acc = 0.0  # number of correct predictions in this epoch
        with torch.no_grad():
            val_bar = tqdm(validate_loader)
            for val_images, val_labels in val_bar:
                outputs = net(val_images.to(device))
                _, predict_y = torch.max(outputs, dim=1)
                hits = torch.eq(predict_y, val_labels.to(device))
                acc += hits.sum().item()
        val_accurate = acc / val_num
        mean_loss = running_loss / train_steps
        print('\n[epoch %d] train_loss: %.3f val_accuracy: %.3f' %
              (epoch + 1, mean_loss, val_accurate))
        # Keep only the weights of the best epoch so far.
        if val_accurate > best_acc:
            best_acc = val_accurate
            torch.save(net.state_dict(), save_path)
    print("Finshed Training")
# Loss of every training batch, in order, across all epochs.
train_loss = []
for epoch in range(num_epochs):
    for idx, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass and loss.
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())
        # Report progress on every 100th batch only.
        if (idx + 1) % 100 != 0:
            continue
        print("epoch is {}/{} Step is: {}/{} loss is: {}".format(
            epoch, num_epochs, idx, num_batches, loss.item()))

# Evaluate on the test loader without tracking gradients.
model.eval()
with torch.no_grad():
    correct = total = 0
    for idx, (inputs, labels) in enumerate(test_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        preds = model(inputs)
        # `indices` holds the position of the largest score per sample.
        values, indices = torch.max(preds, 1)
        total += labels.size(0)
        correct += indices.eq(labels).sum().item()
    print("Accuracy of the network is: {}%".format(100 * correct / total))

torch.save(model.state_dict(), 'model.pth')
    print()
    # Append the time elapsed since `time_start` to train.log and echo it.
    with open(os.path.join("train.log"), "a") as log:
        log.write(str('%f s' % (time.perf_counter() - time_start)) + "\n")
    print('%f s' % (time.perf_counter() - time_start))

    ########################################### validate ###########################################
    net.eval()  # turn Dropout off during validation
    acc = 0.0
    with torch.no_grad():
        for val_data in validate_loader:
            val_images, val_labels = val_data
            outputs = net(val_images.to(device))
            predict_y = torch.max(outputs,
                                  dim=1)[1]  # index of the largest output value is the predicted label
            acc += (predict_y == val_labels.to(device)).sum().item()
        val_accurate = acc / val_num

        # Keep the network parameters of the most accurate epoch so far.
        if val_accurate > best_acc:
            best_acc = val_accurate
            torch.save(net.state_dict(), save_path)
        # NOTE(review): `step` is not defined in this fragment. If it is the
        # zero-based index of the last training batch, `running_loss / step`
        # divides by one less than the batch count (and by zero for a single
        # batch) - confirm against the training loop.
        with open(os.path.join("train.log"), "a") as log:
            log.write(
                str('[epoch %d] train_loss: %.3f  test_accuracy: %.3f \n' %
                    (epoch + 1, running_loss / step, val_accurate)) + "\n")
        print('[epoch %d] train_loss: %.3f  test_accuracy: %.3f \n' %
              (epoch + 1, running_loss / step, val_accurate))
# Record completion in the log file and on stdout.
with open(os.path.join("train.log"), "a") as log:
    log.write(str('Finished Training') + "\n")
print('Finished Training')
Example #11
0
class Solver(object):
    """Train and evaluate an AlexNet classifier on CIFAR-10.

    Hyper-parameters are read from ``config`` (``lr``, ``epoch``,
    ``trainBatchSize``, ``testBatchSize`` and ``cuda``).  Data loaders, model,
    optimizer, scheduler and loss are created later by :meth:`load_data` and
    :meth:`load_model`; :meth:`run` drives the whole train/test loop.
    """

    def __init__(self, config):
        """Store the hyper-parameters; everything else starts as ``None``."""
        self.model = None
        self.lr = config.lr
        self.epochs = config.epoch
        self.train_batch_size = config.trainBatchSize
        self.test_batch_size = config.testBatchSize
        self.criterion = None
        self.optimizer = None
        self.scheduler = None
        self.device = None
        self.cuda = config.cuda
        self.train_loader = None
        self.test_loader = None
        self.is_board = False

    def load_data(self):
        """Build the CIFAR-10 train/test loaders (downloading if needed)."""
        train_transform = transforms.Compose(
            [transforms.RandomHorizontalFlip(),
             transforms.ToTensor()])
        test_transform = transforms.Compose([transforms.ToTensor()])
        train_set = torchvision.datasets.CIFAR10(
            root='/mnt/disk50/datasets/cifar',
            train=True,
            download=True,
            transform=train_transform)
        self.train_loader = torch.utils.data.DataLoader(
            dataset=train_set, batch_size=self.train_batch_size, shuffle=True)
        test_set = torchvision.datasets.CIFAR10(
            root='/mnt/disk50/datasets/cifar',
            train=False,
            download=True,
            transform=test_transform)
        self.test_loader = torch.utils.data.DataLoader(
            dataset=test_set, batch_size=self.test_batch_size, shuffle=False)

    def load_model_from_pth(self, model_path):
        """Load pre-trained weights into ``self.model``.

        The file at ``model_path`` must hold a mapping whose ``'model'`` entry
        is the state dict.  A ``'module.'`` prefix mismatch between the
        checkpoint keys and the model's parameter names is reconciled before
        loading.

        :param model_path: path of the checkpoint file
        :return: ``self.model`` with the weights loaded
        """
        # ``device_name`` is never set by ``__init__``; fall back to
        # ``self.device`` instead of failing with AttributeError.
        map_location = getattr(self, 'device_name', None)
        if map_location is None:
            map_location = self.device
        checkpoint = torch.load(model_path,
                                map_location=map_location)['model']

        # TODO: understand exactly why the prefixes can differ here
        # (presumably one side was wrapped in a DataParallel-style module).
        checkpoint_parameter_name = list(checkpoint.keys())[0]
        model_parameter_name = next(self.model.named_parameters())[0]

        is_checkpoint = checkpoint_parameter_name.startswith('module.')
        is_model = model_parameter_name.startswith('module.')

        if is_checkpoint and not is_model:
            # Strip the 'module.' prefix from the checkpoint keys.
            new_parameter_check = OrderedDict()
            for key, value in checkpoint.items():
                if key.startswith('module.'):
                    new_parameter_check[key[7:]] = value
            self.model.load_state_dict(new_parameter_check)
        elif not is_checkpoint and is_model:
            # Add the 'module.' prefix to the checkpoint keys.
            new_parameter_dict = OrderedDict()
            for key, value in checkpoint.items():
                if not key.startswith('module.'):
                    key = 'module.' + key
                    new_parameter_dict[key] = value
            # The renamed weights were previously built but never applied.
            self.model.load_state_dict(new_parameter_dict)
        else:
            self.model.load_state_dict(checkpoint)
        return self.model

    def load_model(self):
        """Select the device and create model, optimizer, scheduler, loss."""
        if self.cuda:
            self.device = torch.device('cuda:0')
            cudnn.benchmark = True
        else:
            self.device = torch.device('cpu')

        # self.model = LeNet().to(self.device)
        self.model = AlexNet().to(self.device)

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        # Halve the learning rate at epochs 75 and 150.
        self.scheduler = optim.lr_scheduler.MultiStepLR(self.optimizer,
                                                        milestones=[75, 150],
                                                        gamma=0.5)
        self.criterion = nn.CrossEntropyLoss().to(self.device)

    def train(self, writer=None):
        """Run one training epoch.

        :param writer: accepted for interface compatibility; currently unused
        :return: ``(sum of per-batch losses, fraction of correct predictions)``
        """
        print("train:")
        self.model.train()
        train_loss = 0
        train_correct = 0
        total = 0

        for batch_num, (data, target) in enumerate(self.train_loader):
            data, target = data.to(self.device), target.to(self.device)
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            # Reduce over dimension 1; element [1] holds the argmax indices.
            prediction = torch.max(output, 1)
            total += target.size(0)

            # Count the samples whose predicted class matches the target.
            train_correct += np.sum(
                prediction[1].cpu().numpy() == target.cpu().numpy())

            progress_bar(
                batch_num, len(self.train_loader),
                'Loss: %.4f | Acc: %.3f%% (%d/%d)' %
                (train_loss / (batch_num + 1), 100. * train_correct / total,
                 train_correct, total))

        return train_loss, train_correct / total

    def test(self):
        """Evaluate on the test loader without tracking gradients.

        :return: ``(sum of per-batch losses, fraction of correct predictions,
                 elapsed wall-clock seconds)``
        """
        print("test:")
        self.model.eval()
        test_loss = 0
        test_correct = 0
        total = 0
        start = time.time()
        with torch.no_grad():
            for batch_num, (data, target) in enumerate(self.test_loader):
                data, target = data.to(self.device), target.to(self.device)
                output = self.model(data)
                loss = self.criterion(output, target)
                test_loss += loss.item()
                prediction = torch.max(output, 1)
                total += target.size(0)
                test_correct += np.sum(
                    prediction[1].cpu().numpy() == target.cpu().numpy())

                progress_bar(
                    batch_num, len(self.test_loader),
                    'Loss: %.4f | Acc: %.3f%% (%d/%d)' %
                    (test_loss / (batch_num + 1), 100. * test_correct / total,
                     test_correct, total))
        end = time.time()
        time_used = end - start

        return test_loss, test_correct / total, time_used

    def save(self):
        """Write the model's state dict to ``./best_model_new.pkl``."""
        model_out_path = "./best_model_new.pkl"
        torch.save(self.model.state_dict(), model_out_path)
        print("Checkpoint saved to {}".format(model_out_path))

    def run(self):
        """Load data and model, then train/test for ``self.epochs`` epochs.

        The checkpoint is written whenever the test accuracy improves.
        """
        self.load_data()
        self.load_model()
        accuracy = 0
        # NOTE(review): the writer is created but nothing is logged to it yet.
        writer = SummaryWriter()
        for epoch in range(1, self.epochs + 1):
            # NOTE(review): recent PyTorch versions expect scheduler.step()
            # after the optimizer steps and without an epoch argument; the
            # call is kept as-is so the learning-rate schedule is unchanged.
            self.scheduler.step(epoch)
            print("\n===> epoch: %d/%d" % (epoch, self.epochs))

            train_loss, train_acc = self.train()
            # test() returns (loss, accuracy, elapsed seconds).
            test_loss, test_acc, _ = self.test()

            if test_acc > accuracy:
                accuracy = test_acc
                self.save()
            elif epoch == self.epochs:
                # NOTE(review): this re-saves the final weights to the same
                # path as the best checkpoint -- confirm that is intended.
                print("===> BEST ACC. PERFORMANCE: %.3f%%" % (accuracy * 100))
                self.save()
def main():
    """Train AlexNet on the 5-class flower data set and keep the best weights.

    Reads images from ``<cwd>/data_set/flower_data/{train,val}``, writes the
    index-to-class mapping to ``class_indices.json``, trains for a fixed
    number of epochs and saves the state dict to ``weights/alexnet.pth``
    whenever the validation accuracy improves.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("using {} device.".format(device))

    batch_size = 16
    epochs = 20

    data_transform = {
        "train":
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        "val":
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    }

    data_root = os.path.abspath(os.path.join(os.getcwd(),
                                             "."))  # get data root path
    image_path = os.path.join(data_root, "data_set",
                              "flower_data")  # flower data set path
    assert os.path.exists(image_path), "{} path does not exist.".format(
        image_path)
    train_dataset = datasets.ImageFolder(root=os.path.join(
        image_path, "train"),
                                         transform=data_transform["train"])
    train_num = len(train_dataset)

    # class_to_idx looks like
    # {'daisy':0, 'dandelion':1, 'roses':2, 'sunflower':3, 'tulips':4};
    # invert it so a predicted index can be mapped back to its class name.
    flower_list = train_dataset.class_to_idx
    cla_dict = dict((val, key) for key, val in flower_list.items())
    # write dict into json file
    json_str = json.dumps(cla_dict, indent=4)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)

    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    print('Using {} dataloader workers every process'.format(nw))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=nw)

    validate_dataset = datasets.ImageFolder(root=os.path.join(
        image_path, "val"),
                                            transform=data_transform["val"])
    val_num = len(validate_dataset)
    validate_loader = torch.utils.data.DataLoader(validate_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  num_workers=nw)

    print("using {} images for training, {} images for validation.".format(
        train_num, val_num))

    # create model
    net = AlexNet(num_classes=5)

    net.to(device)

    # define loss function
    loss_function = nn.CrossEntropyLoss()

    # construct an optimizer over the trainable parameters only
    params = [p for p in net.parameters() if p.requires_grad]
    optimizer = optim.Adam(params, lr=0.0001)

    best_acc = 0.0
    save_path = 'weights/alexnet.pth'
    # torch.save does not create missing parent directories, so make sure
    # 'weights/' exists before the first checkpoint is written.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    train_steps = len(train_loader)
    for epoch in range(epochs):
        # train
        net.train()
        running_loss = 0.0
        train_bar = tqdm(train_loader)
        for step, data in enumerate(train_bar):
            images, labels = data
            optimizer.zero_grad()
            logits = net(images.to(device))
            loss = loss_function(logits, labels.to(device))
            loss.backward()
            optimizer.step()

            # accumulate the batch loss for the epoch average
            running_loss += loss.item()

            train_bar.desc = "train epoch[{}/{}] loss:{:.3f}".format(
                epoch + 1, epochs, loss)

        # validate
        net.eval()
        acc = 0.0  # accumulate accurate number / epoch
        with torch.no_grad():
            val_bar = tqdm(validate_loader)
            for val_data in val_bar:
                val_images, val_labels = val_data
                outputs = net(val_images.to(device))
                # index of the largest logit is the predicted class
                predict_y = torch.max(outputs, dim=1)[1]
                acc += torch.eq(predict_y, val_labels.to(device)).sum().item()

                val_bar.desc = "valid epoch[{}/{}]".format(epoch + 1, epochs)
        val_accurate = acc / val_num
        print('[epoch %d] train_loss: %.3f  val_accuracy: %.3f' %
              (epoch + 1, running_loss / train_steps, val_accurate))

        # keep only the weights with the best validation accuracy so far
        if val_accurate > best_acc:
            best_acc = val_accurate
            torch.save(net.state_dict(), save_path)

    print('Finished Training')
Example #13
0
File: main.py Project: yldang/MLPJ

if __name__ == "__main__":
    # init seed value
    seed = torch.initial_seed()

    # TensorboardX
    tbwriter = SummaryWriter(log_dir=LOG_DIR)
    print("TensorboardX summary writer created")

    # create model
    alexnet = AlexNet(num_classes=NUM_CLASSES)

    # load pretrained model
    if pretrained:
        alexnet_dict = alexnet.state_dict()
        # print(alexnet_dict.keys())
        alexnet_pretrained = models.alexnet(pretrained=True)
        pretrained_dict = alexnet_pretrained.state_dict()
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in alexnet_dict}
        # print(pretrained_dict.keys())
        pretrained_dict.pop("classifier.6.weight")
        pretrained_dict.pop("classifier.6.bias")
        alexnet_dict.update(pretrained_dict)
        alexnet.load_state_dict(alexnet_dict)
        # print(alexnet_dict.keys())
        print("Load from pretrained")

    # Freeze parameter
    if freeze_layer:
        for name, value in alexnet.named_parameters():
Example #14
0
def train():
    """Train AlexNet for ``opt.epochs`` epochs, saving weights every epoch.

    Uses the module-level ``opt``, ``device``, ``train_dataloader`` and
    ``MODEL_PATH``.  An existing file at ``MODEL_PATH`` is loaded first so
    training resumes from it.
    """
    # exist_ok tolerates an already-present directory while still surfacing
    # real failures (e.g. permission errors) instead of swallowing them.
    os.makedirs(opt.checkpoints_dir, exist_ok=True)
    if torch.cuda.device_count() > 1:
        model = torch.nn.parallel.DataParallel(
            AlexNet(num_classes=opt.num_classes))
    else:
        model = AlexNet(num_classes=opt.num_classes)
    if os.path.exists(MODEL_PATH):
        # map_location keeps the loaded tensors on the CPU; the model is
        # moved to ``device`` right after.
        model.load_state_dict(
            torch.load(MODEL_PATH, map_location=lambda storage, loc: storage))
    model.to(device)
    ################################################
    # Set loss function and Adam optimizer
    ################################################
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=opt.lr)

    for epoch in range(opt.epochs):
        # train for one epoch
        print(f"\nBegin Training Epoch {epoch + 1}")
        # Running meters for the loss and the top-k accuracies
        # so that we can track the learning process.
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()

        for i, data in enumerate(train_dataloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, targets = data
            inputs = inputs.to(device)
            targets = targets.to(device)

            # compute output
            output = model(inputs)
            loss = criterion(output, targets)

            # measure accuracy and record loss
            # NOTE(review): the second value is requested with k=2 although
            # it is stored and printed as "Prec@5" -- confirm which is meant.
            prec1, prec5 = accuracy(output, targets, topk=(1, 2))
            losses.update(loss.item(), inputs.size(0))
            top1.update(prec1, inputs.size(0))
            top5.update(prec5, inputs.size(0))

            # compute gradients in a backward pass
            optimizer.zero_grad()
            loss.backward()

            # Call step of optimizer to update model params
            optimizer.step()

            # "\r" rewrites the same console line for every batch
            print(
                f"Epoch [{epoch + 1}] [{i + 1}/{len(train_dataloader)}]\t"
                f"Loss {loss.item():.4f}\t"
                f"Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t"
                f"Prec@5 {top5.val:.3f} ({top5.avg:.3f})",
                end="\r")

        # save model file
        torch.save(model.state_dict(), MODEL_PATH)
def main():
    """Train AlexNet on the flower data set for 10 epochs.

    Reads images from ``<cwd>/flower_data/{train,val}``, writes the
    index-to-class mapping to ``class_indices.json`` and saves the state dict
    to ``./AlexNet.pth`` whenever the validation accuracy improves.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)

    data_transform = {
        "train":
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ]),
        "val":
        transforms.Compose([
            transforms.Resize((224, 224)),  # cannot 224, must (224, 224)
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
    }

    data_root = os.path.abspath(os.path.join(os.getcwd(), "./"))
    image_path = os.path.join(data_root, "flower_data")
    train_dataset = datasets.ImageFolder(root=os.path.join(image_path,
                                                           "train"),
                                         transform=data_transform['train'])
    train_num = len(train_dataset)

    # class_to_idx looks like
    # {'daisy':0, 'dandelion':1, 'roses':2, 'sunflower':3, 'tulips':4};
    # invert it so a predicted index can be mapped back to its class name.
    flower_list = train_dataset.class_to_idx
    cla_dict = dict((val, key) for key, val in flower_list.items())
    # write dict into json file
    json_str = json.dumps(cla_dict, indent=4)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)

    batch_size = 8
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=0)

    validate_dataset = datasets.ImageFolder(root=os.path.join(image_path,
                                                              "val"),
                                            transform=data_transform["val"])
    val_num = len(validate_dataset)
    validate_loader = torch.utils.data.DataLoader(validate_dataset,
                                                  batch_size=4,
                                                  shuffle=True,
                                                  num_workers=0)

    net = AlexNet(num_class=5)
    print(net)
    net.to(device)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.0002)

    save_path = './AlexNet.pth'
    best_acc = 0.0
    # Number of batches per epoch; used to average the accumulated loss.
    train_steps = len(train_loader)
    for epoch in range(10):
        # train
        net.train()
        running_loss = 0.0
        t1 = time.perf_counter()
        for step, data in enumerate(train_loader, start=0):
            images, labels = data
            optimizer.zero_grad()
            outputs = net(images.to(device))
            loss = loss_function(outputs, labels.to(device))
            loss.backward()
            optimizer.step()

            # accumulate the batch loss for the epoch average
            running_loss += loss.item()
            # draw a 50-character text progress bar on a single console line
            rate = (step + 1) / train_steps
            a = "*" * int(rate * 50)
            b = "." * int((1 - rate) * 50)
            print("\rtrain loss: {:^3.0f}%[{}->{}]{:.3f}".format(
                int(rate * 100), a, b, loss),
                  end="")
        print()
        print(time.perf_counter() - t1)  # seconds spent training this epoch

        # validate
        net.eval()
        acc = 0.0  # accumulate accurate number / epoch

        # no gradients are computed during validation
        with torch.no_grad():
            for val_data in validate_loader:
                val_images, val_labels = val_data
                outputs = net(val_images.to(device))
                predict_y = torch.max(outputs, dim=1)[1]
                acc += (predict_y == val_labels.to(device)).sum().item()
            val_accurate = acc / val_num
            if val_accurate > best_acc:
                best_acc = val_accurate
                torch.save(net.state_dict(), save_path)
            # Average over the batch count: the last loop index is one less
            # than that and is zero when the loader yields a single batch.
            print('[epoch %d] train_loss: %.3f  test_accuracy: %.3f' %
                  (epoch + 1, running_loss / train_steps, val_accurate))

    print('Finished Training')
Example #16
0
def main():
    """Train AlexNet on the 5-class flower data set and keep the best weights.

    Images are read from ``<cwd>/../../data_set/flower_data/{train,val}``.
    The index-to-class mapping is written to ``class_indices.json`` and the
    state dict with the best validation accuracy to ``./AlexNet.pth``.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("using {} device.".format(device))

    # Both pipelines normalise every channel with mean 0.5 and std 0.5.
    channel_mean = (0.5, 0.5, 0.5)
    channel_std = (0.5, 0.5, 0.5)
    train_pipeline = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])
    val_pipeline = transforms.Compose([
        # the size must be the pair (224, 224), not a bare 224
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])
    data_transform = {"train": train_pipeline, "val": val_pipeline}

    # The data set lives two directory levels above the working directory.
    data_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
    image_path = os.path.join(data_root, "data_set", "flower_data")
    assert os.path.exists(image_path), "{} path does not exist.".format(
        image_path)

    train_dir = os.path.join(image_path, "train")
    train_dataset = datasets.ImageFolder(root=train_dir,
                                         transform=data_transform["train"])
    train_num = len(train_dataset)

    # class_to_idx maps name -> index, e.g. {'daisy': 0, 'dandelion': 1, ...};
    # store the inverse mapping as JSON.
    cla_dict = {
        index: name
        for name, index in train_dataset.class_to_idx.items()
    }
    json_str = json.dumps(cla_dict, indent=4)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)

    batch_size = 32
    # Worker count is capped by the CPU count, the batch size and 8.
    nw = min(os.cpu_count(), batch_size if batch_size > 1 else 0, 8)
    print('Using {} dataloader workers every process'.format(nw))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=nw)

    val_dir = os.path.join(image_path, "val")
    validate_dataset = datasets.ImageFolder(root=val_dir,
                                            transform=data_transform["val"])
    val_num = len(validate_dataset)
    validate_loader = torch.utils.data.DataLoader(validate_dataset,
                                                  batch_size=4,
                                                  shuffle=False,
                                                  num_workers=nw)

    print("using {} images for training, {} images for validation.".format(
        train_num, val_num))

    net = AlexNet(num_classes=5, init_weights=True)
    net.to(device)

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.0002)

    epochs = 10
    save_path = './AlexNet.pth'
    best_acc = 0.0
    train_steps = len(train_loader)

    for epoch in range(epochs):
        # ---- training pass ----
        net.train()
        running_loss = 0.0
        train_bar = tqdm(train_loader)
        for images, labels in train_bar:
            optimizer.zero_grad()
            batch_loss = loss_function(net(images.to(device)),
                                       labels.to(device))
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()
            train_bar.desc = "train epoch[{}/{}] loss:{:.3f}".format(
                epoch + 1, epochs, batch_loss)

        # ---- validation pass ----
        net.eval()
        acc = 0.0  # number of correct predictions in this epoch
        with torch.no_grad():
            for val_images, val_labels in tqdm(validate_loader):
                outputs = net(val_images.to(device))
                _, predict_y = torch.max(outputs, dim=1)
                acc += (predict_y == val_labels.to(device)).sum().item()

        val_accurate = acc / val_num
        print('[epoch %d] train_loss: %.3f  val_accuracy: %.3f' %
              (epoch + 1, running_loss / train_steps, val_accurate))

        # Only overwrite the checkpoint when validation accuracy improves.
        if val_accurate > best_acc:
            best_acc = val_accurate
            torch.save(net.state_dict(), save_path)

    print('Finished Training')
Example #17
0
def main():
    """Train AlexNet on the 5-class flower data set and keep the best weights.

    Images are read from ``<cwd>/data_set/flower_data/{train,val}``.  The
    index-to-class mapping is written to ``class_indices.json`` and the state
    dict with the best validation accuracy to ``./AlexNet.pth``.
    """
    # Use the first GPU when one is available, otherwise the CPU.
    use_gpu = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_gpu else "cpu")
    print("using {} device.".format(device))

    # Pre-processing: random crop + flip for training, plain resize for
    # validation; both end with the same per-channel normalisation.
    norm_args = ((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    data_transform = {
        "train": transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(*norm_args),
        ]),
        "val": transforms.Compose([
            transforms.Resize((224, 224)),  # must be the pair, not a bare 224
            transforms.ToTensor(),
            transforms.Normalize(*norm_args),
        ]),
    }

    # os.getcwd() is the process working directory, not this file's folder;
    # the data set is looked up relative to it.
    print(os.getcwd())
    data_root = os.path.abspath(os.getcwd())
    image_path = os.path.join(data_root, "data_set", "flower_data")

    train_dataset = datasets.ImageFolder(
        root=os.path.join(image_path, "train"),
        transform=data_transform["train"])
    train_num = len(train_dataset)

    # class_to_idx maps name -> index, e.g. {'daisy': 0, 'dandelion': 1, ...};
    # swap keys and values and dump the result as JSON.
    name_to_index = train_dataset.class_to_idx
    cla_dict = {idx: name for name, idx in name_to_index.items()}
    json_str = json.dumps(cla_dict, indent=4)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)

    batch_size = 32
    worker_caps = [os.cpu_count(), batch_size if batch_size > 1 else 0, 8]
    nw = min(worker_caps)  # number of loader worker processes
    print('Using {} dataloader workers every process'.format(nw))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=nw)

    validate_dataset = datasets.ImageFolder(
        root=os.path.join(image_path, "val"),
        transform=data_transform["val"])
    val_num = len(validate_dataset)
    validate_loader = torch.utils.data.DataLoader(
        validate_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=nw)

    print("using {} images for training, {} images for validation.".format(
        train_num, val_num))

    # Model with five output classes, placed on the selected device.
    net = AlexNet(num_classes=5, init_weights=True)
    net.to(device)

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.0002)

    epochs = 10
    save_path = './AlexNet.pth'  # where the best weights are stored
    best_acc = 0.0  # best validation accuracy seen so far
    train_steps = len(train_loader)

    for epoch in range(epochs):
        epoch_no = epoch + 1

        # Training phase: train() enables training-only behaviour
        # such as dropout.
        net.train()
        running_loss = 0.0
        train_bar = tqdm(train_loader)
        for images, labels in train_bar:
            optimizer.zero_grad()  # clear gradients from the previous step
            outputs = net(images.to(device))
            loss = loss_function(outputs, labels.to(device))
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            train_bar.desc = "train epoch[{}/{}] loss:{:.3f}".format(
                epoch_no, epochs, loss)

        # Validation phase: eval() switches that behaviour off again.
        net.eval()
        acc = 0.0  # number of correct predictions in this epoch
        with torch.no_grad():  # no gradient tracking while validating
            val_bar = tqdm(validate_loader)
            for val_images, val_labels in val_bar:
                outputs = net(val_images.to(device))
                # keep only the argmax along the class dimension
                predict_y = torch.max(outputs, dim=1)[1]
                matches = predict_y == val_labels.to(device)
                acc += matches.sum().item()

        val_accurate = acc / val_num  # fraction of correct predictions
        print('[epoch %d] train_loss: %.3f  val_accuracy: %.3f' %
              (epoch_no, running_loss / train_steps, val_accurate))

        # Save the weights only when they beat the best accuracy so far.
        if val_accurate > best_acc:
            best_acc = val_accurate
            torch.save(net.state_dict(), save_path)

    print('Finished Training')