def main(args):
    """Create the TensorBoard writer, weight directory, datasets and loaders.

    Args:
        args: Parsed options; this block reads ``device``, ``data_path`` and
            ``batch_size``.
    """
    # Use the requested device only when CUDA is present; otherwise the CPU.
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")

    print(args)
    print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
    tb_writer = SummaryWriter()

    if not os.path.exists("./weights"):
        os.makedirs("./weights")

    (train_images_path, train_images_label,
     val_images_path, val_images_label) = read_split_data(args.data_path)

    # Input resolution keyed by model variant; only "B0" is selected here.
    img_size = {"B0": 224, "B1": 240, "B2": 260, "B3": 300,
                "B4": 380, "B5": 456, "B6": 528, "B7": 600}
    num_model = "B0"
    side = img_size[num_model]

    train_tf = transforms.Compose([
        transforms.RandomResizedCrop(side),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    val_tf = transforms.Compose([
        transforms.Resize(side),
        transforms.CenterCrop(side),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    data_transform = {"train": train_tf, "val": val_tf}

    # Training dataset.
    train_data_set = MyDataSet(images_path=train_images_path,
                               images_class=train_images_label,
                               transform=data_transform["train"])
    # Validation dataset.
    val_data_set = MyDataSet(images_path=val_images_path,
                             images_class=val_images_label,
                             transform=data_transform["val"])

    batch_size = args.batch_size
    # Worker count: capped by CPU count, batch size and 8; a batch size of 1
    # disables worker processes.
    batch_cap = batch_size if batch_size > 1 else 0
    nw = min(os.cpu_count(), batch_cap, 8)
    print('Using {} dataloader workers every process'.format(nw))

    shared_kwargs = dict(batch_size=batch_size, pin_memory=True, num_workers=nw)
    train_loader = torch.utils.data.DataLoader(
        train_data_set,
        shuffle=True,
        collate_fn=train_data_set.collate_fn,
        **shared_kwargs)
    val_loader = torch.utils.data.DataLoader(
        val_data_set,
        shuffle=False,
        collate_fn=val_data_set.collate_fn,
        **shared_kwargs)
def main():
    """Build the training loader and iterate over it once as a smoke test.

    Reads the module-level name ``root`` as the dataset location.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("using {} device.".format(device))

    (train_images_path, train_images_label,
     val_images_path, val_images_label) = read_split_data(root)

    train_tf = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    val_tf = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    data_transform = {"train": train_tf, "val": val_tf}

    train_data_set = MyDataSet(images_path=train_images_path,
                               images_class=train_images_label,
                               transform=data_transform["train"])

    batch_size = 8
    # Worker count: capped by CPU count, batch size and 8.
    batch_cap = batch_size if batch_size > 1 else 0
    nw = min(os.cpu_count(), batch_cap, 8)
    print('Using {} dataloader workers'.format(nw))

    train_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=nw,
        collate_fn=train_data_set.collate_fn)

    # Optional visual check (disabled): plot_data_loader_image(train_loader)
    # Drain the loader so every batch is produced and unpacked once.
    for step, batch in enumerate(train_loader):
        images, labels = batch
def extract_logits():
    """Load the pretrained resnet50 and run ``validate`` over the 'val' dataset.

    Parses command-line options into the module-level ``args`` and reads
    model/dataset settings from the module-level ``FLAGS`` mapping.
    """
    global args, best_prec1
    args = parser.parse_args()

    # Distributed mode is enabled whenever more than one process is requested.
    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # Build the model from the pretrained checkpoint.
    # Original note on the checkpoint: 78.771% (presumably its accuracy).
    model_weight = torch.load(FLAGS['pretrain_model'])
    model = resnet50(model_weight, num_classes=FLAGS['class_num']).cuda()
    model.eval()
    probe = torch.zeros((1, 3, 224, 224)).cuda()
    summary(model, probe)
    # test_accuracy(model)  (disabled)

    cudnn.benchmark = True

    # Copy the dataset's sample list into the working directory.
    source_list = os.path.join(FLAGS['6x_larger_dataset'], 'data_list.npy')
    data_list = np.load(source_list)
    np.save('data_list.npy', data_list)

    train_tf = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    val_tf = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    data_transforms = {'train': train_tf, 'val': val_tf}

    print("| Preparing model...")
    dsets = {'val': MyDataSet(data_transforms['val'])}
    # The evaluation batch is four times the configured batch size.
    eval_batch = 4 * FLAGS['batch_size']
    val_loader = torch.utils.data.DataLoader(dsets['val'],
                                             batch_size=eval_batch,
                                             shuffle=False,
                                             num_workers=8,
                                             pin_memory=True)
    print('data_loader_success!')

    validate(val_loader, model)
def main(args):
    """Evaluate a trained model on the validation split and report a confusion matrix.

    Args:
        args: Parsed options; reads ``device``, ``data_path``, ``batch_size``,
            ``num_classes`` and ``weights``.

    Raises:
        AssertionError: If the weights file or ``./class_indices.json`` is missing.
    """
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    print(f"using device: {device}")

    _, _, val_images_path, val_images_label = read_split_data(args.data_path)

    img_size = 384
    data_transform = {
        "val": transforms.Compose([
            # Resize slightly larger than the crop, then take the centre.
            transforms.Resize(int(img_size * 1.143)),
            transforms.CenterCrop(img_size),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ])
    }

    # Validation dataset.
    val_dataset = MyDataSet(images_path=val_images_path,
                            images_class=val_images_label,
                            transform=data_transform["val"])

    # Worker count: capped by CPU count, batch size and 8.
    nw = min([os.cpu_count(), args.batch_size if args.batch_size > 1 else 0, 8])
    print('Using {} dataloader workers every process'.format(nw))

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=nw,
                                             collate_fn=val_dataset.collate_fn)

    model = create_model(num_classes=args.num_classes)
    # Load the trained weights.
    assert os.path.exists(args.weights), "cannot find {} file".format(args.weights)
    model.load_state_dict(torch.load(args.weights, map_location=device))
    model.to(device)

    # Read the class-index mapping; the context manager closes the file
    # (the handle was previously left open).
    json_label_path = './class_indices.json'
    assert os.path.exists(json_label_path), "cannot find {} file".format(json_label_path)
    with open(json_label_path, 'r') as json_file:
        class_indict = json.load(json_file)

    labels = list(class_indict.values())
    confusion = ConfusionMatrix(num_classes=args.num_classes, labels=labels)

    model.eval()
    with torch.no_grad():
        for val_images, val_labels in tqdm(val_loader, file=sys.stdout):
            outputs = model(val_images.to(device))
            outputs = torch.softmax(outputs, dim=1)
            outputs = torch.argmax(outputs, dim=1)
            confusion.update(outputs.to("cpu").numpy(), val_labels.to("cpu").numpy())

    confusion.plot()
    confusion.summary()
def main():
    """Load the pretrained MobileNetV2, test it, then validate on the proxy dataset.

    Parses command-line options into the module-level ``args`` and reads
    model/dataset settings from the module-level ``FLAGS`` mapping.
    """
    global args, best_prec1
    args = parser.parse_args()

    # Build the model from the pretrained checkpoint.
    # Original note on the checkpoint: 78.77% (presumably its accuracy).
    model_weight = torch.load(FLAGS['pretrain_model'])
    model = MobileNetV2(model_weight, num_classes=FLAGS['class_num'])
    model = model.cuda()
    cudnn.benchmark = True

    test_accuracy(model, FLAGS)

    train_tf = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    val_tf = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    data_transforms = {'train': train_tf, 'val': val_tf}

    print("| Preparing model...")
    # The dataset uses the deterministic 'val' transform, and the loader
    # takes the whole proxy set as a single batch.
    proxy_size = FLAGS['proxy_dataset_size']
    dset_train = MyDataSet(data_transforms['val'], proxy_size)
    train_loader = torch.utils.data.DataLoader(dset_train,
                                               batch_size=FLAGS['proxy_dataset_size'],
                                               shuffle=False,
                                               num_workers=8,
                                               pin_memory=True)
    print('data_loader_success!')

    # Evaluate on the proxy loader.
    validate(train_loader, model)
def main(args):
    """Train and validate a classifier, logging to TensorBoard and saving weights per epoch.

    Args:
        args: Parsed options; reads ``device``, ``data_path``, ``batch_size``,
            ``num_classes``, ``weights``, ``freeze_layers``, ``lr``, ``lrf``
            and ``epochs``.
    """
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")

    print(args)
    print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
    tb_writer = SummaryWriter()
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("./weights", exist_ok=True)

    train_images_path, train_images_label, val_images_path, val_images_label = \
        read_split_data(args.data_path)

    # Input resolution keyed by model variant; only "B0" is selected here.
    img_size = {"B0": 224, "B1": 240, "B2": 260, "B3": 300,
                "B4": 380, "B5": 456, "B6": 528, "B7": 600}
    num_model = "B0"

    data_transform = {
        "train": transforms.Compose([
            transforms.RandomResizedCrop(img_size[num_model]),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]),
        "val": transforms.Compose([
            transforms.Resize(img_size[num_model]),
            transforms.CenterCrop(img_size[num_model]),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]),
    }

    # Training dataset.
    train_data_set = MyDataSet(images_path=train_images_path,
                               images_class=train_images_label,
                               transform=data_transform["train"])
    # Validation dataset.
    val_data_set = MyDataSet(images_path=val_images_path,
                             images_class=val_images_label,
                             transform=data_transform["val"])

    batch_size = args.batch_size
    # Worker count: capped by CPU count, batch size and 8.
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])
    print('Using {} dataloader workers every process'.format(nw))

    train_loader = torch.utils.data.DataLoader(train_data_set,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               pin_memory=True,
                                               num_workers=nw,
                                               collate_fn=train_data_set.collate_fn)
    val_loader = torch.utils.data.DataLoader(val_data_set,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=nw,
                                             collate_fn=val_data_set.collate_fn)

    # Load pretrained weights when the file exists.
    model = create_model(num_classes=args.num_classes).to(device)
    if os.path.exists(args.weights):
        weights_dict = torch.load(args.weights, map_location=device)
        # Snapshot the state dict once. Keep only tensors whose key exists in
        # the model and whose element count matches; unknown keys are skipped
        # instead of raising KeyError, in line with strict=False below.
        model_state = model.state_dict()
        load_weights_dict = {
            k: v for k, v in weights_dict.items()
            if k in model_state and model_state[k].numel() == v.numel()
        }
        print(model.load_state_dict(load_weights_dict, strict=False))

    # Optionally freeze everything except the last conv block and the classifier.
    if args.freeze_layers:
        for name, para in model.named_parameters():
            if ("features.top" not in name) and ("classifier" not in name):
                para.requires_grad_(False)
            else:
                print("training {}".format(name))

    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg, lr=args.lr, momentum=0.9, weight_decay=1E-4)

    # Cosine schedule, see https://arxiv.org/pdf/1812.01187.pdf
    def lf(x):
        """Return the LR multiplier for epoch ``x``, decaying from 1 to ``args.lrf``."""
        return ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (1 - args.lrf) + args.lrf

    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    for epoch in range(args.epochs):
        # Train.
        mean_loss = train_one_epoch(model=model,
                                    optimizer=optimizer,
                                    data_loader=train_loader,
                                    device=device,
                                    epoch=epoch)
        scheduler.step()

        # Validate: evaluate() returns a count that is divided by the
        # validation set size to give accuracy.
        sum_num = evaluate(model=model, data_loader=val_loader, device=device)
        acc = sum_num / len(val_data_set)
        print("[epoch {}] accuracy: {}".format(epoch, round(acc, 3)))

        tags = ["loss", "accuracy", "learning_rate"]
        tb_writer.add_scalar(tags[0], mean_loss, epoch)
        tb_writer.add_scalar(tags[1], acc, epoch)
        tb_writer.add_scalar(tags[2], optimizer.param_groups[0]["lr"], epoch)

        torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))
def main(args):
    """Train shufflenet_v2_x1_0 on the CSV-described dataset and save weights per epoch.

    Args:
        args: Parsed options; reads ``device``, ``data_path``, ``batch_size``,
            ``num_classes``, ``lr``, ``lrf`` and ``epochs``.

    Raises:
        ValueError: If ``args.num_classes`` differs from the number of labels
            in the training dataset.
    """
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")

    print(args)
    print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
    tb_writer = SummaryWriter()

    if not os.path.exists("./weights"):
        os.makedirs("./weights")

    train_tf = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    val_tf = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    data_transform = {"train": train_tf, "val": val_tf}

    data_root = args.data_path
    json_path = "./classes_name.json"

    # Training dataset.
    train_dataset = MyDataSet(root_dir=data_root,
                              csv_name="new_train.csv",
                              json_path=json_path,
                              transform=data_transform["train"])

    # The requested class count must match what the dataset provides.
    found_classes = len(train_dataset.labels)
    if args.num_classes != found_classes:
        raise ValueError("dataset have {} classes, but input {}".format(
            len(train_dataset.labels), args.num_classes))

    # Validation dataset.
    val_dataset = MyDataSet(root_dir=data_root,
                            csv_name="new_val.csv",
                            json_path=json_path,
                            transform=data_transform["val"])

    batch_size = args.batch_size
    # Worker count: capped by CPU count, batch size and 8.
    batch_cap = batch_size if batch_size > 1 else 0
    nw = min(os.cpu_count(), batch_cap, 8)
    print('Using {} dataloader workers every process'.format(nw))

    shared_kwargs = dict(batch_size=batch_size, pin_memory=True, num_workers=nw)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               shuffle=True,
                                               collate_fn=train_dataset.collate_fn,
                                               **shared_kwargs)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             shuffle=False,
                                             collate_fn=val_dataset.collate_fn,
                                             **shared_kwargs)

    # Create the model.
    model = shufflenet_v2_x1_0(num_classes=args.num_classes).to(device)

    # Loading pretrained weights and freezing layers are disabled in this
    # script: neither args.weights nor args.freeze_layers is consulted.

    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(trainable, lr=args.lr, momentum=0.9, weight_decay=4E-5)

    # Cosine schedule, see https://arxiv.org/pdf/1812.01187.pdf
    def lf(x):
        """Return the LR multiplier for epoch ``x``, decaying from 1 to ``args.lrf``."""
        return ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (1 - args.lrf) + args.lrf

    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    for epoch in range(args.epochs):
        # Train.
        mean_loss = train_one_epoch(model=model,
                                    optimizer=optimizer,
                                    data_loader=train_loader,
                                    device=device,
                                    epoch=epoch)
        scheduler.step()

        # Validate.
        acc = evaluate(model=model, data_loader=val_loader, device=device)
        print("[epoch {}] accuracy: {}".format(epoch, round(acc, 3)))

        current_lr = optimizer.param_groups[0]["lr"]
        for tag, value in (("loss", mean_loss),
                           ("accuracy", acc),
                           ("learning_rate", current_lr)):
            tb_writer.add_scalar(tag, value, epoch)

        torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))
def main_fun(rank, world_size, args):
    """Per-process entry point for multi-GPU (DDP) training of resnet34.

    Args:
        rank: Index of this process; also used as the GPU index.
        world_size: Total number of processes.
        args: Parsed options; reads ``dist_url``, ``device``, ``batch_size``,
            ``num_classes``, ``weights``, ``lr``, ``lrf``, ``epochs``,
            ``data_path``, ``freeze_layers`` and ``syncBN``. Several fields
            (``rank``, ``world_size``, ``gpu``, ``distributed``,
            ``dist_backend``, ``lr``) are overwritten in place.

    Raises:
        EnvironmentError: If no CUDA device is available.
    """
    if torch.cuda.is_available() is False:
        raise EnvironmentError("not find GPU device for training.")

    # --- initialise the per-process distributed environment ---
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"

    args.rank = rank
    args.world_size = world_size
    args.gpu = rank

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}'.format(args.rank, args.dist_url), flush=True)
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=args.rank)
    dist.barrier()
    # --- end of distributed initialisation ---

    rank = args.rank
    device = torch.device(args.device)
    batch_size = args.batch_size
    num_classes = args.num_classes
    weights_path = args.weights
    # Scale the learning rate by the number of parallel processes.
    args.lr *= args.world_size
    # Path of the temporary initial-weights file; stays None unless this run
    # creates it, so the final cleanup never touches an unbound name.
    checkpoint_path = None

    if rank == 0:  # only the first process prints and owns the TensorBoard writer
        print(args)
        print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
        tb_writer = SummaryWriter()
        os.makedirs("./weights", exist_ok=True)

    train_images_path, train_images_label, val_images_path, val_images_label = \
        read_split_data(args.data_path)

    data_transform = {
        "train": transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]),
        "val": transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]),
    }

    # Training dataset.
    train_data_set = MyDataSet(images_path=train_images_path,
                               images_class=train_images_label,
                               transform=data_transform["train"])
    # Validation dataset.
    val_data_set = MyDataSet(images_path=val_images_path,
                             images_class=val_images_label,
                             transform=data_transform["val"])

    # Assign each rank its share of sample indices.
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data_set)
    val_sampler = torch.utils.data.distributed.DistributedSampler(val_data_set)

    # Group the training indices into lists of batch_size elements.
    train_batch_sampler = torch.utils.data.BatchSampler(train_sampler,
                                                        batch_size,
                                                        drop_last=True)

    # Worker count: capped by CPU count, batch size and 8.
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])
    if rank == 0:
        print('Using {} dataloader workers every process'.format(nw))

    train_loader = torch.utils.data.DataLoader(train_data_set,
                                               batch_sampler=train_batch_sampler,
                                               pin_memory=True,
                                               num_workers=nw,
                                               collate_fn=train_data_set.collate_fn)
    val_loader = torch.utils.data.DataLoader(val_data_set,
                                             batch_size=batch_size,
                                             sampler=val_sampler,
                                             pin_memory=True,
                                             num_workers=nw,
                                             collate_fn=val_data_set.collate_fn)

    # Instantiate the model.
    model = resnet34(num_classes=num_classes).to(device)

    if os.path.exists(weights_path):
        # Load pretrained weights. Keep only tensors whose key exists in the
        # model and whose element count matches; the state dict is
        # snapshotted once rather than rebuilt per key.
        weights_dict = torch.load(weights_path, map_location=device)
        model_state = model.state_dict()
        load_weights_dict = {
            k: v for k, v in weights_dict.items()
            if k in model_state and model_state[k].numel() == v.numel()
        }
        model.load_state_dict(load_weights_dict, strict=False)
    else:
        checkpoint_path = os.path.join(tempfile.gettempdir(), "initial_weights.pt")
        # Without pretrained weights, rank 0 saves its initial weights and
        # every process loads them so all replicas start identical.
        if rank == 0:
            torch.save(model.state_dict(), checkpoint_path)

        dist.barrier()
        # map_location is required here; without it the first GPU takes
        # extra memory.
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    if args.freeze_layers:
        # Freeze everything except the final fully connected layer.
        for name, para in model.named_parameters():
            if "fc" not in name:
                para.requires_grad_(False)
    else:
        # SyncBatchNorm only makes sense when the BN layers are being
        # trained; it also slows training down.
        if args.syncBN:
            model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)

    # Wrap in DDP.
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])

    # Optimizer over the trainable parameters only.
    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg, lr=args.lr, momentum=0.9, weight_decay=0.005)

    # Cosine schedule, see https://arxiv.org/pdf/1812.01187.pdf
    def lf(x):
        """Return the LR multiplier for epoch ``x``, decaying from 1 to ``args.lrf``."""
        return ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (1 - args.lrf) + args.lrf

    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    for epoch in range(args.epochs):
        # Reseed the sampler so each epoch uses a different shuffle.
        train_sampler.set_epoch(epoch)

        mean_loss = train_one_epoch(model=model,
                                    optimizer=optimizer,
                                    data_loader=train_loader,
                                    device=device,
                                    epoch=epoch)
        scheduler.step()

        sum_num = evaluate(model=model, data_loader=val_loader, device=device)
        acc = sum_num / val_sampler.total_size

        if rank == 0:
            print("[epoch {}] accuracy: {}".format(epoch, round(acc, 3)))
            tags = ["loss", "accuracy", "learning_rate"]
            tb_writer.add_scalar(tags[0], mean_loss, epoch)
            tb_writer.add_scalar(tags[1], acc, epoch)
            tb_writer.add_scalar(tags[2], optimizer.param_groups[0]["lr"], epoch)

            # Save the wrapped module's weights.
            torch.save(model.module.state_dict(), "./weights/model-{}.pth".format(epoch))

    # Remove the temporary initial-weights file if this run created one.
    if rank == 0 and checkpoint_path is not None and os.path.exists(checkpoint_path):
        os.remove(checkpoint_path)

    cleanup()
def main(args):
    """Train resnet34 with TensorBoard logging of graph, scalars, figures and histograms.

    Args:
        args: Parsed options; reads ``device``, ``data_path``, ``batch_size``,
            ``num_classes``, ``weights``, ``freeze_layers``, ``lr``, ``lrf``
            and ``epochs``.
    """
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")

    print(args)
    print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
    # TensorBoard writer.
    tb_writer = SummaryWriter(log_dir="runs/flower_experiment")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("./weights", exist_ok=True)

    # Split the data into training and validation sets.
    train_images_path, train_images_label, val_images_path, val_images_label = \
        read_split_data(args.data_path)

    # Preprocessing for training and for prediction.
    data_transform = {
        "train": transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]),
        "val": transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]),
    }

    # Training dataset.
    train_data_set = MyDataSet(images_path=train_images_path,
                               images_class=train_images_label,
                               transform=data_transform["train"])
    # Validation dataset.
    val_data_set = MyDataSet(images_path=val_images_path,
                             images_class=val_images_label,
                             transform=data_transform["val"])

    batch_size = args.batch_size
    # Worker count: capped by CPU count, batch size and 8.
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])
    print('Using {} dataloader workers every process'.format(nw))

    train_loader = torch.utils.data.DataLoader(train_data_set,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               pin_memory=True,
                                               num_workers=nw,
                                               collate_fn=train_data_set.collate_fn)
    val_loader = torch.utils.data.DataLoader(val_data_set,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=nw,
                                             collate_fn=val_data_set.collate_fn)

    # Instantiate the model.
    model = resnet34(num_classes=args.num_classes).to(device)

    # Write the model graph to TensorBoard using a dummy input.
    init_img = torch.zeros((1, 3, 224, 224), device=device)
    tb_writer.add_graph(model, init_img)

    # Load pretrained weights when the file exists.
    if os.path.exists(args.weights):
        weights_dict = torch.load(args.weights, map_location=device)
        # Snapshot the state dict once. Keep only tensors whose key exists in
        # the model and whose element count matches; unknown keys are skipped
        # instead of raising KeyError, in line with strict=False below.
        model_state = model.state_dict()
        load_weights_dict = {
            k: v for k, v in weights_dict.items()
            if k in model_state and model_state[k].numel() == v.numel()
        }
        model.load_state_dict(load_weights_dict, strict=False)
    else:
        print("not using pretrain-weights.")

    # Optionally freeze everything except the final fully connected layer.
    if args.freeze_layers:
        print("freeze layers except fc layer.")
        for name, para in model.named_parameters():
            if "fc" not in name:
                para.requires_grad_(False)

    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg, lr=args.lr, momentum=0.9, weight_decay=0.005)

    # Cosine schedule, see https://arxiv.org/pdf/1812.01187.pdf
    def lf(x):
        """Return the LR multiplier for epoch ``x``, decaying from 1 to ``args.lrf``."""
        return ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (1 - args.lrf) + args.lrf

    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    for epoch in range(args.epochs):
        # Train.
        mean_loss = train_one_epoch(model=model,
                                    optimizer=optimizer,
                                    data_loader=train_loader,
                                    device=device,
                                    epoch=epoch)
        # Update the learning rate.
        scheduler.step()

        # Validate.
        acc = evaluate(model=model, data_loader=val_loader, device=device)

        # Log loss, accuracy and learning rate.
        print("[epoch {}] accuracy: {}".format(epoch, round(acc, 3)))
        tags = ["train_loss", "accuracy", "learning_rate"]
        tb_writer.add_scalar(tags[0], mean_loss, epoch)
        tb_writer.add_scalar(tags[1], acc, epoch)
        tb_writer.add_scalar(tags[2], optimizer.param_groups[0]["lr"], epoch)

        # Log a prediction figure; the helper may return None.
        fig = plot_class_preds(net=model,
                               images_dir="./plot_img",
                               transform=data_transform["val"],
                               num_plot=5,
                               device=device)
        if fig is not None:
            tb_writer.add_figure("predictions vs. actuals",
                                 figure=fig,
                                 global_step=epoch)

        # Log weight histograms of two convolution layers.
        tb_writer.add_histogram(tag="conv1",
                                values=model.conv1.weight,
                                global_step=epoch)
        tb_writer.add_histogram(tag="layer1/block0/conv1",
                                values=model.layer1[0].conv1.weight,
                                global_step=epoch)

        # Save weights.
        torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))
def main(args):
    """Fine-tune a model with AdamW, logging train/val metrics and saving weights per epoch.

    Args:
        args: Parsed options; reads ``device``, ``data_path``, ``batch_size``,
            ``num_classes``, ``weights``, ``freeze_layers``, ``lr`` and
            ``epochs``.

    Raises:
        AssertionError: If ``args.weights`` is non-empty but the file is missing.
    """
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")

    if not os.path.exists("./weights"):
        os.makedirs("./weights")

    tb_writer = SummaryWriter()

    (train_images_path, train_images_label,
     val_images_path, val_images_label) = read_split_data(args.data_path)

    img_size = 224
    train_tf = transforms.Compose([
        transforms.RandomResizedCrop(img_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    val_tf = transforms.Compose([
        # Resize slightly larger than the crop, then take the centre.
        transforms.Resize(int(img_size * 1.143)),
        transforms.CenterCrop(img_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    data_transform = {"train": train_tf, "val": val_tf}

    # Training dataset.
    train_dataset = MyDataSet(images_path=train_images_path,
                              images_class=train_images_label,
                              transform=data_transform["train"])
    # Validation dataset.
    val_dataset = MyDataSet(images_path=val_images_path,
                            images_class=val_images_label,
                            transform=data_transform["val"])

    batch_size = args.batch_size
    # Worker count: capped by CPU count, batch size and 8.
    batch_cap = batch_size if batch_size > 1 else 0
    nw = min(os.cpu_count(), batch_cap, 8)
    print('Using {} dataloader workers every process'.format(nw))

    shared_kwargs = dict(batch_size=batch_size, pin_memory=True, num_workers=nw)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               shuffle=True,
                                               collate_fn=train_dataset.collate_fn,
                                               **shared_kwargs)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             shuffle=False,
                                             collate_fn=val_dataset.collate_fn,
                                             **shared_kwargs)

    model = create_model(num_classes=args.num_classes).to(device)

    if args.weights != "":
        assert os.path.exists(
            args.weights), "weights file: '{}' not exist.".format(args.weights)
        weights_dict = torch.load(args.weights, map_location=device)["model"]
        # Drop the classification-head weights before loading.
        head_keys = [key for key in list(weights_dict.keys()) if "head" in key]
        for key in head_keys:
            del weights_dict[key]
        print(model.load_state_dict(weights_dict, strict=False))

    if args.freeze_layers:
        # Freeze everything except parameters whose name contains "head".
        for name, para in model.named_parameters():
            if "head" in name:
                print("training {}".format(name))
            else:
                para.requires_grad_(False)

    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.AdamW(trainable, lr=args.lr, weight_decay=5E-2)

    for epoch in range(args.epochs):
        # Train.
        train_loss, train_acc = train_one_epoch(model=model,
                                                optimizer=optimizer,
                                                data_loader=train_loader,
                                                device=device,
                                                epoch=epoch)

        # Validate.
        val_loss, val_acc = evaluate(model=model,
                                     data_loader=val_loader,
                                     device=device,
                                     epoch=epoch)

        current_lr = optimizer.param_groups[0]["lr"]
        for tag, value in (("train_loss", train_loss),
                           ("train_acc", train_acc),
                           ("val_loss", val_loss),
                           ("val_acc", val_acc),
                           ("learning_rate", current_lr)):
            tb_writer.add_scalar(tag, value, epoch)

        torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))
def main(args):
    """Run the model over the validation split and record misclassified files.

    One line per wrong prediction is written to ``record.txt`` with the file
    name, the true label and the predicted label.

    Args:
        args: Parsed options; reads ``device``, ``data_path``, ``batch_size``,
            ``num_classes`` and ``weights``.

    Raises:
        AssertionError: If the weights file or ``./class_indices.json`` is missing.
    """
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")

    _, _, val_images_path, val_images_label = read_split_data(args.data_path)

    img_size = 384
    data_transform = {
        "val": transforms.Compose([
            # Resize slightly larger than the crop, then take the centre.
            transforms.Resize(int(img_size * 1.143)),
            transforms.CenterCrop(img_size),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ])
    }

    # Validation dataset.
    val_dataset = MyDataSet(images_path=val_images_path,
                            images_class=val_images_label,
                            transform=data_transform["val"])

    batch_size = args.batch_size
    # Worker count: capped by CPU count, batch size and 8.
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])
    print('Using {} dataloader workers every process'.format(nw))

    # shuffle=False keeps loader order aligned with val_images_path, which
    # the file-name lookup below relies on.
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=nw,
                                             collate_fn=val_dataset.collate_fn)

    model = create_model(num_classes=args.num_classes).to(device)
    assert os.path.exists(
        args.weights), "weights file: '{}' not exist.".format(args.weights)
    model.load_state_dict(torch.load(args.weights, map_location=device))

    # Read the class-index mapping; the context manager closes the file
    # (the handle was previously left open).
    json_path = './class_indices.json'
    assert os.path.exists(json_path), "file: '{}' dose not exist.".format(
        json_path)
    with open(json_path, "r") as json_file:
        class_indict = json.load(json_file)

    model.eval()
    with torch.no_grad(), open("record.txt", "w") as f:
        # Validate.
        data_loader = tqdm(val_loader, file=sys.stdout)
        for step, data in enumerate(data_loader):
            images, labels = data
            pred = model(images.to(device))
            pred_classes = torch.max(pred, dim=1)[1]
            contrast = torch.eq(pred_classes, labels.to(device)).tolist()
            labels = labels.tolist()
            pred_classes = pred_classes.tolist()
            for i, flag in enumerate(contrast):
                if flag is False:
                    # Global sample index = batch offset + position in batch.
                    file_name = val_images_path[batch_size * step + i]
                    true_label = class_indict[str(labels[i])]
                    false_label = class_indict[str(pred_classes[i])]
                    f.write(
                        f"{file_name} TrueLabel:{true_label} PredictLabel:{false_label}\n"
                    )
from torch.utils.data.sampler import SubsetRandomSampler

from model import densenet, resnet
from my_dataset import MyDataSet

# This script requires CUDA. The message now goes in the exception itself;
# the previous `Exception(print(...))` passed print's return value (None)
# as the exception argument.
if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = True
else:
    raise RuntimeError("No CUDA device available!")

# Placeholder locations; replace with real paths before running.
data_path = Path(r"/path/to/data")
metadata_path = Path(r"/path/to/metadata")
model_path = Path(r"/path/to/model")

dataset = MyDataSet(images_dir_path=data_path,
                    csv_file_path=metadata_path,
                    is_train=False)

# 80/20 split by position: the first 80% of indices are the training part,
# the following 20% the validation part. Both sizes are truncated to
# integers, so a trailing sample may belong to neither part.
train_size = int(0.8 * len(dataset))
val_size = int(0.2 * len(dataset))
indices = list(range(len(dataset)))
train_indices = indices[:train_size]
val_indices = indices[train_size:train_size + val_size]

# Only the validation indices are loaded here, one sample per batch.
val_sampler = SubsetRandomSampler(val_indices)
dataloader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=1,
    sampler=val_sampler,
)

# model = resnet.generate_model(model_depth=152, n_input_channels=1, n_classes=3)
def main(args):
    """Train densenet121 with SGD (Nesterov) and a cosine schedule, saving weights per epoch.

    Args:
        args: Parsed options; reads ``device``, ``data_path``, ``batch_size``,
            ``num_classes``, ``weights``, ``freeze_layers``, ``lr``, ``lrf``
            and ``epochs``.

    Raises:
        FileNotFoundError: If ``args.weights`` is non-empty but the file is missing.
    """
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")

    print(args)
    print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
    tb_writer = SummaryWriter()

    if not os.path.exists("./weights"):
        os.makedirs("./weights")

    (train_images_path, train_images_label,
     val_images_path, val_images_label) = read_split_data(args.data_path)

    train_tf = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    val_tf = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    data_transform = {"train": train_tf, "val": val_tf}

    # Training dataset.
    train_dataset = MyDataSet(images_path=train_images_path,
                              images_class=train_images_label,
                              transform=data_transform["train"])
    # Validation dataset.
    val_dataset = MyDataSet(images_path=val_images_path,
                            images_class=val_images_label,
                            transform=data_transform["val"])

    batch_size = args.batch_size
    # Worker count: capped by CPU count, batch size and 8.
    batch_cap = batch_size if batch_size > 1 else 0
    nw = min(os.cpu_count(), batch_cap, 8)
    print('Using {} dataloader workers every process'.format(nw))

    shared_kwargs = dict(batch_size=batch_size, pin_memory=True, num_workers=nw)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               shuffle=True,
                                               collate_fn=train_dataset.collate_fn,
                                               **shared_kwargs)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             shuffle=False,
                                             collate_fn=val_dataset.collate_fn,
                                             **shared_kwargs)

    # Build the model and load pretrained weights when a path is given.
    model = densenet121(num_classes=args.num_classes).to(device)
    if args.weights != "":
        if not os.path.exists(args.weights):
            raise FileNotFoundError("not found weights file: {}".format(
                args.weights))
        load_state_dict(model, args.weights)

    # Optionally freeze everything except the classifier.
    if args.freeze_layers:
        for name, para in model.named_parameters():
            if "classifier" not in name:
                para.requires_grad_(False)

    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(trainable,
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=1E-4,
                          nesterov=True)

    # Cosine schedule, see https://arxiv.org/pdf/1812.01187.pdf
    def lf(x):
        """Return the LR multiplier for epoch ``x``, decaying from 1 to ``args.lrf``."""
        return ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (1 - args.lrf) + args.lrf

    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    for epoch in range(args.epochs):
        # Train.
        mean_loss = train_one_epoch(model=model,
                                    optimizer=optimizer,
                                    data_loader=train_loader,
                                    device=device,
                                    epoch=epoch)
        scheduler.step()

        # Validate.
        acc = evaluate(model=model, data_loader=val_loader, device=device)
        print("[epoch {}] accuracy: {}".format(epoch, round(acc, 3)))

        current_lr = optimizer.param_groups[0]["lr"]
        for tag, value in (("loss", mean_loss),
                           ("accuracy", acc),
                           ("learning_rate", current_lr)):
            tb_writer.add_scalar(tag, value, epoch)

        torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))
def main(args):
    """Train ResNet-34 with DistributedDataParallel (one process per GPU).

    Each process initialises the distributed environment, builds samplers that
    shard the datasets across ranks, makes sure every rank starts from the
    same weights, then trains with Adam. Only rank 0 prints, logs to
    TensorBoard and writes checkpoints to ``./weights``.

    Args:
        args: Parsed command-line namespace. Fields read here: ``rank``,
            ``device``, ``batch_size``, ``num_classes``, ``weights``, ``lr``,
            ``freeze_layers``, ``gpu`` and ``epochs``. ``rank`` and ``gpu``
            are presumably filled in by ``init_distributed_mode`` - verify.

    Raises:
        EnvironmentError: If no CUDA device is available.
    """
    if not torch.cuda.is_available():
        raise EnvironmentError("not find GPU device for training.")

    # Initialise the per-process distributed environment.
    init_distributed_mode(args=args)

    rank = args.rank
    device = torch.device(args.device)
    batch_size = args.batch_size
    num_classes = args.num_classes
    weights_path = args.weights
    lr = args.lr

    if rank == 0:
        # Print run info and create the TensorBoard writer in the first process only.
        print(args)
        print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
        tb_writer = SummaryWriter()
        os.makedirs("./weights", exist_ok=True)

    # NOTE(review): `root` is not defined inside this function; presumably a
    # module-level dataset path - confirm it exists where this runs.
    train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(root)

    data_transform = {
        "train": transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]),
        "val": transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]),
    }

    # Instantiate the training dataset.
    train_data_set = MyDataSet(images_path=train_images_path,
                               images_class=train_images_label,
                               transform=data_transform["train"])

    # Instantiate the validation dataset.
    val_data_set = MyDataSet(images_path=val_images_path,
                             images_class=val_images_label,
                             transform=data_transform["val"])

    # Assign each rank its own subset of sample indices.
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data_set)
    val_sampler = torch.utils.data.distributed.DistributedSampler(val_data_set)

    # Group the training indices into lists of batch_size elements.
    train_batch_sampler = torch.utils.data.BatchSampler(
        train_sampler, batch_size, drop_last=True)

    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to 1 so min() never compares None with an int.
    nw = min([os.cpu_count() or 1, batch_size if batch_size > 1 else 0, 8])  # number of workers
    if rank == 0:
        print('Using {} dataloader workers every process'.format(nw))

    train_loader = torch.utils.data.DataLoader(train_data_set,
                                               batch_sampler=train_batch_sampler,
                                               pin_memory=True,
                                               num_workers=nw,
                                               collate_fn=train_data_set.collate_fn)

    val_loader = torch.utils.data.DataLoader(val_data_set,
                                             batch_size=batch_size,
                                             sampler=val_sampler,
                                             pin_memory=True,
                                             num_workers=nw,
                                             collate_fn=val_data_set.collate_fn)

    # Instantiate the model.
    model = resnet34(num_classes=num_classes).to(device)

    # Load pretrained weights if they exist.
    if os.path.exists(weights_path):
        weights_dict = torch.load(weights_path, map_location=device)
        # Build the model's state dict once instead of once per checkpoint key.
        model_state = model.state_dict()
        # Keep only entries the model has with a matching element count; keys
        # absent from the model are skipped, in line with strict=False below.
        load_weights_dict = {k: v for k, v in weights_dict.items()
                             if k in model_state and model_state[k].numel() == v.numel()}
        model.load_state_dict(load_weights_dict, strict=False)
    else:
        # Without pretrained weights, rank 0 saves its random initialisation
        # and every process loads it so all replicas start identical.
        if rank == 0:
            torch.save(model.state_dict(), "./initial_weights.pt")

        dist.barrier()
        # map_location keeps each rank's copy on its own device; without it
        # the tensors are restored onto the device they were saved from.
        model.load_state_dict(torch.load("./initial_weights.pt", map_location=device))

    # Optionally freeze weights.
    if args.freeze_layers:
        for name, para in model.named_parameters():
            # Freeze everything except parameters whose name contains "fc".
            if "fc" not in name:
                para.requires_grad_(False)
    else:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)

    # Wrap as a DDP model.
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])

    # optimizer
    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(pg, lr=lr)

    tags = ["loss", "accuracy"]
    for epoch in range(args.epochs):
        # Reseed the sampler so each epoch uses a different shuffle.
        train_sampler.set_epoch(epoch)

        mean_loss = train_one_epoch(model=model,
                                    optimizer=optimizer,
                                    data_loader=train_loader,
                                    device=device,
                                    epoch=epoch)

        sum_num = evaluate(model=model,
                           data_loader=val_loader,
                           device=device)
        acc = sum_num / val_sampler.total_size

        if rank == 0:
            tb_writer.add_scalar(tags[0], mean_loss, epoch)
            tb_writer.add_scalar(tags[1], acc, epoch)

            # NOTE(review): this stores the DDP wrapper's state dict, so keys
            # carry a "module." prefix that the loader above will not match.
            # Left unchanged to keep the checkpoint format stable.
            torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))

    if rank == 0:
        # Flush pending TensorBoard events and release the event file.
        tb_writer.close()
def main(args):
    """Train a model built by ``create_model`` on a single device.

    Builds the train/val datasets from ``args.data_path``, optionally loads
    pretrained weights (dropping the classification-head entries) and freezes
    everything except ``head``/``pre_logits``, then trains with SGD under a
    cosine learning-rate schedule. Train/val loss and accuracy plus the
    learning rate are logged to TensorBoard and a checkpoint is written to
    ``./weights`` after every epoch.

    Args:
        args: Parsed command-line namespace. Fields read here: ``device``,
            ``data_path``, ``batch_size``, ``weights``, ``freeze_layers``,
            ``lr``, ``lrf``, ``epochs`` and, when present, ``num_classes``
            (defaults to 5, the previously hard-coded value).

    Raises:
        FileNotFoundError: If ``args.weights`` is non-empty but the file does
            not exist.
    """
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs("./weights", exist_ok=True)

    train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(
        args.data_path)

    data_transform = {
        "train": transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
        ]),
        "val": transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
        ]),
    }

    # Instantiate the training dataset.
    train_dataset = MyDataSet(images_path=train_images_path,
                              images_class=train_images_label,
                              transform=data_transform["train"])

    # Instantiate the validation dataset.
    val_dataset = MyDataSet(images_path=val_images_path,
                            images_class=val_images_label,
                            transform=data_transform["val"])

    batch_size = args.batch_size
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to 1 so min() never compares None with an int.
    nw = min([os.cpu_count() or 1, batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using {} dataloader workers every process'.format(nw))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               pin_memory=True,
                                               num_workers=nw,
                                               collate_fn=train_dataset.collate_fn)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=nw,
                                             collate_fn=val_dataset.collate_fn)

    # Use the configured class count when the namespace provides one; 5 is
    # the value this script previously hard-coded.
    num_classes = getattr(args, "num_classes", 5)
    model = create_model(num_classes=num_classes, has_logits=False).to(device)

    if args.weights != "":
        # Raise explicitly: an assert would be stripped under `python -O`.
        if not os.path.exists(args.weights):
            raise FileNotFoundError("weights file: '{}' not exist.".format(args.weights))
        weights_dict = torch.load(args.weights, map_location=device)
        # Drop weights that are not needed.
        del_keys = ['head.weight', 'head.bias'] if model.has_logits \
            else ['pre_logits.fc.weight', 'pre_logits.fc.bias', 'head.weight', 'head.bias']
        for k in del_keys:
            # pop() tolerates checkpoints that do not contain these entries.
            weights_dict.pop(k, None)
        print(model.load_state_dict(weights_dict, strict=False))

    if args.freeze_layers:
        for name, para in model.named_parameters():
            # Freeze everything except head and pre_logits.
            if "head" not in name and "pre_logits" not in name:
                para.requires_grad_(False)
            else:
                print("training {}".format(name))

    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg, lr=args.lr, momentum=0.9, weight_decay=5E-5)

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    def cosine_factor(x):
        """Return the cosine-decay LR multiplier, going from 1 down to args.lrf."""
        return ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (1 - args.lrf) + args.lrf

    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=cosine_factor)

    tags = ["train_loss", "train_acc", "val_loss", "val_acc", "learning_rate"]
    tb_writer = SummaryWriter()
    try:
        for epoch in range(args.epochs):
            # train
            train_loss, train_acc = train_one_epoch(model=model,
                                                    optimizer=optimizer,
                                                    data_loader=train_loader,
                                                    device=device,
                                                    epoch=epoch)

            scheduler.step()

            # validate
            val_loss, val_acc = evaluate(model=model,
                                         data_loader=val_loader,
                                         device=device,
                                         epoch=epoch)

            tb_writer.add_scalar(tags[0], train_loss, epoch)
            tb_writer.add_scalar(tags[1], train_acc, epoch)
            tb_writer.add_scalar(tags[2], val_loss, epoch)
            tb_writer.add_scalar(tags[3], val_acc, epoch)
            tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)

            torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))
    finally:
        # Flush pending events and release the event file even if training fails.
        tb_writer.close()
def main():
    """Fine-tune a ResNet-50 with DeepDecipher labels and checkpoint each epoch.

    Parses the command line into the module-level ``args``, builds the model
    from the pretrained weights named in ``FLAGS`` and then overwrites them
    with the fine-tuned checkpoint, runs one validation pass, and (unless
    ``args.evaluate`` is set) trains from ``args.start_epoch`` to
    ``args.epochs``. After every epoch the model is saved to
    ``checkpoint/fine_tune/model.pth`` and the running best precision is
    stored in the module-level ``best_prec1``.
    """
    global args, best_prec1
    args = parser.parse_args()

    # Build the model from the pretrained weights, then load the fine-tuned ones.
    pretrained_state = torch.load(FLAGS['pretrain_model'])  # 76.130
    model = resnet50(pretrained_state, num_classes=FLAGS['class_num'])
    summary(model, torch.zeros((1, 3, 224, 224)))
    finetuned_state = torch.load('../2_mixup_kd/checkpoint/fine_tune/model.pth')  # 77.58%
    model.load_state_dict(finetuned_state)
    model = model.cuda()

    cudnn.benchmark = True

    # Loss function and optimizer; only trainable parameters are optimised.
    criterion = nn.CrossEntropyLoss().cuda()
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(trainable_params,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Preprocessing pipelines for the two splits.
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    train_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    data_dir = FLAGS['data_base']
    print("| Preparing model...")

    train_set = MyDataSet(train_transform)
    val_set = datasets.ImageFolder(os.path.join(data_dir, 'val'), val_transform)

    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=FLAGS['batch_size'],
                                               shuffle=True,
                                               num_workers=8,
                                               pin_memory=True)
    # Validation uses a batch four times larger than training.
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=4 * FLAGS['batch_size'],
                                             shuffle=False,
                                             num_workers=8,
                                             pin_memory=True)
    print('data_loader_success!')

    # Always validate once; stop here in evaluate-only mode.
    validate(val_loader, model, criterion)
    if args.evaluate:
        return

    decipher = DeepDecipher(len(train_loader), FLAGS['class_num'], 1,
                            int(len(train_set) / 6))
    initial_labels = np.load('label_list.npy')
    decipher.init_label(initial_labels)

    for epoch in range(args.start_epoch, args.epochs):
        # Train for one epoch, then evaluate on the validation set.
        train(train_loader, model, optimizer, epoch, decipher)
        prec1 = validate(val_loader, model, criterion)

        # Track the best prec@1; the checkpoint itself is rewritten every epoch.
        best_prec1 = max(prec1, best_prec1)
        ckpt_dir = 'checkpoint/fine_tune'
        if not os.path.exists(ckpt_dir):
            os.makedirs(ckpt_dir)
        torch.save(model.state_dict(), ckpt_dir + '/model.pth')
        print('best acc is %.3f' % best_prec1)