def main():
    global args
    args = parser.parse_args()
    cc = CrayonClient(port=8089)

    for name in args.name.split(','):
        # Start from a clean slate for this model name.
        shutil.rmtree(f'weights/{name}/', ignore_errors=True)
        shutil.rmtree(f'output/{name}/', ignore_errors=True)
        os.makedirs(f'weights/{name}')

        for fold in range(NUM_SPLITS):
            print(f'=> Targeting {name} fold {fold + 1}/{NUM_SPLITS}')
            os.makedirs(f'output/{name}/fold{fold}/train')
            os.makedirs(f'output/{name}/fold{fold}/valid')

            arch = name.split('_')[0]
            model = models[arch](1)
            model = nn.DataParallel(model)
            model.cuda()

            train_loader, valid_loader, _ = get_loaders(args.batch_size, NUM_SPLITS, fold)
            train_eval(model, name, train_loader, valid_loader, fold,
                       make_experiment(cc, name, fold),
                       init_lr=args.lr, epochs=args.epochs,
                       num_epochs_per_decay=args.num_epochs_per_decay)
            del model
def __init__(self, args, logger, num=3):
    self.args = args
    self.logger = logger
    self.loaders = get_loaders(args, debug=True)
    self.model = None
    self.epoch = None
    self.num = num
    self.fname = args.fname

    # Query labels for an N-way episode: [0, ..., way - 1] repeated `query` times.
    label = torch.arange(args.way).repeat(args.query)
    self.label = label.type(torch.cuda.LongTensor)
    # Number of support samples (shot per class times number of classes).
    self.p = args.shot * args.way
def main(rank, world_size, arg):
    logger = Logger(arg.save_dir)
    setup(rank, world_size)
    print(rank)

    # Linear LR scaling rule; batch size and workers are split across ranks.
    scaled_lr = arg.lr * arg.batch_size / 256
    arg.batch_size = int(arg.batch_size / world_size)
    num_workers = int(arg.num_workers / world_size)

    net, res = get_model(arg, classes=arg.num_classes)
    logger.will_write(str(arg) + "\n")

    net.to(rank)
    net = nn.parallel.DistributedDataParallel(net, device_ids=[rank])

    if not arg.dali:
        train_loader, val_loader = get_loaders(arg.root, arg.batch_size, res, num_workers,
                                               arg.val_batch_size,
                                               color_jitter=arg.color_jitter,
                                               pca=arg.pca, crop_pct=arg.crop_pct)
    else:
        train_loader, val_loader = get_loaders_dali(arg.root, arg.batch_size, res,
                                                    rank, world_size, num_workers)
    # net = nn.DataParallel(net).to(torch_device)

    loss = nn.CrossEntropyLoss()

    if not arg.no_filter_bias:
        # Exclude bias, BN, and other 1-d parameters from weight decay.
        parameters = add_weight_decay(net, weight_decay=arg.decay)
        weight_decay = 0
        print('filter out bias, bn and other 1d params from weight decay')
    else:
        parameters = net.parameters()
        weight_decay = arg.decay

    optim = {
        # "adam": lambda: torch.optim.Adam(net.parameters(), lr=arg.lr, betas=arg.beta, weight_decay=arg.decay),
        "sgd": lambda: torch.optim.SGD(parameters, lr=scaled_lr, momentum=arg.momentum,
                                       nesterov=True, weight_decay=weight_decay),
        "rmsproptf": lambda: RMSpropTF(parameters, lr=scaled_lr, momentum=arg.momentum,
                                       eps=arg.eps, weight_decay=weight_decay),
        "rmsprop": lambda: torch.optim.RMSprop(parameters, lr=scaled_lr, momentum=arg.momentum,
                                               eps=arg.eps, weight_decay=weight_decay),
    }[arg.optim]()

    scheduler = get_scheduler(optim, arg.scheduler, int(1.0 * len(train_loader)),
                              arg.epoch * len(train_loader),
                              warmup_t=int(arg.warmup * len(train_loader)),
                              warmup_lr_init=0.1 * scaled_lr)
    arg.epoch = arg.epoch + arg.cool_down if arg.cool_down > 0 else arg.epoch

    model = Runner(arg, net, optim, rank, loss, logger, scheduler, world_size)
    if arg.profiler:
        model.profiler(train_loader, val_loader, train_loader.sampler)
    elif arg.test is False:
        if not arg.dali:
            model.train(train_loader, val_loader, train_loader.sampler)
        else:
            model.train(train_loader, val_loader)
    cleanup()
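# Hypothetical launcher sketch for the DDP entry point above (assumes `arg` comes from an
# arg_parse() helper and that setup()/cleanup() wrap torch.distributed init/destroy):
#
#     if __name__ == "__main__":
#         arg = arg_parse()
#         world_size = torch.cuda.device_count()
#         # spawn() passes the process index as the first positional argument (rank).
#         torch.multiprocessing.spawn(main, args=(world_size, arg), nprocs=world_size)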
def __init__(self, path=None):
    self.net = model.CompareAggregate()
    if config.use_cuda:
        self.net = self.net.cuda()
    if path is not None:
        self.net.load_weight(path)

    self.train_loader, self.validation_loader, self.test_loader = loader.get_loaders()
    self.summary = Summary(len(self.train_loader))
    self.optim = torch.optim.Adam(self.net.parameters(), lr=config.lr)
    self.epoch = self.step = 0
    self.last_map = 0
    print('Initialize done')
if __name__ == "__main__":
    arg = arg_parse()
    arg.save_dir = "%s/outs/%s" % (os.getcwd(), arg.save_dir)
    if not os.path.exists(arg.save_dir):
        os.mkdir(arg.save_dir)

    logger = Logger(arg.save_dir)
    logger.will_write(str(arg) + "\n")

    os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpus
    torch_device = torch.device("cuda")

    train_loader, val_loader = get_loaders(arg.root, arg.batch_size, 224, arg.num_workers)

    if arg.model == "mixs":
        net = mixnet_s()
    else:
        raise ValueError("unsupported model: %s" % arg.model)
    net = nn.DataParallel(net).to(torch_device)

    loss = nn.CrossEntropyLoss()
    scaled_lr = arg.lr * arg.batch_size / 256
    optim = {
        "adam": lambda: torch.optim.Adam(net.parameters(), betas=arg.beta,
                                         weight_decay=arg.decay),
        "rmsprop": lambda: torch.optim.RMSprop(net.parameters(), lr=scaled_lr,
                                               momentum=arg.momentum, eps=arg.eps,
                                               weight_decay=arg.decay),
    }[arg.optim]()
def debug_sample(args):
    db_loaders = get_loaders(args, debug=True)
    sample = {
        'train': _debug_sample(db_loaders['train'], args.way * args.shot),
        'val': _debug_sample(db_loaders['test'], args.way * args.shot),
    }
    return sample
if __name__ == "__main__":
    arg = arg_parse()
    arg.save_dir = "%s/outs/%s" % (os.getcwd(), arg.save_dir)
    if not os.path.exists(arg.save_dir):
        os.mkdir(arg.save_dir)

    logger = Logger(arg.save_dir)
    logger.will_write(str(arg) + "\n")

    os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpus
    device = torch.device("cuda")

    train_loader, val_loader = get_loaders(arg.root, arg.batch_size, arg.num_workers,
                                           dtype=arg.dtype)

    if arg.model == "mixs":
        net = mixnet_s(num_classes=len(train_loader.dataset.classes))
    elif arg.model == "rw":
        import sys
        sys.path.append("rwightman")
        from timm.models.gen_efficientnet import mixnet_s
        net = mixnet_s(num_classes=len(train_loader.dataset.classes))
    else:
        from torchvision.models import resnet50
        net = resnet50(num_classes=len(train_loader.dataset.classes))
    net = nn.DataParallel(net)

    loss = nn.CrossEntropyLoss()
def main():
    model_names = sorted(
        name for name in models.__dict__
        if name.islower() and not name.startswith("__")
        and callable(models.__dict__[name])
        and (name.startswith("vgg") or name.startswith("alexnet")))

    parser = argparse.ArgumentParser()
    parser.add_argument("data_dir",
                        help="name of the directory from which to load the data")
    parser.add_argument("--save_dir", metavar='save',
                        help="directory to save checkpoints (default: none)")
    parser.add_argument("--arch", choices=model_names, default="vgg16",
                        help="choose architecture (default: vgg16)")
    parser.add_argument("--gpu", action="store_true",
                        help="use GPU for training")
    parser.add_argument("--learning_rate", type=float, default=0.003, metavar='lr',
                        dest='learning_rate', help="learning rate (default: 0.003)")
    parser.add_argument("--epochs", type=int, default=1, dest='epochs',
                        help="number of epochs for training (default: 1)")
    parser.add_argument("--print_every", type=int, default=5, metavar='P',
                        dest='validate_every',
                        help="number of steps after which output should be printed (default: 5)")
    parser.add_argument("--skip_after", type=int, dest='skip_after', metavar='skip',
                        help="number of steps after which the training module should exit (default: None)")
    parser.add_argument("--hidden_unit_1", "-fc1", metavar='fc1', type=int,
                        dest='n_fc1', default=4096,
                        help="number of hidden units for layer 1 (default: 4096)")
    parser.add_argument("--hidden_unit_2", "-fc2", metavar='fc2', type=int,
                        default=2048, dest='n_fc2',
                        help="number of hidden units for layer 2 (default: 2048)")
    parser.add_argument("--hidden_unit_3", "-fc3", metavar='fc3', type=int,
                        default=1024, dest='n_fc3',
                        help="number of hidden units for layer 3 (default: 1024)")
    args = parser.parse_args()

    if args.save_dir and not os.path.exists(args.save_dir):
        print("Save directory doesn't exist. Please try again.")
        sys.exit(-1)

    if os.path.exists(args.data_dir):
        traindataloaders, validationdataloaders, class_to_idx = loader.get_loaders(args.data_dir)
        if traindataloaders and validationdataloaders:
            device = torch.device('cuda' if torch.cuda.is_available() and args.gpu else 'cpu')
            model = image_classifier.init_classifier(args.arch, device,
                                                     args.n_fc1, args.n_fc2, args.n_fc3)
            model.classifier.class_to_idx = class_to_idx
            optimizer = optim.Adam(model.classifier.parameters(), lr=args.learning_rate)
            criterion = nn.NLLLoss()
            dataloaders = [traindataloaders, validationdataloaders]

            with active_session():
                training_loss = image_classifier.train(model, dataloaders, criterion, device,
                                                       optimizer, epochs=args.epochs,
                                                       validate_every=args.validate_every,
                                                       skip_after=args.skip_after)
            if args.save_dir:
                image_classifier.save_checkpoint(model, optimizer, args.epochs,
                                                 training_loss, args.save_dir, args.arch)
        else:
            print("Data couldn't be read or no valid train/valid directory. "
                  "Please check that /train and /valid exist.")
    else:
        print("Data directory entered doesn't exist. Please try again.")
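# Example invocation (hypothetical script name and paths; the flags correspond to the
# parser defined above):
#
#     python train.py flowers --save_dir checkpoints --arch vgg16 --gpu \
#         --learning_rate 0.003 --epochs 3 --print_every 10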
def train_model(cfg: Config, weight_path=None, device='cuda:0'):
    now = datetime.datetime.now()
    log_dir = os.path.join(MODEL_DIR, f'{cfg.NAME.lower()}_{now:%Y%m%dT%H%M}')
    os.makedirs(log_dir, exist_ok=True)
    writer = SummaryWriter(log_dir=log_dir, flush_secs=5)

    # snapshot
    with open(os.path.join(log_dir, 'snapshot.txt'), 'w') as f:
        snapshot = cfg.get_snapshot()
        json.dump(snapshot, f, indent=4)

    model = get_model(cfg)
    model.to(device)
    if weight_path is not None:
        model.load_state_dict(torch.load(weight_path))

    if cfg.LOSS == 'ce':
        weight = None
        criterion = nn.CrossEntropyLoss(weight=weight)
    elif cfg.LOSS == 'bce':
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(0.5, dtype=torch.float))
    elif cfg.LOSS == 'focal_loss':
        criterion = BCEFocalLoss()

    # No weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    if cfg.OPTIMIZER == 'sgd':
        optimizer = torch.optim.SGD(optimizer_grouped_parameters, lr=cfg.BASE_LR,
                                    momentum=0.9, weight_decay=5e-4)
    elif cfg.OPTIMIZER == 'adam':
        optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=cfg.BASE_LR,
                                     weight_decay=5e-4)

    if cfg.SCHEDULER == 'step':
        scheduler = StepLR(optimizer, step_size=5, gamma=0.5)
    elif cfg.SCHEDULER == 'multstep':
        scheduler = MultiStepLR(optimizer, milestones=(20, 40), gamma=0.1)

    global_step = 0
    train_loader, val_loader = get_loaders(cfg)

    for epoch in range(1, cfg.EPOCHS + 1):
        batch_loss = []
        train_loss = []
        model.train()
        # scheduler(optimizer, epoch)
        scheduler.step(epoch)

        pr, gt = [], []
        for i, (inputs, labels) in enumerate(train_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            pr.extend(torch.round(torch.sigmoid(outputs)).detach().cpu().numpy().squeeze())
            gt.extend(labels.cpu().numpy().squeeze())
            # print(pr)
            # print(gt)

            batch_loss.append(loss.item())

            # Gradient accumulation: scale the loss and step every ACCUMULATION_STEPS batches.
            loss = loss / cfg.ACCUMULATION_STEPS
            loss.backward()
            if (i + 1) % cfg.ACCUMULATION_STEPS == 0:
                # scheduler.step()
                # adjust_learning_rate(optimizer, cfg.BASE_LR, global_step, epoch,
                #                      warmup_iters=len(train_loader) // (cfg.ACCUMULATION_STEPS * cfg.IMAGE_PER_GPU) * 5)
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                mean_batch_loss = np.mean(batch_loss)
                train_loss.append(mean_batch_loss)
                lr = optimizer.state_dict()['param_groups'][0]['lr']
                writer.add_scalar('lr', lr, global_step)
                print(f'epoch {epoch:5d} batch {(i + 1) // cfg.ACCUMULATION_STEPS:5d}, '
                      f'loss:{mean_batch_loss:.4f}, lr:{lr:.4e}')
                writer.add_scalar('batch_loss', mean_batch_loss, global_step)
                batch_loss = []
            # break

        train_acc = accuracy_score(gt, pr)
        train_recall = recall_score(gt, pr)
        train_precision = precision_score(gt, pr)
        print(confusion_matrix(gt, pr))
        print(f'epoch {epoch} mean_loss:{np.mean(train_loss):.4f} acc:{train_acc:.4f} '
              f'recall:{train_recall:.4f} precision:{train_precision:.4f} '
              f'pos_num:{sum(gt)} neg_num:{len(gt) - sum(gt)}')

        model.eval()
        val_loss = []
        pr, gt = [], []
        with torch.no_grad():
            for j, (inputs, labels) in enumerate(val_loader):
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss.append(loss.item())

                pr.extend(torch.round(torch.sigmoid(outputs)).detach().cpu().numpy().squeeze())
                gt.extend(labels.cpu().numpy().squeeze())

        val_acc = accuracy_score(gt, pr)
        val_recall = recall_score(gt, pr)
        val_precision = precision_score(gt, pr)
        print(confusion_matrix(gt, pr))
        print(f'epoch {epoch} val_loss:{np.mean(val_loss):.4f} acc:{val_acc:.4f} '
              f'recall:{val_recall:.4f} precision:{val_precision:.4f} '
              f'pos_num:{sum(gt)} neg_num:{len(gt) - sum(gt)}')

        checkpoint_path = os.path.join(
            log_dir, "{}_{:04d}_{:.4f}.pth".format(cfg.NAME.lower(), epoch, val_acc))

        writer.add_scalars('loss', {
            'loss': np.mean(train_loss),
            'val_loss': np.mean(val_loss)
        }, epoch)
        writer.add_scalars('acc', {
            'train_acc': train_acc,
            'train_precision': train_precision,
            'train_recall': train_recall
        }, epoch)
        writer.add_scalars('val_acc', {
            'val_acc': val_acc,
            'val_precision': val_precision,
            'val_recall': val_recall
        }, epoch)

        torch.save(model.state_dict(), checkpoint_path)

    writer.close()
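# Hypothetical usage sketch (assumes Config exposes the fields referenced above, e.g. NAME,
# LOSS, OPTIMIZER, SCHEDULER, BASE_LR, EPOCHS, ACCUMULATION_STEPS; the constructor call and
# values below are illustrative only, not the project's actual API):
#
#     cfg = Config(NAME='resnet34', LOSS='bce', OPTIMIZER='adam', SCHEDULER='step',
#                  BASE_LR=1e-3, EPOCHS=30, ACCUMULATION_STEPS=4)
#     train_model(cfg, weight_path=None, device='cuda:0')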