def main(args):
  # Lets cuDNN benchmark conv implementations and choose the fastest.
  # Only good if sizes stay the same within the main loop!
  torch.backends.cudnn.benchmark = True

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  #classes = 5
  #valid_set, valid_loader = mkval(args)

  model = models.KNOWN_MODELS[args.model](head_size=args.classes, zero_head=False)
  model = torch.nn.DataParallel(model)

  checkpoint = torch.load(args.weight_path, map_location=device)
  model.load_state_dict(checkpoint["model"])

  # Optionally resume from a checkpoint.
  # Load it to CPU first as we'll move the model to GPU later.
  # This way, we save a little bit of GPU memory when loading.
  # Note: no weight-decay!

  model = model.to(device)
  model.eval()

  chrono = lb.Chrono()
  #run_eval(model, valid_loader, device, chrono, step='end')
  end = time.time()

  val_tx = tv.transforms.Compose([
      tv.transforms.Resize((448, 448)),
      tv.transforms.ToTensor(),
      tv.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
  ])
  run_predict(model, args.datadir, val_tx, device)
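# The prediction entry point above calls run_predict(model, args.datadir, val_tx, device),
# which is not defined in this file. The helper below is only an assumed sketch of what it
# could look like (walk the image files in `datadir`, apply the eval transform, print the
# arg-max class per image); adapt it to the real helper's contract if it differs.
def run_predict(model, datadir, transform, device):
  """Hypothetical inference helper: predict a class index for every image in `datadir`."""
  import os
  from PIL import Image

  model.eval()
  with torch.no_grad():
    for fname in sorted(os.listdir(datadir)):
      if not fname.lower().endswith((".jpg", ".jpeg", ".png")):
        continue
      img = Image.open(os.path.join(datadir, fname)).convert("RGB")
      x = transform(img).unsqueeze(0).to(device)  # shape [1, 3, 448, 448]
      logits = model(x)
      pred = logits.argmax(dim=1).item()
      print(f"{fname}: class {pred}")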
def main(args):
  best_acc = -1
  logger = bit_common.setup_logger(args)
  cp, cn = smooth_BCE(eps=0.1)

  # Lets cuDNN benchmark conv implementations and choose the fastest.
  # Only good if sizes stay the same within the main loop!
  torch.backends.cudnn.benchmark = True

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  logger.info(f"Going to train on {device}")
  classes = 5

  train_set, valid_set, train_loader, valid_loader = mktrainval(args, logger)

  logger.info(f"Loading pretrained {args.model}")
  #model = models.KNOWN_MODELS[args.model](head_size=classes, zero_head=True)
  #model.load_from(np.load(f"{args.model}.npz"))
  model = EfficientNet.from_pretrained(args.model, num_classes=classes)

  logger.info("Moving model onto all GPUs")
  model = torch.nn.DataParallel(model)

  # Optionally resume from a checkpoint.
  # Load it to CPU first as we'll move the model to GPU later.
  # This way, we save a little bit of GPU memory when loading.
  start_epoch = 0

  # Note: no weight-decay!
  optim = torch.optim.SGD(model.parameters(), lr=0.003, momentum=0.9)

  # Resume fine-tuning if we find a saved model.
  savename = pjoin(args.logdir, args.name, "bit.pth.tar")
  try:
    logger.info(f"Model will be saved in '{savename}'")
    checkpoint = torch.load(savename, map_location="cpu")
    logger.info(f"Found saved model to resume from at '{savename}'")
    start_epoch = checkpoint["epoch"]
    model.load_state_dict(checkpoint["model"])
    optim.load_state_dict(checkpoint["optim"])
    logger.info(f"Resumed at epoch {start_epoch}")
  except FileNotFoundError:
    logger.info("Fine-tuning from BiT")

  model = model.to(device)
  optim.zero_grad()

  model.train()
  mixup = bit_hyperrule.get_mixup(len(train_set))
  #mixup = -1
  cri = torch.nn.CrossEntropyLoss().to(device)
  #cri = FocalLoss(cri)

  logger.info("Starting training!")
  chrono = lb.Chrono()
  accum_steps = 0
  mixup_l = np.random.beta(mixup, mixup) if mixup > 0 else 1
  end = time.time()

  epochs = 10
  scheduler = torch.optim.lr_scheduler.OneCycleLR(
      optim, max_lr=0.01, steps_per_epoch=1, epochs=epochs)

  with lb.Uninterrupt() as u:
    for epoch in range(start_epoch, epochs):
      pbar = enumerate(train_loader)
      pbar = tqdm.tqdm(pbar, total=len(train_loader))
      all_top1, all_top5 = [], []

      # Read the current learning rate once per epoch, for display only.
      for param_group in optim.param_groups:
        lr = param_group["lr"]

      #for x, y in recycle(train_loader):
      #for batch_id, (x, y) in enumerate(train_loader):
      for batch_id, (x, y) in pbar:
        # measure data loading time, which is spent in the `for` statement.
        chrono._done("load", time.time() - end)

        if u.interrupted:
          break

        # Schedule sending to GPU(s)
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        # Update learning-rate, including stop training if over.
        #lr = bit_hyperrule.get_lr(step, len(train_set), args.base_lr)
        #if lr is None:
        #  break

        if mixup > 0.0:
          x, y_a, y_b = mixup_data(x, y, mixup_l)

        # compute output
        with chrono.measure("fprop"):
          logits = model(x)
          top1, top5 = topk(logits, y, ks=(1, 5))
          all_top1.extend(top1.cpu())
          all_top5.extend(top5.cpu())
          if mixup > 0.0:
            c = mixup_criterion(cri, logits, y_a, y_b, mixup_l)
          else:
            c = cri(logits, y)
          train_loss = c.item()
          train_acc = np.mean(all_top1) * 100.0

        # Accumulate grads
        with chrono.measure("grads"):
          (c / args.batch_split).backward()
          accum_steps += 1

        accstep = f"({accum_steps}/{args.batch_split})" if args.batch_split > 1 else ""
        s = f"epoch={epoch} batch {batch_id}{accstep}: loss={train_loss:.5f} train_acc={train_acc:.2f} lr={lr:.1e}"
        #s = f"epoch={epoch} batch {batch_id}{accstep}: loss={c.item():.5f} lr={lr:.1e}"
        pbar.set_description(s)
        #logger.info(f"[batch {batch_id}{accstep}]: loss={c_num:.5f} (lr={lr:.1e})")  # pylint: disable=logging-format-interpolation
        logger.flush()

        # Update params once `batch_split` micro-batches have been accumulated.
        if accum_steps == args.batch_split:
          with chrono.measure("update"):
            optim.step()
            optim.zero_grad()
          accum_steps = 0

          # Sample new mixup ratio for next batch
          mixup_l = np.random.beta(mixup, mixup) if mixup > 0 else 1

      # Advance the one-cycle schedule once per epoch, after the optimizer updates.
      scheduler.step()

      # Run evaluation and save the model.
      val_loss, val_acc = run_eval(model, valid_loader, device, chrono, logger, epoch)
      best = val_acc > best_acc
      if best:
        best_acc = val_acc
        torch.save({
            "epoch": epoch,
            "val_loss": val_loss,
            "val_acc": val_acc,
            "train_acc": train_acc,
            "model": model.state_dict(),
            "optim": optim.state_dict(),
        }, savename)

      end = time.time()

  logger.info(f"Timings:\n{chrono}")
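# Both training loops rely on mixup_data / mixup_criterion, which are not defined in this
# file. A minimal sketch, consistent with how they are called above (standard input mixup:
# blend pairs of examples with ratio `l` and blend the two losses with the same ratio):
def mixup_data(x, y, l):
  """Returns mixed inputs and the pair of targets they were mixed from."""
  indices = torch.randperm(x.shape[0]).to(x.device)
  mixed_x = l * x + (1 - l) * x[indices]
  y_a, y_b = y, y[indices]
  return mixed_x, y_a, y_b


def mixup_criterion(criterion, pred, y_a, y_b, l):
  """Mixes the losses against the two target sets with the same ratio used for the inputs."""
  return l * criterion(pred, y_a) + (1 - l) * criterion(pred, y_b)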
def main(args):
  logger = common.setup_logger(args)

  # Lets cuDNN benchmark conv implementations and choose the fastest.
  # Only good if sizes stay the same within the main loop!
  torch.backends.cudnn.benchmark = True

  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  logger.info(f"Going to train on {device}")

  train_set, valid_set, train_loader, valid_loader = mktrainval(args, logger)

  logger.info(f"Loading model from {args.model}.npz")
  model = models.KNOWN_MODELS[args.model](head_size=len(valid_set.classes), zero_head=True)
  model.load_from(np.load(os.path.join(args.pretrained_dir, f"{args.model}.npz")))

  logger.info("Moving model onto all GPUs")
  model = torch.nn.DataParallel(model)

  # Optionally resume from a checkpoint.
  # Load it to CPU first as we'll move the model to GPU later.
  # This way, we save a little bit of GPU memory when loading.
  step = 0

  # Note: no weight-decay!
  optim = torch.optim.SGD(model.parameters(), lr=args.base_lr, momentum=0.9)

  writer = SummaryWriter(os.path.join(args.logdir, args.name))

  # Resume fine-tuning if we find a saved model.
  savename = pjoin(args.logdir, args.name, "model.tar")
  try:
    logger.info(f"Model will be saved in '{savename}'")
    checkpoint = torch.load(savename, map_location="cpu")
    logger.info(f"Found saved model to resume from at '{savename}'")
    step = checkpoint["step"]
    model.load_state_dict(checkpoint["model"])
    optim.load_state_dict(checkpoint["optim"])
    logger.info(f"Resumed at step {step}")
  except FileNotFoundError:
    logger.info("Fine-tuning from BiT")

  model = model.to(device)
  optim.zero_grad()

  model.train()
  mixup = hyperrule.get_mixup(len(train_set))
  cri = torch.nn.CrossEntropyLoss().to(device)

  logger.info("Starting training!")
  chrono = lb.Chrono()
  accum_steps = 0
  mixup_l = np.random.beta(mixup, mixup) if mixup > 0 else 1
  end = time.time()

  with lb.Uninterrupt() as u:
    for x, y in recycle(train_loader):
      # measure data loading time, which is spent in the `for` statement.
      chrono._done("load", time.time() - end)

      if u.interrupted:
        break

      # Schedule sending to GPU(s)
      x = x.to(device, non_blocking=True)
      y = y.to(device, non_blocking=True)

      # Update learning-rate, including stop training if over.
      lr = hyperrule.get_lr(step, len(train_set), args.base_lr)
      if lr is None:
        break
      for param_group in optim.param_groups:
        param_group["lr"] = lr

      if mixup > 0.0:
        x, y_a, y_b = mixup_data(x, y, mixup_l)

      # compute output
      with chrono.measure("fprop"):
        logits = model(x)
        if mixup > 0.0:
          c = mixup_criterion(cri, logits, y_a, y_b, mixup_l)
        else:
          c = cri(logits, y)
        c_num = float(c.data.cpu().numpy())  # Also ensures a sync point.

      # Accumulate grads
      with chrono.measure("grads"):
        (c / args.batch_split).backward()
        accum_steps += 1

      accstep = f" ({accum_steps}/{args.batch_split})" if args.batch_split > 1 else ""
      logger.info(f"[step {step}{accstep}]: loss={c_num:.5f} (lr={lr:.1e})")  # pylint: disable=logging-format-interpolation
      logger.flush()
      writer.add_scalar('Train/loss', c_num, step)
      writer.add_scalar('Train/lr', lr, step)

      # Update params
      if accum_steps == args.batch_split:
        with chrono.measure("update"):
          optim.step()
          optim.zero_grad()
        step += 1
        accum_steps = 0

        # Sample new mixup ratio for next batch
        mixup_l = np.random.beta(mixup, mixup) if mixup > 0 else 1

        # Run evaluation and save the model.
        if args.eval_every and step % args.eval_every == 0:
          run_eval(model, valid_loader, device, chrono, logger, writer, step)

        if args.save and step % args.save_every == 0:
          step_savename = pjoin(args.logdir, args.name, "model_" + str(step) + ".tar")
          torch.save({
              "step": step,
              "model": model.state_dict(),
              "optim": optim.state_dict(),
          }, step_savename)

      end = time.time()

    # Final eval at end of training.
    run_eval(model, valid_loader, device, chrono, logger, writer, step)

  logger.info(f"Timings:\n{chrono}")
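# The step-based loop above iterates `recycle(train_loader)`, a helper not shown here.
# A minimal sketch of what it is assumed to do: cycle through the loader indefinitely,
# re-creating the iterator after each pass (unlike itertools.cycle it does not cache
# items, so the DataLoader still reshuffles every epoch).
def recycle(iterable):
  """Yield items from `iterable` forever, restarting it after each full pass."""
  while True:
    for item in iterable:
      yield item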