def main() -> None:
    args = get_arguments()

    # configuration
    config = get_config(args.config)

    # save log files in the directory which contains the config file
    result_path = os.path.dirname(args.config)
    experiment_name = os.path.basename(result_path)

    # cpu or cuda
    device = get_device(allow_only_gpu=True)

    # Dataloader
    train_transform = Compose([
        RandomResizedCrop(size=(config.height, config.width)),
        RandomHorizontalFlip(),
        ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
        ToTensor(),
        Normalize(mean=get_mean(), std=get_std()),
    ])

    val_transform = Compose([
        ToTensor(),
        Normalize(mean=get_mean(), std=get_std()),
    ])

    train_loader = get_dataloader(
        config.train_csv,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
        pin_memory=True,
        drop_last=True,
        transform=train_transform,
    )

    val_loader = get_dataloader(
        config.val_csv,
        batch_size=1,
        shuffle=False,
        num_workers=config.num_workers,
        pin_memory=True,
        transform=val_transform,
    )

    # the number of classes
    n_classes = len(get_cls2id_map())

    # define a model
    model = get_model(config.model, n_classes, pretrained=config.pretrained)

    # send the model to cuda/cpu
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    # keep training and validation log
    begin_epoch = 0
    best_loss = float("inf")
    log = pd.DataFrame(columns=[
        "epoch",
        "lr",
        "train_time[sec]",
        "train_loss",
        "train_acc@1",
        "train_f1s",
        "val_time[sec]",
        "val_loss",
        "val_acc@1",
        "val_f1s",
    ])

    # resume training if a checkpoint exists
    if args.resume:
        resume_path = os.path.join(result_path, "checkpoint.pth")
        begin_epoch, model, optimizer, best_loss = resume(
            resume_path, model, optimizer)

        log_path = os.path.join(result_path, "log.csv")
        assert os.path.exists(
            log_path), "there is no checkpoint at the result folder"
        log = pd.read_csv(log_path)

    # criterion for loss
    criterion = get_criterion(config.use_class_weight, config.train_csv, device)

    # Weights and Biases
    if not args.no_wandb:
        wandb.init(
            name=experiment_name,
            config=config,
            project="image_classification_template",
            job_type="training",
            dir="./wandb_result/",
        )
        # Magic
        wandb.watch(model, log="all")

    # train and validate model
    print("---------- Start training ----------")

    for epoch in range(begin_epoch, config.max_epoch):
        # training
        start = time.time()
        train_loss, train_acc1, train_f1s = train(
            train_loader, model, criterion, optimizer, epoch, device)
        train_time = int(time.time() - start)

        # validation
        start = time.time()
        val_loss, val_acc1, val_f1s, c_matrix = evaluate(
            val_loader, model, criterion, device)
        val_time = int(time.time() - start)

        # save the model if the validation loss is lower than ever
        if best_loss > val_loss:
            best_loss = val_loss
            torch.save(
                model.state_dict(),
                os.path.join(result_path, "best_model.prm"),
            )

        # save checkpoint every epoch
        save_checkpoint(result_path, epoch, model, optimizer, best_loss)

        # write logs to dataframe and csv file
        tmp = pd.Series(
            [
                epoch,
                optimizer.param_groups[0]["lr"],
                train_time,
                train_loss,
                train_acc1,
                train_f1s,
                val_time,
                val_loss,
                val_acc1,
                val_f1s,
            ],
            index=log.columns,
        )

        log = pd.concat([log, tmp.to_frame().T], ignore_index=True)
        log.to_csv(os.path.join(result_path, "log.csv"), index=False)

        # save logs to wandb
        if not args.no_wandb:
            wandb.log(
                {
                    "lr": optimizer.param_groups[0]["lr"],
                    "train_time[sec]": train_time,
                    "train_loss": train_loss,
                    "train_acc@1": train_acc1,
                    "train_f1s": train_f1s,
                    "val_time[sec]": val_time,
                    "val_loss": val_loss,
                    "val_acc@1": val_acc1,
                    "val_f1s": val_f1s,
                },
                step=epoch,
            )

        print(
            "epoch: {}\tepoch time[sec]: {}\tlr: {}\ttrain loss: {:.4f}\t"
            "val loss: {:.4f}\tval_acc1: {:.5f}\tval_f1s: {:.5f}".format(
                epoch,
                train_time + val_time,
                optimizer.param_groups[0]["lr"],
                train_loss,
                val_loss,
                val_acc1,
                val_f1s,
            ))

    # save the final model
    torch.save(model.state_dict(), os.path.join(result_path, "final_model.prm"))

    # delete checkpoint
    os.remove(os.path.join(result_path, "checkpoint.pth"))

    print("Done")
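# The checkpoint helpers used above (save_checkpoint / resume) are not defined
# in this section. A minimal sketch consistent with the call sites
# save_checkpoint(result_path, epoch, model, optimizer, best_loss) and
# resume(resume_path, model, optimizer); the real implementations may differ.
def save_checkpoint(result_path, epoch, model, optimizer, best_loss) -> None:
    # store everything needed to restart training from the next epoch
    torch.save(
        {
            "epoch": epoch,
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "best_loss": best_loss,
        },
        os.path.join(result_path, "checkpoint.pth"),
    )


def resume(resume_path, model, optimizer):
    # restore the training state saved by save_checkpoint
    checkpoint = torch.load(resume_path, map_location="cpu")
    begin_epoch = checkpoint["epoch"] + 1
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    best_loss = checkpoint["best_loss"]
    return begin_epoch, model, optimizer, best_loss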
def main() -> None:
    # argparser
    args = get_arguments()

    # configuration
    config = get_config(args.config)
    result_path = os.path.dirname(args.config)

    # fix seeds for reproducibility
    seed = args.seed
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

    # cpu or cuda
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        torch.backends.cudnn.benchmark = True

    # Dataloader
    # Temporal downsampling is applied only to videos in 50Salads
    downsamp_rate = 2 if config.dataset == "50salads" else 1

    train_data = ActionSegmentationDataset(
        config.dataset,
        transform=Compose([ToTensor(), TempDownSamp(downsamp_rate)]),
        mode="trainval" if not config.param_search else "training",
        split=config.split,
        dataset_dir=config.dataset_dir,
        csv_dir=config.csv_dir,
    )

    train_loader = DataLoader(
        train_data,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
        drop_last=True if config.batch_size > 1 else False,
        collate_fn=collate_fn,
    )

    # if you validate to determine hyperparams
    if config.param_search:
        val_data = ActionSegmentationDataset(
            config.dataset,
            transform=Compose([ToTensor(), TempDownSamp(downsamp_rate)]),
            mode="validation",
            split=config.split,
            dataset_dir=config.dataset_dir,
            csv_dir=config.csv_dir,
        )

        val_loader = DataLoader(
            val_data,
            batch_size=1,
            shuffle=False,
            num_workers=config.num_workers,
            collate_fn=collate_fn,
        )

    # load model
    print("---------- Loading Model ----------")

    n_classes = get_n_classes(config.dataset, dataset_dir=config.dataset_dir)

    model = models.ActionSegmentRefinementFramework(
        in_channel=config.in_channel,
        n_features=config.n_features,
        n_classes=n_classes,
        n_stages=config.n_stages,
        n_layers=config.n_layers,
        n_stages_asb=config.n_stages_asb,
        n_stages_brb=config.n_stages_brb,
    )

    # send the model to cuda/cpu
    model.to(device)

    optimizer = get_optimizer(
        config.optimizer,
        model,
        config.learning_rate,
        momentum=config.momentum,
        dampening=config.dampening,
        weight_decay=config.weight_decay,
        nesterov=config.nesterov,
    )

    # keep training (and validation) log
    columns = ["epoch", "lr", "train_loss"]

    # if you validate to determine hyperparams
    if config.param_search:
        columns += ["val_loss", "cls_acc", "edit"]
        columns += [
            "segment f1s@{}".format(config.iou_thresholds[i])
            for i in range(len(config.iou_thresholds))
        ]
        columns += ["bound_acc", "precision", "recall", "bound_f1s"]

    begin_epoch = 0
    best_loss = float("inf")
    log = pd.DataFrame(columns=columns)

    # resume training if a checkpoint exists
    if args.resume:
        if os.path.exists(os.path.join(result_path, "checkpoint.pth")):
            checkpoint = resume(result_path, model, optimizer)
            begin_epoch, model, optimizer, best_loss = checkpoint
            log = pd.read_csv(os.path.join(result_path, "log.csv"))
            print("training will start from {} epoch".format(begin_epoch))
        else:
            print("there is no checkpoint at the result folder")

    # criterion for loss
    if config.class_weight:
        class_weight = get_class_weight(
            config.dataset,
            split=config.split,
            dataset_dir=config.dataset_dir,
            csv_dir=config.csv_dir,
            mode="training" if config.param_search else "trainval",
        )
        class_weight = class_weight.to(device)
    else:
        class_weight = None

    criterion_cls = ActionSegmentationLoss(
        ce=config.ce,
        focal=config.focal,
        tmse=config.tmse,
        gstmse=config.gstmse,
        weight=class_weight,
        ignore_index=255,
        ce_weight=config.ce_weight,
        focal_weight=config.focal_weight,
        tmse_weight=config.tmse_weight,
        gstmse_weight=config.gstmse,
    )

    pos_weight = get_pos_weight(
        dataset=config.dataset,
        split=config.split,
        csv_dir=config.csv_dir,
        mode="training" if config.param_search else "trainval",
    ).to(device)

    criterion_bound = BoundaryRegressionLoss(pos_weight=pos_weight)

    # train and validate model
    print("---------- Start training ----------")

    for epoch in range(begin_epoch, config.max_epoch):
        # training
        train_loss = train(
            train_loader,
            model,
            criterion_cls,
            criterion_bound,
            config.lambda_b,
            optimizer,
            epoch,
            device,
        )

        # if you validate to determine hyperparams
        if config.param_search:
            (
                val_loss,
                cls_acc,
                edit_score,
                segment_f1s,
                bound_acc,
                precision,
                recall,
                bound_f1s,
            ) = validate(
                val_loader,
                model,
                criterion_cls,
                criterion_bound,
                config.lambda_b,
                device,
                config.dataset,
                config.dataset_dir,
                config.iou_thresholds,
                config.boundary_th,
                config.tolerance,
            )

            # save the model if the validation loss is lower than ever
            if best_loss > val_loss:
                best_loss = val_loss
                torch.save(
                    model.state_dict(),
                    os.path.join(result_path, "best_loss_model.prm"),
                )

        # save checkpoint every epoch
        save_checkpoint(result_path, epoch, model, optimizer, best_loss)

        # write logs to dataframe and csv file
        tmp = [epoch, optimizer.param_groups[0]["lr"], train_loss]

        # if you validate to determine hyperparams
        if config.param_search:
            tmp += [val_loss, cls_acc, edit_score]
            tmp += segment_f1s
            tmp += [bound_acc, precision, recall, bound_f1s]

        tmp_df = pd.Series(tmp, index=log.columns)

        log = pd.concat([log, tmp_df.to_frame().T], ignore_index=True)
        log.to_csv(os.path.join(result_path, "log.csv"), index=False)

        if config.param_search:
            # if you validate to determine hyperparams
            print(
                "epoch: {}\tlr: {:.4f}\ttrain loss: {:.4f}\tval loss: {:.4f}\t"
                "val_acc: {:.4f}\tedit: {:.4f}".format(
                    epoch,
                    optimizer.param_groups[0]["lr"],
                    train_loss,
                    val_loss,
                    cls_acc,
                    edit_score,
                ))
        else:
            print("epoch: {}\tlr: {:.4f}\ttrain loss: {:.4f}".format(
                epoch, optimizer.param_groups[0]["lr"], train_loss))

    # delete checkpoint
    os.remove(os.path.join(result_path, "checkpoint.pth"))

    # save the final model
    torch.save(model.state_dict(), os.path.join(result_path, "final_model.prm"))

    print("Done!")
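# The DataLoaders above pass a custom collate_fn because videos have different
# temporal lengths. A minimal sketch under that assumption: features are
# zero-padded along the temporal axis and labels padded with 255, matching the
# ignore_index passed to ActionSegmentationLoss. The sample keys used here
# ("feature", "label") are illustrative and may not match the real dataset.
import torch.nn.functional as F


def collate_fn(batch):
    max_len = max(sample["feature"].shape[-1] for sample in batch)

    features, labels = [], []
    for sample in batch:
        feature, label = sample["feature"], sample["label"]
        pad = max_len - feature.shape[-1]
        # (in_channel, T) -> (in_channel, max_len), padded with zeros
        features.append(F.pad(feature, (0, pad), value=0.0))
        # (T,) -> (max_len,); padded frames are ignored by the loss
        labels.append(F.pad(label, (0, pad), value=255))

    return {
        "feature": torch.stack(features),
        "label": torch.stack(labels),
    }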
def main():
    args = get_arguments()

    # configuration
    config = get_config(args.config)
    result_path = os.path.dirname(args.config)
    experiment_name = os.path.basename(result_path)

    if os.path.exists(os.path.join(result_path, "final_model.prm")):
        print("Already done.")
        return

    # cpu or cuda
    device = get_device(allow_only_gpu=True)

    # Dataloader
    transform = DataTransform(config.size, get_mean())
    voc_classes = [k for k in get_cls2id_map().keys()]

    train_loader = get_dataloader(
        config.train_csv,
        phase="train",
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
        pin_memory=True,
        drop_last=True,
        transform=transform,
        transform_anno=Anno_xml2list(voc_classes),
    )

    val_loader = get_dataloader(
        config.val_csv,
        phase="val",
        batch_size=1,
        shuffle=False,
        num_workers=config.num_workers,
        pin_memory=True,
        transform=transform,
        transform_anno=Anno_xml2list(voc_classes),
    )

    # the number of classes (+1 for background)
    n_classes = len(voc_classes) + 1

    model = get_model(
        input_size=config.size,
        n_classes=n_classes,
        phase="train",
        pretrained=config.pretrained,
    )

    # send the model to cuda/cpu
    model.to(device)

    optimizer = optim.SGD(
        model.parameters(),
        lr=config.learning_rate,
        momentum=0.9,
        weight_decay=5e-4,
    )

    # keep training and validation log
    begin_epoch = 0
    best_loss = float("inf")

    # TODO: reconsider the evaluation metrics
    log = pd.DataFrame(
        columns=[
            "epoch",
            "lr",
            "train_time[sec]",
            "train_loss",
            "val_time[sec]",
            "val_loss",
        ]
    )

    # resume training if a checkpoint exists
    if args.resume:
        resume_path = os.path.join(result_path, "checkpoint.pth")
        begin_epoch, model, optimizer, best_loss = resume(
            resume_path, model, optimizer)

        log_path = os.path.join(result_path, "log.csv")
        assert os.path.exists(log_path), "there is no checkpoint at the result folder"
        log = pd.read_csv(log_path)

    # criterion for loss
    criterion = get_criterion(device=device)

    # train and validate model
    print("---------- Start training ----------")

    for epoch in range(begin_epoch, config.max_epoch):
        # training
        start = time.time()
        train_loss = train(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            device,
            interval_of_progress=10,
        )
        train_time = int(time.time() - start)

        # validation
        start = time.time()
        val_loss = evaluate(
            val_loader,
            model,
            criterion,
            device,
        )
        val_time = int(time.time() - start)

        # save the model if the validation loss is lower than ever
        if best_loss > val_loss:
            best_loss = val_loss
            torch.save(
                model.state_dict(),
                os.path.join(result_path, "best_model.prm"),
            )

        # save checkpoint every epoch
        save_checkpoint(result_path, epoch, model, optimizer, best_loss)

        # write logs to dataframe and csv file
        tmp = pd.Series(
            [
                epoch,
                optimizer.param_groups[0]["lr"],
                train_time,
                train_loss,
                val_time,
                val_loss,
            ],
            index=log.columns,
        )

        log = pd.concat([log, tmp.to_frame().T], ignore_index=True)
        log.to_csv(os.path.join(result_path, "log.csv"), index=False)
        make_graphs(os.path.join(result_path, "log.csv"))

        print(
            "epoch: {}\tepoch time[sec]: {}\tlr: {}\ttrain loss: {:.4f}\t"
            "val loss: {:.4f}".format(
                epoch,
                train_time + val_time,
                optimizer.param_groups[0]["lr"],
                train_loss,
                val_loss,
            )
        )

    # save the final model
    torch.save(model.state_dict(), os.path.join(result_path, "final_model.prm"))

    # delete checkpoint
    os.remove(os.path.join(result_path, "checkpoint.pth"))

    print("Done")
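# make_graphs is called after every epoch but not defined in this section.
# A minimal sketch that renders one plot per logged metric from log.csv with
# matplotlib; the column selection and output filenames are assumptions.
import matplotlib

matplotlib.use("Agg")  # write image files without a display
import matplotlib.pyplot as plt


def make_graphs(log_path: str) -> None:
    df = pd.read_csv(log_path)
    save_dir = os.path.dirname(log_path)

    # one graph per metric, with epoch on the x axis
    for column in df.columns:
        if column == "epoch":
            continue
        plt.figure()
        plt.plot(df["epoch"], df[column])
        plt.xlabel("epoch")
        plt.ylabel(column)
        plt.savefig(os.path.join(save_dir, "{}.png".format(column)))
        plt.close()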
def main():
    args = get_arguments()

    # configuration
    config = get_config(args.config)
    result_path = os.path.dirname(args.config)
    experiment_name = os.path.basename(result_path)

    if os.path.exists(os.path.join(result_path, "final_model_G.prm")):
        print("Already done.")
        return

    # cpu or cuda
    device = get_device(allow_only_gpu=True)

    # Dataloader
    train_loader = get_dataloader(
        csv_file=config.train_csv,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
        pin_memory=True,
        drop_last=True,
        transform=ImageTransform(mean=get_mean(), std=get_std()),
    )

    # model is a dict holding the generator "G" and the discriminator "D"
    model = get_model(config.model, z_dim=config.z_dim, image_size=config.size)

    # send the models to cuda/cpu
    for v in model.values():
        v.to(device)

    g_optimizer = torch.optim.Adam(
        model["G"].parameters(),
        config.g_lr,
        [config.beta1, config.beta2],
    )
    d_optimizer = torch.optim.Adam(
        model["D"].parameters(),
        config.d_lr,
        [config.beta1, config.beta2],
    )
    optimizer = {
        "G": g_optimizer,
        "D": d_optimizer,
    }

    # keep training log
    begin_epoch = 0
    best_loss = float("inf")

    # TODO: reconsider the evaluation metrics
    log = pd.DataFrame(
        columns=[
            "epoch",
            "d_lr",
            "g_lr",
            "train_time[sec]",
            "train_loss",
            "train_d_loss",
            "train_g_loss",
        ]
    )

    # resume training if checkpoints exist
    if args.resume:
        resume_path = os.path.join(result_path, "checkpoint_%s.pth")
        begin_epoch, model, optimizer, best_loss = resume(
            resume_path, model, optimizer)

        log_path = os.path.join(result_path, "log.csv")
        assert os.path.exists(log_path), "there is no checkpoint at the result folder"
        log = pd.read_csv(log_path)

    # criterion for loss
    criterion = nn.BCEWithLogitsLoss(reduction="mean")

    # train the model
    print("---------- Start training ----------")

    for epoch in range(begin_epoch, config.max_epoch):
        # training
        start = time.time()
        train_d_loss, train_g_loss = train(
            train_loader,
            model,
            config.model,
            criterion,
            optimizer,
            epoch,
            config.z_dim,
            device,
            interval_of_progress=1,
        )
        train_time = int(time.time() - start)

        # save the models if the sum of the losses is lower than ever
        if best_loss > train_d_loss + train_g_loss:
            best_loss = train_d_loss + train_g_loss
            for k in model.keys():
                torch.save(
                    model[k].state_dict(),
                    os.path.join(result_path, "best_model_%s.prm" % k),
                )

        # save checkpoint every epoch
        save_checkpoint(result_path, epoch, model, optimizer, best_loss)

        # write logs to dataframe and csv file
        tmp = pd.Series(
            [
                epoch,
                optimizer["D"].param_groups[0]["lr"],
                optimizer["G"].param_groups[0]["lr"],
                train_time,
                train_d_loss + train_g_loss,
                train_d_loss,
                train_g_loss,
            ],
            index=log.columns,
        )

        log = pd.concat([log, tmp.to_frame().T], ignore_index=True)
        log.to_csv(os.path.join(result_path, "log.csv"), index=False)
        make_graphs(os.path.join(result_path, "log.csv"))

        print(
            "epoch: {}\tepoch time[sec]: {}\tD_lr: {}\tG_lr: {}\t"
            "train loss: {:.4f}\ttrain d_loss: {:.4f}\ttrain g_loss: {:.4f}".format(
                epoch,
                train_time,
                optimizer["D"].param_groups[0]["lr"],
                optimizer["G"].param_groups[0]["lr"],
                train_d_loss + train_g_loss,
                train_d_loss,
                train_g_loss,
            )
        )

    # save the final models
    for k in model.keys():
        torch.save(
            model[k].state_dict(),
            os.path.join(result_path, "final_model_%s.prm" % k),
        )

    # delete checkpoints
    for k in model.keys():
        os.remove(os.path.join(result_path, "checkpoint_%s.pth" % k))

    print("Done")
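# train above is not defined in this section. A minimal sketch of a single
# alternating update for a DCGAN-style model with nn.BCEWithLogitsLoss,
# consistent with the dict-based model/optimizer used above; the real loop may
# differ (progress reporting, model-specific tricks), and the (B, z_dim, 1, 1)
# noise shape is an assumption about the generator's input.
def train_step(imgs, model, criterion, optimizer, z_dim, device):
    batch_size = imgs.size(0)
    imgs = imgs.to(device)
    real_label = torch.ones(batch_size, device=device)
    fake_label = torch.zeros(batch_size, device=device)

    # update the discriminator: push real images toward 1, fakes toward 0
    z = torch.randn(batch_size, z_dim, 1, 1, device=device)
    fake_imgs = model["G"](z)
    d_real = model["D"](imgs).view(-1)
    d_fake = model["D"](fake_imgs.detach()).view(-1)  # detach: no grads into G
    d_loss = criterion(d_real, real_label) + criterion(d_fake, fake_label)
    optimizer["D"].zero_grad()
    d_loss.backward()
    optimizer["D"].step()

    # update the generator: make the discriminator output 1 on fakes
    d_fake = model["D"](fake_imgs).view(-1)
    g_loss = criterion(d_fake, real_label)
    optimizer["G"].zero_grad()
    g_loss.backward()
    optimizer["G"].step()

    return d_loss.item(), g_loss.item()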