import torch
from torch import optim


def train(optimizer, num_classes, num_epochs, scheduler, device):
    load = get_dataset()
    model = get_model_instance_segmentation(num_classes)
    model = model.to(device)

    if optimizer == 'Adam':
        exp_optimizer = optim.Adam(model.parameters(), lr=1e-3)
    else:
        exp_optimizer = optim.SGD(model.parameters(), lr=0.005,
                                  momentum=0.9, weight_decay=0.0005)

    if scheduler:
        lr_scheduler = optim.lr_scheduler.StepLR(exp_optimizer,
                                                 step_size=3, gamma=0.1)

    for epoch in range(num_epochs):
        train_one_epoch(model, exp_optimizer, load['train'], device,
                        epoch, print_freq=10)
        # only step the scheduler if one was created above, otherwise
        # lr_scheduler is unbound and this would raise a NameError
        if scheduler:
            lr_scheduler.step()
        evaluate(model, load['val'], device=device)

    torch.save(model.state_dict(), 'best_model')
    print('Finished')
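# For reference, a hedged sketch of how this train() entry point might be
# invoked; the argument values here (4 classes, 10 epochs) are illustrative
# assumptions, not taken from the original code.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train(optimizer='Adam', num_classes=4, num_epochs=10, scheduler=True,
      device=device)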
# Nº of classes: background, with_mask, mask_weared_incorrect, without_mask
# Build model (Faster R-CNN)
num_classes = 4
model = helper.build_model(num_classes)
model = model.to(device)

# Load saved weights
model.load_state_dict(torch.load(PATH))

# ----------------------------- Evaluation & Predictions -----------------------------
# put the model in evaluation mode
model.eval()

# Evaluate the model
evaluate(model, loader_test, device=device)

# Make a prediction on a random image
n = randint(0, len(dataset_test) - 1)
img, target = dataset_test[n]
with torch.no_grad():
    prediction = model([img.to(device)])[0]

# Non-max suppression to reduce the number of bounding boxes
nms_prediction = helper.apply_nms(prediction, iou_thresh=0.5)
# Remove low-score boxes
filtered_prediction = helper.remove_low_score_bb(nms_prediction, score_thresh=0.2)
# Draw bounding boxes
helper.draw_bounding_boxes(img.detach().cpu(), filtered_prediction)
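# The helper.apply_nms and helper.remove_low_score_bb implementations are not
# shown above; a minimal sketch of what they might look like, built on
# torchvision.ops.nms and assuming the standard torchvision detection output
# dict with per-box 'boxes', 'scores', and 'labels' tensors.
import torchvision


def apply_nms(prediction, iou_thresh=0.5):
    # keep the indices that survive IoU-based suppression
    keep = torchvision.ops.nms(prediction['boxes'], prediction['scores'],
                               iou_thresh)
    return {k: v[keep] for k, v in prediction.items()}


def remove_low_score_bb(prediction, score_thresh=0.2):
    # boolean mask over boxes whose confidence clears the threshold
    keep = prediction['scores'] > score_thresh
    return {k: v[keep] for k, v in prediction.items()}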
# train for one epoch, printing every <print_freq> iterations
training_results, train_iterations_log = train_one_epoch(
    model, optimizer, loader_train, device, epoch,
    print_freq=1, df=train_iterations_log)

# add epoch logs to df
train_epochs_log = helper.df_add_epoch_log(train_epochs_log, epoch,
                                           training_results)

# evaluate on the validation data set
mAP = evaluate(model, loader_validation, device=device)

# keep the best model so far
if mAP > best_mAP:
    best_mAP = mAP
    torch.save(model.state_dict(), PATH + '/' + filename + '.pt')

# update the learning rate
lr_scheduler.step()

# ----------------------------- Save Training Logs -----------------------------
train_epochs_log.to_csv(PATH + '/' + filename + '_epochs.csv', index=False)
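# helper.df_add_epoch_log is not shown above; one plausible shape for it,
# assuming training_results is a dict of scalar metrics for the epoch.
import pandas as pd


def df_add_epoch_log(df, epoch, training_results):
    # append one row per epoch to the running log DataFrame
    row = {'epoch': epoch, **training_results}
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)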
# Training loop
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9,
                            weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3,
                                               gamma=0.1)
num_epochs = 10

for e in range(num_epochs):
    # train for one epoch
    train_one_epoch(model, optimizer, train_loader, device, e, print_freq=10)
    # update learning rate
    lr_scheduler.step()
    # evaluate on the validation dataset
    print('entering eval')
    print(len(val_loader))
    evaluate(model, val_loader, device=device)
    # save a checkpoint every 10 epochs (with num_epochs=10 this only
    # fires at epoch 0)
    if e % 10 == 0:
        torch.save({'epoch': e, 'model_state_dict': model.state_dict()},
                   f'leaf_od{e}EPOCH_checkpoint.pt')
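# Resuming from one of the checkpoints saved above (a sketch; the epoch-0
# filename matches what the f-string produces on the first save).
checkpoint = torch.load('leaf_od0EPOCH_checkpoint.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
start_epoch = checkpoint['epoch'] + 1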
def main(args):
    utils.init_distributed_mode(args)
    print(args)
    device = torch.device(args.device)

    # Data loading code
    print("Loading data")
    dataset, num_classes = get_dataset(args.dataset, "train",
                                       get_transform(train=True),
                                       args.data_path)
    dataset_test, _ = get_dataset(args.dataset, "val",
                                  get_transform(train=False), args.data_path)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(dataset,
                                               k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids,
                                                  args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(train_sampler,
                                                            args.batch_size,
                                                            drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_sampler=train_batch_sampler,
        num_workers=args.workers, collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, sampler=test_sampler,
        num_workers=args.workers, collate_fn=utils.collate_fn)

    print("Creating model")
    # model = torchvision.models.detection.__dict__[args.model](
    #     num_classes=num_classes, pretrained=args.pretrained)
    model = get_model(num_classes=num_classes)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(
    #     optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        print("----------------------Resume--------------")
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch,
                        args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'args': args,
                    'epoch': epoch
                },
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
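# get_transform is referenced but not defined above; in the torchvision
# detection references it is typically a small helper like this (a sketch,
# assuming the reference transforms.py module vendored with these scripts,
# not torchvision.transforms).
import transforms as T


def get_transform(train):
    transforms = [T.ToTensor()]
    if train:
        # flip images and their boxes together during training only
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)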
def main(args):
    input_size = (224, 224)
    best_acc = 0.0

    # prepare output folder
    if args.output_dir:
        if not Path(args.output_dir).is_dir():
            Path(args.output_dir).mkdir()

    # read config
    with open(args.cfg, 'r') as f:
        cfg_dict = yaml.load(f, Loader=yaml.FullLoader)
    config_stem = Path(args.cfg).stem
    hyp = cfg_dict['hyp']
    data = cfg_dict['data']
    # sort as sklearn.preprocessing.LabelEncoder.fit_transform() does
    names = np.unique(data['names'])

    # set device mode
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # create model
    model_name = args.model
    nc = data['nc']
    feature_extract = hyp['feature_extract']
    print('[INFO] Creating model ({})'.format(model_name))
    model, input_size = initialize_model(model_name, nc, feature_extract)
    model.to(device)

    # load data
    print('[INFO] Loading data')
    train_csv = data['train']
    val_csv = data['val']
    train_dataset, val_dataset, train_sampler = load_data_from_csv(
        train_csv, val_csv, input_size, args.transform)

    # dataloaders
    batch_size = hyp['batch_size']
    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              sampler=train_sampler, num_workers=args.workers)
    val_loader = DataLoader(val_dataset, batch_size=batch_size,
                            shuffle=False, num_workers=args.workers)

    # criterion + optimizer + scheduler
    learning_rate = hyp['lr']
    momentum = hyp['momentum']
    weight_decay = hyp['weight_decay']
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                          momentum=momentum, weight_decay=weight_decay)
    # scheduler = optim.lr_scheduler.MultiStepLR(
    #     optimizer,
    #     milestones=[0.5 * args.total_epochs, 0.8 * args.total_epochs],
    #     gamma=0.1)

    # create tensorboard writer
    logdir = f'runs/{model_name}_{config_stem}'
    writer = SummaryWriter(log_dir=logdir)

    if args.resume:
        print('[INFO] Load checkpoint')
        ckpt = torch.load(args.resume, map_location=device)
        model.load_state_dict(ckpt['model_state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        args.start_epoch = ckpt['epoch'] + 1
        best_acc = ckpt['best_acc'] if 'best_acc' in ckpt else ckpt['acc']

    if args.eval:
        ckpt_ = torch.load(args.eval, map_location=device)
        # checkpoints saved below keep the weights under 'model_state_dict'
        # (the original loaded ckpt_['model'], which holds the whole module)
        model.load_state_dict(ckpt_['model_state_dict'])
        evaluate(val_loader, model, names, device)
        return

    # train
    start_epoch = args.start_epoch
    total_epochs = hyp['total_epochs']
    try:
        print('[INFO] Starting training')
        start_time = time.time()
        for epoch in range(start_epoch, total_epochs):
            epoch_info = f'Epoch {epoch}/{total_epochs - 1}'
            print(epoch_info)
            print('-' * len(epoch_info))

            # train engine
            train_acc, train_loss = train_one_epoch(train_loader, model,
                                                    criterion, optimizer,
                                                    epoch, device)
            val_acc, val_loss = validate(val_loader, model, criterion, device)
            # scheduler.step()

            # logging to tensorboard
            writer.add_scalar('Loss/train', train_loss, epoch)
            writer.add_scalar('Loss/val', val_loss, epoch)
            writer.add_scalar('Acc/train', train_acc, epoch)
            writer.add_scalar('Acc/val', val_acc, epoch)

            # print training info
            info = (f'loss {train_loss:.3f} accuracy {train_acc:.1f}% '
                    f'val_loss {val_loss:.3f} val_accuracy {val_acc:.1f}%\n')
            print(info)

            is_best = val_acc > best_acc
            if is_best:
                best_acc = val_acc
                print('Found new best val_acc: {:6.2f}!\n'.format(best_acc))

            # save checkpoint every epoch (only written when best or forced)
            checkpoint = {
                'epoch': epoch,
                'acc': val_acc,
                'model': model,
                'model_state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }
            filepath = str(Path(args.output_dir).joinpath(
                f'{model_name}_{config_stem}.pt'))
            save_checkpoint(checkpoint, filepath, epoch, is_best)
    except KeyboardInterrupt:
        print('[INFO] Training interrupted. Saving checkpoint')
        print('[INFO] Best val_acc: {:.2f}'.format(best_acc))
        filepath = str(Path(args.output_dir).joinpath(
            f'{model_name}_{config_stem}_{epoch - 1}.pt'))
        save_checkpoint(checkpoint, filepath, epoch, force_save=True)
        writer.flush()
        writer.close()
        sys.exit(0)

    # flush and close tensorboard writer
    writer.flush()
    writer.close()

    elapsed_time = time.time() - start_time
    elapsed_str = str(datetime.timedelta(seconds=int(elapsed_time)))
    print('[INFO] Training complete in: {}'.format(elapsed_str))
    print('[INFO] Best val_acc: {:.2f}'.format(best_acc))
    filepath = str(Path(args.output_dir).joinpath(
        f'{model_name}_{config_stem}_final.pt'))
    save_checkpoint(checkpoint, filepath, epoch, force_save=True)
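# save_checkpoint is called above with both is_best and force_save; one
# plausible implementation consistent with those call sites (a sketch,
# not the project's actual helper).
import torch


def save_checkpoint(state, filepath, epoch, is_best=False, force_save=False):
    # write the checkpoint when the model improved or a save is forced
    if is_best or force_save:
        torch.save(state, filepath)
        print(f'[INFO] Saved checkpoint for epoch {epoch} to {filepath}')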
num_classes = 2
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

# replace the box predictor head
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

# replace the mask predictor head
in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
hidden_layer = 256
model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                   hidden_layer, num_classes)

# Training
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9,
                            weight_decay=0.0005)

EPOCHS = 10
for epoch in range(EPOCHS):
    train_one_epoch(model, optimizer, train_loader, device, epoch,
                    print_freq=10)
    evaluate(model, test_loader, device=device)
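# train_loader and test_loader are assumed above; detection models consume
# lists of variable-sized images and target dicts, so their loaders need a
# pass-through collate_fn. A sketch, with train_dataset/test_dataset as
# hypothetical dataset objects.
from torch.utils.data import DataLoader


def collate_fn(batch):
    # turn [(img, target), ...] into ([img, ...], [target, ...])
    return tuple(zip(*batch))


train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True,
                          collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False,
                         collate_fn=collate_fn)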
# train_loader construction (truncated in the original source); batch_size
# and shuffle here are assumptions mirroring the validation loader below
train_loader = DataLoader(train_data, batch_size=32, shuffle=True,
                          num_workers=4)
val_data = AnnoData('../datasets/dev.json', name_to_com, id_to_com,
                    threshold=args.threshold, result_dir=result_dir)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False,
                        num_workers=4)
print('datasets successfully loaded!\n')

print('config model and optim...')
model = NerdModel(config)
# torch.cuda.empty_cache()
model = model.cuda()
model = torch.nn.DataParallel(model)
optim = get_optim(args, model)  # , momentum=0.98, weight_decay=2e-5)
criterion = nn.BCEWithLogitsLoss()

print('start training!')
for epoch in range(args.epochs):
    if epoch == args.decay_epoch:
        adjust_lr(optim, args.lr_decay)
    print('\nEpoch: %d, LR: %e' % (epoch, optim.param_groups[0]['lr']))
    train(model, optim, criterion, train_loader)
    f1 = evaluate(model, val_data, val_loader, epoch)
    if f1 > 0.97:
        torch.save(model.state_dict(),
                   result_dir + '/ckpts/epoch%d_%5f.pkl' % (epoch, f1))
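# adjust_lr is not shown above; a common shape for it, assuming lr_decay is
# a multiplicative factor below 1 (a sketch, not the project's helper).
def adjust_lr(optimizer, lr_decay):
    # scale every parameter group's learning rate in place
    for param_group in optimizer.param_groups:
        param_group['lr'] *= lr_decay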
# and a learning rate scheduler which decreases the learning rate by
# 10x every 15 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=15,
                                               gamma=0.1)

# TRAINING LOOP
save_fr = 1
# make sure that print_freq is smaller than len(dataset) & len(dataset_test)
print_freq = 25
os.makedirs('./maskrcnn_saved_models', exist_ok=True)

for epoch in range(num_epochs):
    # train for one epoch, printing every print_freq iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch,
                    print_freq=print_freq)
    # save a checkpoint every save_fr epochs
    if epoch % save_fr == 0:
        torch.save(model.state_dict(),
                   './maskrcnn_saved_models/mask_rcnn_model_epoch_{}.pt'.format(epoch))
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)
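# Unlike the dict-style checkpoints elsewhere in this collection, this loop
# saves bare state_dicts, so reloading is a single load_state_dict call
# (a sketch; epoch 5 is an arbitrary example).
model.load_state_dict(
    torch.load('./maskrcnn_saved_models/mask_rcnn_model_epoch_5.pt'))
model.eval()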
def main():
    env_dict = fetch_env_dict()
    model = MODEL_DISPATCHER[env_dict["BASE_MODEL"]](pretrained=True)
    model.to(env_dict["DEVICE"])

    parent = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    df = pd.read_csv(os.path.join(parent, "data/train_full.csv"))

    # TODO(Sayar): Remove hacky code here
    train_image_paths = df[df["kfold"].isin(
        env_dict["TRAINING_FOLDS"])]["img_path"].values.tolist()
    val_image_paths = df[df["kfold"].isin(
        env_dict["VALIDATION_FOLDS"])]["img_path"].values.tolist()
    train_image_paths = [
        os.path.join(parent, "data", img_id) for img_id in train_image_paths
    ]
    val_image_paths = [
        os.path.join(parent, "data", img_id) for img_id in val_image_paths
    ]
    targets = {col: df[col].values for col in df.columns.tolist()[1:-1]}

    # random augmentations for training; the original applied Normalize
    # twice, which double-normalizes the images, so the second Normalize
    # has been dropped here
    train_aug = A.Compose([
        A.Normalize(
            env_dict["MODEL_MEAN"],
            env_dict["MODEL_STD"],
            max_pixel_value=255.0,
            always_apply=True,
        ),
        A.CenterCrop(100, 100),
        A.RandomCrop(80, 80),
        A.HorizontalFlip(p=0.5),
        A.Rotate(limit=(-90, 90)),
        A.VerticalFlip(p=0.5),
    ])
    # deterministic pipeline for validation (the original reused the random
    # training augmentations here); center-crop to the same 80x80 input
    # size the training pipeline produces
    valid_aug = A.Compose([
        A.Normalize(
            env_dict["MODEL_MEAN"],
            env_dict["MODEL_STD"],
            max_pixel_value=255.0,
            always_apply=True,
        ),
        A.CenterCrop(80, 80),
    ])

    train_dataset = ClassificationDataset(
        image_paths=train_image_paths,
        targets=targets,
        resize=(env_dict["IMG_HEIGHT"], env_dict["IMG_WIDTH"]),
        augmentations=train_aug,
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=env_dict["TRAIN_BATCH_SIZE"],
        shuffle=True,
        num_workers=4,
    )
    valid_dataset = ClassificationDataset(
        image_paths=val_image_paths,
        targets=targets,
        resize=(env_dict["IMG_HEIGHT"], env_dict["IMG_WIDTH"]),
        augmentations=valid_aug,
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=env_dict["VALID_BATCH_SIZE"],
        shuffle=False,
        num_workers=4,
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", patience=5, factor=0.4, verbose=True)

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    for epoch in range(env_dict["EPOCHS"]):
        train(train_dataset, train_data_loader, env_dict, model, optimizer)
        val_score = evaluate(valid_dataset, valid_data_loader, env_dict, model)
        scheduler.step(val_score)
        print(f"EPOCH: {epoch}, validation error: {val_score}")
        torch.save(
            model.state_dict(),
            os.path.join(
                parent,
                f"models/{env_dict['BASE_MODEL']}_fold{env_dict['VALIDATION_FOLDS'][0]}.bin",
            ),
        )
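# Because the state_dict above may be saved from an nn.DataParallel wrapper,
# its keys can carry a 'module.' prefix; a sketch of stripping that prefix
# when reloading into a bare (unwrapped) model. checkpoint_path is a
# hypothetical placeholder for one of the .bin files saved above.
state = torch.load(checkpoint_path, map_location='cpu')
state = {(k[len('module.'):] if k.startswith('module.') else k): v
         for k, v in state.items()}
model.load_state_dict(state)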
    # (the matching `if` branch precedes this excerpt)
    torch.save(detector.state_dict(), DETECTOR_PATH)
else:
    torch.save(detector.state_dict(), args.path_resume)

# Evaluation
if num_epochs_detection == 0:
    loss, accuracy = classificator.evaluate(test_data, test_label, verbose=1)
    predict_test = classificator(test_data)
    prediction = np.argmax(predict_test, axis=1)

    from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
    import matplotlib.pyplot as plt

    cm = confusion_matrix(non_cat, prediction)
    STYLES_HOTONE_ENCODE = {'M': 0, 'G': 1, 'R': 2, 'B': 3}
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=STYLES_HOTONE_ENCODE.keys())
    disp.plot(include_values=True, cmap='viridis')
    plt.savefig('confmatDeLDECAS.png')

    # Compute GED based on SHAP values
    shap_values_test = explainer.shap_values(test_data, nsamples=30,
                                             l1_reg='bic')
    d = GED_metric(test_data, shap_values_test, dataset=data)
    print(d)
    print(accuracy)

if j < num_epochs_detection or num_epochs_detection == 0:
    evaluate(detector, test_loader, device="cuda")

shutil.rmtree(TMP_PATH)
def main(): """ main """ config = model_config() if config.check: config.save_dir = "./tmp/" config.use_gpu = torch.cuda.is_available() and config.gpu >= 0 device = config.gpu torch.cuda.set_device(device) # Data definition corpus = KnowledgeCorpus(data_dir=config.data_dir, data_prefix=config.data_prefix, min_freq=0, max_vocab_size=config.max_vocab_size, min_len=config.min_len, max_len=config.max_len, embed_file=config.embed_file, with_label=config.with_label, share_vocab=config.share_vocab) corpus.load() if config.test and config.ckpt: corpus.reload(data_type='test') train_iter = corpus.create_batches( config.batch_size, "train", shuffle=True, device=device) valid_iter = corpus.create_batches( config.batch_size, "valid", shuffle=False, device=device) test_iter = corpus.create_batches( config.batch_size, "test", shuffle=False, device=device) # Model definition model = KnowledgeSeq2Seq(src_vocab_size=corpus.SRC.vocab_size, tgt_vocab_size=corpus.TGT.vocab_size, embed_size=config.embed_size, hidden_size=config.hidden_size, padding_idx=corpus.padding_idx, num_layers=config.num_layers, bidirectional=config.bidirectional, attn_mode=config.attn, with_bridge=config.with_bridge, tie_embedding=config.tie_embedding, dropout=config.dropout, use_gpu=config.use_gpu, use_bow=config.use_bow, use_dssm=config.use_dssm, use_pg=config.use_pg, use_gs=config.use_gs, pretrain_epoch=config.pretrain_epoch, use_posterior=config.use_posterior, weight_control=config.weight_control, concat=config.decode_concat) model_name = model.__class__.__name__ # Generator definition generator = TopKGenerator(model=model, src_field=corpus.SRC, tgt_field=corpus.TGT, cue_field=corpus.CUE, max_length=config.max_dec_len, ignore_unk=config.ignore_unk, length_average=config.length_average, use_gpu=config.use_gpu) # Interactive generation testing if config.interact and config.ckpt: model.load(config.ckpt) return generator # Testing elif config.test and config.ckpt: print(model) model.load(config.ckpt) print("Testing ...") metrics, scores = evaluate(model, test_iter) print(metrics.report_cum()) print("Generating ...") evaluate_generation(generator, test_iter, save_file=config.gen_file, verbos=True) else: # Load word embeddings if config.use_embed and config.embed_file is not None: model.encoder.embedder.load_embeddings( corpus.SRC.embeddings, scale=0.03) model.decoder.embedder.load_embeddings( corpus.TGT.embeddings, scale=0.03) # Optimizer definition optimizer = getattr(torch.optim, config.optimizer)( model.parameters(), lr=config.lr) # Learning rate scheduler if config.lr_decay is not None and 0 < config.lr_decay < 1.0: lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, factor=config.lr_decay, patience=1, verbose=True, min_lr=1e-5) else: lr_scheduler = None # Save directory date_str, time_str = datetime.now().strftime("%Y%m%d-%H%M%S").split("-") result_str = "{}-{}".format(model_name, time_str) if not os.path.exists(config.save_dir): os.makedirs(config.save_dir) # Logger definition logger = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG, format="%(message)s") fh = logging.FileHandler(os.path.join(config.save_dir, "train.log")) logger.addHandler(fh) # Save config params_file = os.path.join(config.save_dir, "params.json") with open(params_file, 'w') as fp: json.dump(config.__dict__, fp, indent=4, sort_keys=True) print("Saved params to '{}'".format(params_file)) logger.info(model) # Train logger.info("Training starts ...") trainer = Trainer(model=model, optimizer=optimizer, 
train_iter=train_iter, valid_iter=valid_iter, logger=logger, generator=generator, valid_metric_name="-loss", num_epochs=config.num_epochs, save_dir=config.save_dir, log_steps=config.log_steps, valid_steps=config.valid_steps, grad_clip=config.grad_clip, lr_scheduler=lr_scheduler, save_summary=False) if config.ckpt is not None: trainer.load(file_prefix=config.ckpt) trainer.train() logger.info("Training done!") # Test logger.info("") trainer.load(os.path.join(config.save_dir, "best")) logger.info("Testing starts ...") metrics, scores = evaluate(model, test_iter) logger.info(metrics.report_cum()) logger.info("Generation starts ...") test_gen_file = os.path.join(config.save_dir, "test.result") evaluate_generation(generator, test_iter, save_file=test_gen_file, verbos=True)
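# A note on valid_metric_name="-loss": the leading '-' conventionally marks
# a metric where lower is better, which a trainer can detect like this
# (a sketch of the assumed convention, not this Trainer's actual code).
valid_metric_name = "-loss"
is_decreased_valid_metric = valid_metric_name[0] == "-"  # lower loss is better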