def main(opt):
    logger = src.util.init_logger(is_main=True)
    tokenizer = transformers.BertTokenizerFast.from_pretrained('bert-base-uncased')
    model_class = src.model.Retriever
    #model, _, _, _, _, _ = src.util.load(model_class, opt.model_path, opt)
    model = model_class.from_pretrained(opt.model_path)

    model.eval()
    model = model.to(opt.device)
    if not opt.no_fp16:
        model = model.half()

    passages = src.util.load_passages(opt.passages)

    # Split the passage collection into shards so each process embeds only its slice.
    shard_size = len(passages) // opt.num_shards
    start_idx = opt.shard_id * shard_size
    end_idx = start_idx + shard_size
    if opt.shard_id == opt.num_shards - 1:
        end_idx = len(passages)

    passages = passages[start_idx:end_idx]
    logger.info(f'Embedding generation for {len(passages)} passages from idx {start_idx} to {end_idx}')

    allids, allembeddings = embed_passages(opt, passages, model, tokenizer)
    output_path = Path(opt.output_path)
    save_file = output_path.parent / (output_path.name + f'_{opt.shard_id:02d}')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    logger.info(f'Saving {len(allids)} passage embeddings to {save_file}')
    with open(save_file, mode='wb') as f:
        pickle.dump((allids, allembeddings), f)

    logger.info(f'Total passages processed {len(allids)}. Written to {save_file}.')
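# Illustrative only: a minimal sketch (not part of the original script) of how the per-shard
# pickle files written above could be merged downstream. It assumes each file holds the
# (allids, allembeddings) tuple that main() pickles and that allembeddings is a numpy array;
# the glob pattern and helper name below are hypothetical.
import glob
import pickle

import numpy as np


def load_shard_embeddings(prefix):
    """Concatenate (ids, embeddings) tuples from files like <prefix>_00, <prefix>_01, ..."""
    all_ids, all_embeddings = [], []
    for shard_file in sorted(glob.glob(prefix + '_*')):
        with open(shard_file, 'rb') as f:
            ids, embeddings = pickle.load(f)
        all_ids.extend(ids)
        all_embeddings.append(embeddings)
    return all_ids, np.concatenate(all_embeddings, axis=0)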
        global_rank=opt.global_rank,  # use the global rank and world size attributes to split the eval set over multiple GPUs
        world_size=opt.world_size)
    eval_dataset = src.data.Dataset(
        eval_examples,
        opt.n_context,
    )

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=opt.per_gpu_batch_size,
                                 num_workers=20,
                                 collate_fn=collator_function)

    model_class = src.model.FiDT5
    model = model_class.from_pretrained(opt.model_path)
    model = model.to(opt.device)

    logger.info("Start eval")
    exactmatch, total = evaluate(model, eval_dataset, eval_dataloader, tokenizer, opt)

    logger.info(f'EM {100*exactmatch:.2f}, Total number of examples {total}')

    if opt.write_results and opt.is_main:
        glob_path = Path(opt.checkpoint_dir) / opt.name / 'test_results'
        write_path = Path(opt.checkpoint_dir) / opt.name / 'final_output.json'
        src.util.write_output(glob_path, write_path)
    if opt.write_crossattention_scores:
        src.util.save_distributed_dataset(eval_dataset.data, opt)
device_str = "cuda:0" if torch.cuda.is_available() else "cpu" print("Device: %s\n" % device_str) device = torch.device(device_str) # Hyperparameter for Cutmix cutmix_beta = 0.3 # Hyperparameter epochs = 100 lr = 0.01 train_loader, valid_loader = data.load_data(batch_size=64) print("Train samples: %d" % len(train_loader.dataset)) print("Valid samples: %d" % len(valid_loader.dataset)) model = model.model() model = model.to(device) criterion_lss1 = nn.BCELoss() criterion_lss2 = nn.KLDivLoss(reduction='batchmean') criterion_ce = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=lr) time_str = time.strftime("%m_%d-%Hh%Mm%Ss", time.localtime()) file = open("../log/%s.csv" % time_str, 'w') writer = csv.writer(file) headers = [ "train_loss", "train_acc", "train_lsl", "train_lss_1", "train_lss_2", "train_lsd", "valid_loss", "valid_acc", "valid_lsl", "valid_lss_1", "valid_lss_2", "valid_lsd" ]
def run(args):
    ### Data Loading
    if args.task == 0:
        print('Task 0: MR Dataset Prediction')
        augmentor = transforms.Compose([
            transforms.Lambda(lambda x: torch.Tensor(x)),
            mrnet.torchsample.transforms.RandomRotate(25),
            mrnet.torchsample.transforms.RandomTranslate([0.11, 0.11]),
            mrnet.torchsample.transforms.RandomFlip(),
            transforms.Lambda(
                lambda x: x.repeat(3, 1, 1, 1).permute(1, 0, 2, 3)),
        ])

        job = 'acl'
        plane = 'sagittal'

        train_ds = mrnet.mrnet_dataloader.MRDataset(
            '/data/larson2/RCC_dl/MRNet-v1.0/data/',
            job,
            plane,
            transform=augmentor,
            train=True)
        train_loader = torch.utils.data.DataLoader(train_ds,
                                                   batch_size=1,
                                                   shuffle=True,
                                                   num_workers=11,
                                                   drop_last=False)

        val_ds = mrnet.mrnet_dataloader.MRDataset(
            '/data/larson2/RCC_dl/MRNet-v1.0/data/', job, plane, train=False)
        val_loader = torch.utils.data.DataLoader(val_ds,
                                                 batch_size=1,
                                                 shuffle=False,  # no shuffling for validation
                                                 num_workers=11,
                                                 drop_last=False)

    elif args.task == 1:
        print('Task 1: clear cell grade prediction')
        path = '/data/larson2/RCC_dl/new/clear_cell/'

        augmentor = transforms.Compose([
            transforms.Lambda(lambda x: torch.Tensor(x)),
            src.dataloader.Rescale(-160, 240),  # reset dynamic range
            transforms.Lambda(
                lambda x: x.repeat(3, 1, 1, 1).permute(3, 0, 1, 2)),
            # src.dataloader.Normalize(),
            # src.dataloader.Crop(90),
            # src.dataloader.RandomCenterCrop(90),
            src.dataloader.RandomHorizontalFlip(),
            src.dataloader.RandomRotate(25),
            src.dataloader.Resize(256),
        ])

        augmentor2 = transforms.Compose([
            transforms.Lambda(lambda x: torch.Tensor(x)),
            src.dataloader.Rescale(-160, 240),  # reset dynamic range
            transforms.Lambda(
                lambda x: x.repeat(3, 1, 1, 1).permute(3, 0, 1, 2)),
            # src.dataloader.Normalize(),
            # src.dataloader.Crop(90),
            src.dataloader.Resize(256),
        ])

        train_ds = src.dataloader.RCCDataset_h5(path, mode='train', transform=augmentor)
        train_loader = DataLoader(train_ds,
                                  batch_size=1,
                                  shuffle=True,
                                  num_workers=1,
                                  drop_last=False)

        val_ds = src.dataloader.RCCDataset_h5(path, mode='val', transform=augmentor2)
        val_loader = DataLoader(val_ds,
                                batch_size=1,
                                shuffle=True,
                                num_workers=1,
                                drop_last=False)

        print(f'train size: {len(train_loader)}')
        print(f'val size: {len(val_loader)}')

        pos_weight = args.weight

    ### Some Checkers
    print('Summary: ')
    print(f'\ttrain size: {len(train_loader)}')
    print(f'\tval size: {len(val_loader)}')
    print('\tDatatype = ', train_ds[1][0].dtype)
    print('\tMin = ', train_ds[1][0].min())
    print('\tMax = ', train_ds[1][0].max())
    print('\tInput size', train_ds[0][0].shape)
    print('\tweight = ', args.weight)

    ### Some trackers
    log_root_folder = "/data/larson2/RCC_dl/logs/"
    now = datetime.now()
    now = now.strftime("%Y%m%d-%H%M%S")
    logdir = os.path.join(
        log_root_folder,
        f"task_{args.task}_{args.prefix_name}_model{args.model}_{now}")
    os.makedirs(logdir)
    print(f'logdir = {logdir}')
    writer = SummaryWriter(logdir)

    ### Model Construction
    ## Select Model
    if args.model == 1:
        model = src.model.MRNet()
    elif args.model == 2:
        model = src.model.MRNet2()
    elif args.model == 3:
        model = src.model.MRNetBN()
    elif args.model == 4:
        model = src.model.MRResNet()
    elif args.model == 5:
        model = src.model.MRNetScratch()
    elif args.model == 6:
        model = src.model.TDNet()
    else:
        print('Invalid model name')
        return

    ## Weight Initialization

    ## Training Strategy
    device = torch.device(
        "cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu")
    print('\tCuda:', torch.cuda.is_available(), f'\n\tdevice = {device}')

    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.1)

    if args.lr_scheduler == "plateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               patience=3,
                                                               factor=.3,
                                                               threshold=1e-4,
                                                               verbose=True)
    elif args.lr_scheduler == "step":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=3,
                                                    gamma=args.gamma)

    model = model.to(device)

    ### Ready?
    best_val_loss = float('inf')
    best_val_auc = float(0)
    iteration_change_loss = 0
    t_start_training = time.time()

    ### Here we go
    for epoch in range(args.epochs):
        current_lr = src.train3d.get_lr(optimizer)
        t_start = time.time()

        train_loss, train_auc = src.train3d.train_model(
            model, train_loader, device, epoch, args.epochs, optimizer, writer,
            current_lr, args.log_every, args.weight)
        val_loss, val_auc = src.train3d.evaluate_model(
            model, val_loader, device, epoch, args.epochs, writer, current_lr,
            args.log_every,
        )

        if args.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif args.lr_scheduler == 'step':
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        print(
            "train loss : {0} | train auc {1} | val loss {2} | val auc {3} | elapsed time {4} s"
            .format(train_loss, train_auc, val_loss, val_auc, delta))

        iteration_change_loss += 1
        print('-' * 30)

        model_root_dir = "/data/larson2/RCC_dl/models/"
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            if bool(args.save_model):
                file_name = f'task_{args.task}_model_{args.model}_{args.prefix_name}_val_auc_{val_auc:0.4f}_train_auc_{train_auc:0.4f}_epoch_{epoch+1}_weight_{args.weight}_lr_{args.lr}_gamma_{args.gamma}_lrsche_{args.lr_scheduler}.pth'
                # for f in os.listdir(model_root_dir):
                #     if (args.prefix_name in f):
                #         os.remove(os.path.join(model_root_dir, f))
                torch.save(model, os.path.join(model_root_dir, file_name))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            iteration_change_loss = 0

        if iteration_change_loss == args.patience:
            print(
                'Early stopping after {0} epochs without improvement in val loss'
                .format(iteration_change_loss))
            break

    t_end_training = time.time()
    print(f'training took {t_end_training - t_start_training} s')
def run(args):
    print('Task 1: clear cell grade prediction')
    path = '/data/larson2/RCC_dl/new/clear_cell/'

    transform = {
        'train':
        transforms.Compose([
            transforms.Lambda(lambda x: torch.Tensor(x)),
            src.dataloader.Rescale(-160, 240, zero_center=True),  # reset dynamic range
            transforms.Lambda(
                lambda x: x.repeat(3, 1, 1, 1).permute(3, 0, 1, 2)),
            # src.dataloader.Normalize(),
            # src.dataloader.Crop(110),
            # src.dataloader.RandomCenterCrop(90),
            src.dataloader.RandomHorizontalFlip(),
            # src.dataloader.RandomRotate(25),
            src.dataloader.Resize(256)
        ]),
        'val':
        transforms.Compose([
            transforms.Lambda(lambda x: torch.Tensor(x)),
            src.dataloader.Rescale(-160, 240, zero_center=True),  # reset dynamic range
            transforms.Lambda(
                lambda x: x.repeat(3, 1, 1, 1).permute(3, 0, 1, 2)),
            # src.dataloader.Normalize(),
            # src.dataloader.Crop(90),
            src.dataloader.Resize(256)
        ])
    }

    my_dataset = {
        'train':
        src.dataloader.RCCDataset_h5(path, mode='train', transform=transform['train']),
        'val':
        src.dataloader.RCCDataset_h5(path, mode='val', transform=transform['val'])
    }

    my_loader = {
        x: DataLoader(my_dataset[x], batch_size=1, shuffle=True, num_workers=4)
        for x in ['train', 'val']
    }

    print('train size: ', len(my_loader['train']))
    print('val size: ', len(my_loader['val']))

    ### Some Checkers
    print('Summary: ')
    print('\ttrain size: ', len(my_loader['train']))
    print('\tval size: ', len(my_loader['val']))
    print('\tDatatype = ', next(iter(my_loader['train']))[0].dtype)
    print('\tMin = ', next(iter(my_loader['train']))[0].min())
    print('\tMax = ', next(iter(my_loader['train']))[0].max())
    print('\tInput size', next(iter(my_loader['train']))[0].shape)
    # print('\tweight = ', args.weight)

    ### Tensorboard Log Setup
    log_root_folder = "/data/larson2/RCC_dl/logs/"
    now = datetime.now()
    now = now.strftime("%Y%m%d-%H%M%S")
    logdir = os.path.join(
        log_root_folder,
        f"{now}_model_{args.model}_{args.prefix_name}_epoch_{args.epochs}_weight_{args.weight}_lr_{args.lr}_gamma_{args.gamma}_lrsche_{args.lr_scheduler}_{now}"
    )
    # os.makedirs(logdir)
    print(f'\tlogdir = {logdir}')
    writer = SummaryWriter(logdir)

    ### Model Selection
    device = torch.device(
        "cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu")
    model = src.model.TDNet()
    model = model.to(device)
    writer.add_graph(model, my_dataset['train'][0][0].to(device))
    print('\tCuda:', torch.cuda.is_available(), f'\n\tdevice = {device}')

    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.1)

    if args.lr_scheduler == "plateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               patience=3,
                                                               factor=.3,
                                                               threshold=1e-4,
                                                               verbose=True)
    elif args.lr_scheduler == "step":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=3,
                                                    gamma=args.gamma)

    pos_weight = torch.FloatTensor([args.weight]).to(device)
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    ### Ready?
    best_val_loss = float('inf')
    best_val_auc = float(0)
    best_model_wts = copy.deepcopy(model.state_dict())
    iteration_change_loss = 0
    t_start_training = time.time()

    ### Here we go
    for epoch in range(args.epochs):
        current_lr = get_lr(optimizer)
        t_start = time.time()
        epoch_loss = {'train': 0., 'val': 0.}
        epoch_corrects = {'train': 0., 'val': 0.}
        epoch_acc = 0.0
        epoch_AUC = 0.0

        for phase in ['train', 'val']:
            if phase == 'train':
                if args.lr_scheduler == "step":
                    scheduler.step()
                model.train()
            else:
                model.eval()

            running_losses = []
            running_corrects = 0.
            y_trues = []
            y_probs = []
            y_preds = []

            print('lr: ', current_lr)

            for i, (inputs, labels, header) in enumerate(my_loader[phase]):
                optimizer.zero_grad()

                inputs = inputs.to(device)
                labels = labels.to(device)

                # forward
                # track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs.float())  # raw logits
                    probs = torch.sigmoid(outputs)  # [0, 1] probability, shape = s * 1
                    preds = torch.round(probs)  # 0 or 1, shape = s * 1, prediction for each slice
                    pt_pred, _ = torch.mode(preds, 0)  # take majority vote, shape = 1, prediction for each patient
                    count0 = (preds == 0).sum().float()
                    count1 = (preds == 1).sum().float()
                    pt_prob = count1 / (preds.shape[0])

                    # convert label to slice level
                    loss = criterion(outputs, labels.repeat(inputs.shape[1], 1))  # inputs shape = 1*s*3*256*256

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # multiply loss by slice num per batch?
                running_losses.append(loss.item())  # * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

                y_trues.append(int(labels.item()))
                y_probs.append(pt_prob.item())  # use ratio to get probability
                y_preds.append(pt_pred.item())

                writer.add_scalar(f'{phase}/Loss', loss.item(),
                                  epoch * len(my_loader[phase]) + i)
                writer.add_pr_curve(f'{phase}/pr_curve', y_trues, y_probs, 0)

                if (i % args.log_every == 0) & (i > 0):
                    print(
                        'Epoch: {0}/{1} | Single batch number : {2}/{3} | avg loss:{4} | Acc: {5:.4f} | lr: {6}'
                        .format(epoch + 1, args.epochs, i,
                                len(my_loader[phase]),
                                np.round(np.mean(running_losses), 4),
                                (running_corrects / len(my_loader[phase])),
                                current_lr))

            # epoch statistics
            epoch_loss[phase] = np.round(np.mean(running_losses), 4)
            epoch_corrects[phase] = (running_corrects / len(my_loader[phase]))

            cm = confusion_matrix(y_trues, y_preds, labels=[0, 1])
            src.helper.print_cm(cm, ['0', '1'])
            sens, spec, acc = src.helper.compute_stats(y_trues, y_preds)
            print('sens: {:.4f}'.format(sens))
            print('spec: {:.4f}'.format(spec))
            print('acc: {:.4f}'.format(acc))
            print()

        print(
            '\nSummary train loss: {0} | val loss: {1} | train acc: {2:.4f} | val acc: {3:.4f}'
            .format(epoch_loss['train'], epoch_loss['val'],
                    epoch_corrects['train'], epoch_corrects['val']))
        print('-' * 30)
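# Illustrative only: src.helper.compute_stats is called above but its body is not part of this
# excerpt. Below is a minimal sketch of what it plausibly returns for binary labels/predictions
# (sensitivity = TP/(TP+FN), specificity = TN/(TN+FP), accuracy = (TP+TN)/total); the actual
# helper and its name may differ.
from sklearn.metrics import confusion_matrix


def compute_stats_sketch(y_trues, y_preds):
    tn, fp, fn, tp = confusion_matrix(y_trues, y_preds, labels=[0, 1]).ravel()
    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    acc = (tp + tn) / (tp + tn + fp + fn)
    return sens, spec, acc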
    )
    train_dataset = src.data.Dataset(train_examples, opt.n_context)

    # use global rank and world size to split the eval set over multiple GPUs
    eval_examples = src.data.load_data(
        opt.eval_data,
        global_rank=opt.global_rank,
        world_size=opt.world_size,
        maxload=opt.maxload
    )
    eval_dataset = src.data.Dataset(eval_examples, opt.n_context)

    if not checkpoint_exists and opt.model_path == "none":
        t5 = transformers.T5ForConditionalGeneration.from_pretrained(model_name)
        model = src.model.FiDT5(t5.config)
        model.load_t5(t5.state_dict())
        model = model.to(opt.local_rank)
        optimizer, scheduler = src.util.set_optim(opt, model)
        step, best_dev_em = 0, 0.0
    elif opt.model_path == "none":
        load_path = checkpoint_path / 'checkpoint' / 'latest'
        model, optimizer, scheduler, opt_checkpoint, step, best_dev_em = \
            src.util.load(model_class, load_path, opt, reset_params=False)
        logger.info(f"Model loaded from {load_path}")
    else:
        model, optimizer, scheduler, opt_checkpoint, step, best_dev_em = \
            src.util.load(model_class, opt.model_path, opt, reset_params=True)
        logger.info(f"Model loaded from {opt.model_path}")

    model.set_checkpoint(opt.use_checkpoint)

    if opt.is_distributed:
def evaluate_model(
    model,
    val_loader,
    device,
    epoch,
    num_epochs,
    writer,
    current_lr,
    log_every=20,
):
    _ = model.eval()
    model = model.to(device)

    y_trues = []
    y_logits = []
    y_probs = []
    y_preds = []
    loss_values = []

    criterion = torch.nn.BCEWithLogitsLoss()

    for i, (image, label, header) in enumerate(val_loader):
        image = image.to(device)
        label = label.to(device)

        outputs = model(image.float())
        loss = criterion(outputs, label)

        probs = torch.sigmoid(outputs)
        preds = torch.round(probs)

        loss_values.append(loss.item())
        y_trues.append(int(label.item()))
        y_logits.append(outputs.item())
        y_probs.append(probs.item())
        y_preds.append(preds.item())

        try:
            auc = metrics.roc_auc_score(y_trues, y_probs)
        except ValueError:
            # AUC is undefined until both classes have been seen
            auc = 0.5

        writer.add_scalar('Val/Loss', loss.item(), epoch * len(val_loader) + i)
        writer.add_scalar('Val/AUC', auc, epoch * len(val_loader) + i)

        if (i % log_every == 0) & (i > 0):
            print(
                '''[Epoch: {0} / {1} | Single batch number : {2} / {3} ] | avg val loss {4} | val auc : {5} | lr : {6}'''
                .format(epoch + 1, num_epochs, i, len(val_loader),
                        np.round(np.mean(loss_values), 4), np.round(auc, 4),
                        current_lr))

    cm = confusion_matrix(y_trues, y_preds, labels=[0, 1])
    print_cm(cm, ['0', '1'])
    sens, spec, acc = compute_stats(y_trues, y_preds)
    print('sens: {:.4f}'.format(sens))
    print('spec: {:.4f}'.format(spec))
    print('acc: {:.4f}'.format(acc))
    print()

    writer.add_scalar('Val/AUC_epoch', auc, epoch + i)

    val_loss_epoch = np.round(np.mean(loss_values), 4)
    val_auc_epoch = np.round(auc, 4)
    return val_loss_epoch, val_auc_epoch
def train_model(model,
                train_loader,
                device,
                epoch,
                num_epochs,
                optimizer,
                writer,
                current_lr,
                log_every=100,
                weight=1):
    _ = model.train()
    model = model.to(device)

    y_trues = []
    y_logits = []
    y_probs = []
    y_preds = []
    loss_values = []

    pos_weight = torch.FloatTensor([weight]).to(device)
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    for i, (image, label, header) in enumerate(train_loader):
        optimizer.zero_grad()

        image = image.to(device)
        label = label.to(device)

        outputs = model(image.float())
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()

        probs = torch.sigmoid(outputs)
        preds = torch.round(probs)

        loss_values.append(loss.item())
        y_trues.append(int(label.item()))
        y_logits.append(outputs.item())
        y_probs.append(probs.item())
        y_preds.append(preds.item())

        try:
            auc = metrics.roc_auc_score(y_trues, y_probs)
        except ValueError:
            # AUC is undefined until both classes have been seen
            auc = 0.5

        writer.add_scalar('Train/Loss', loss.item(), epoch * len(train_loader) + i)
        writer.add_scalar('Train/AUC', auc, epoch * len(train_loader) + i)

        if (i % log_every == 0) & (i > 0):
            print(
                '''[Epoch: {0} / {1} | Single batch number : {2} / {3} ] | avg train loss {4} | train auc : {5} | lr : {6}'''
                .format(epoch + 1, num_epochs, i, len(train_loader),
                        np.round(np.mean(loss_values), 4), np.round(auc, 4),
                        current_lr))

    cm = confusion_matrix(y_trues, y_preds, labels=[0, 1])
    print_cm(cm, ['0', '1'])
    sens, spec, acc = compute_stats(y_trues, y_preds)
    print('sens: {:.4f}'.format(sens))
    print('spec: {:.4f}'.format(spec))
    print('acc: {:.4f}'.format(acc))
    print()

    writer.add_scalar('Train/AUC_epoch', auc, epoch + i)

    train_loss_epoch = np.round(np.mean(loss_values), 4)
    train_auc_epoch = np.round(auc, 4)
    return train_loss_epoch, train_auc_epoch
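# Illustrative only: the training loops above read the current learning rate via
# get_lr(optimizer) / src.train3d.get_lr(optimizer), which is not shown in this excerpt.
# A minimal sketch that reads the rate from the optimizer's first parameter group follows;
# the repository's own helper may differ.
def get_lr_sketch(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']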