def main():
    args = get_args()
    rng = np.random.RandomState(1223)

    # Get context
    from nnabla.ext_utils import get_extension_context
    logger.info("Running in %s" % args.context)
    ctx = get_extension_context(args.context,
                                device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)

    # Evaluate every checkpoint in the model directory
    iterations = []
    mean_iou = []
    model_dir = args.model_load_path
    for filename in os.listdir(model_dir):
        args.model_load_path = os.path.join(model_dir, filename)
        miou = eval.validate(args)
        iterations.append(filename.split('.')[0])
        mean_iou.append(miou)

    # Strip the 'param_' prefix so only the iteration number remains
    for i in range(len(iterations)):
        iterations[i] = iterations[i].replace('param_', '')
    itr = list(map(int, iterations))

    # Plot Iterations vs. mIoU (plt.axis sets the axis limits; plt.axes expects a rect)
    plt.axis([0, max(itr), 0.0, 1.0])
    plt.xlabel('Iterations')
    plt.ylabel('Accuracy - mIOU')
    plt.scatter(itr, mean_iou)
    plt.show()

    print(iterations)
    print(mean_iou)

    with open('iterations.txt', 'w') as f:
        for item in iterations:
            f.write('%s\n' % item)
    with open('miou.txt', 'w') as f2:
        for item in mean_iou:
            f2.write('%s\n' % item)
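# A minimal follow-up sketch, assuming the 'iterations.txt' and 'miou.txt' files written
# by main() above already exist on disk. It only shows how those logged values can be
# read back and re-plotted without re-running validation; the file names come from the
# function above, everything else (function name, sorting) is illustrative.
import matplotlib.pyplot as plt


def replot_miou(iter_file='iterations.txt', miou_file='miou.txt'):
    # Read one value per line, as written by main()
    with open(iter_file) as f:
        iters = [int(line.strip()) for line in f if line.strip()]
    with open(miou_file) as f:
        mious = [float(line.strip()) for line in f if line.strip()]

    # Sort by iteration so the scatter reads left to right
    pairs = sorted(zip(iters, mious))
    xs, ys = zip(*pairs)

    plt.axis([0, max(xs), 0.0, 1.0])
    plt.xlabel('Iterations')
    plt.ylabel('Accuracy - mIOU')
    plt.scatter(xs, ys)
    plt.show()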
def train_kgatt(args: Args,
                kg_train: KnowledgeGraph,
                kg_test: KnowledgeGraph,
                kg_val: KnowledgeGraph,
                total_triplets=None):
    n_ent, n_rel = kg_train.n_ent, kg_train.n_rel
    if total_triplets is None:
        total_triplets = get_valid_triplets(kg_train, kg_test, kg_val)

    dataloader = DataLoader(kg_train,
                            batch_size=args.batch_size,
                            shuffle=False,
                            pin_memory=cuda.is_available())
    model = MultiHeadKGAtt(n_ent, n_rel, 100, 200, 100,
                           args.num_heads, device=args.device).to(args.device)

    if args.optimizer == 'adam':
        optimizer = Adam(model.parameters(), lr=args.lr, eps=1e-3)
    elif args.optimizer == 'sgd':
        optimizer = SGD(model.parameters(), lr=args.lr)
    elif args.optimizer == 'adamw':
        optimizer = AdamW(model.parameters(), lr=args.lr, eps=1e-3)
    else:
        raise ValueError(f'Unknown optimizer: {args.optimizer}')

    ent_embed, rel_embed = get_init_embed()
    ent_embed, rel_embed = ent_embed.to(args.device), rel_embed.to(args.device)

    loss = 0
    model.train()
    for epoch in range(args.n_epochs):
        losses = []
        ent_embeds = [0]
        rel_embeds = [0]
        for i, batch in enumerate(dataloader):
            triplets = torch.stack(batch)
            triplets, labels, nodes, edges = negative_sampling(
                triplets, n_ent, args.negative_rate)
            triplets, labels = triplets.to(args.device), labels.to(args.device)

            model.zero_grad()
            model.train()
            ent_embed_, rel_embed_ = model(triplets, ent_embed, rel_embed,
                                           nodes, edges)
            loss = loss_func2(triplets, args.negative_rate, ent_embed_,
                              rel_embed_, device=args.device)
            loss.backward()
            optimizer.step()

            losses.append(loss.item())
            ent_embeds[0] = ent_embed_
            rel_embeds[0] = rel_embed_

        loss = sum(losses) / len(losses)
        print(f'Epoch {epoch} Loss: {loss}')

        if epoch > 10:
            model.eval()
            # use the configured device rather than hard-coding 'cuda'
            validate(model, kg_val, total_triplets, 100, args.device)

    return loss
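# The negative_sampling() helper used in train_kgatt() is not shown in this file. Below
# is a self-contained, illustrative sketch of the common uniform-corruption scheme that
# returns the four values the call site expects (triplets, labels, nodes, edges). What
# 'nodes' and 'edges' mean in the real project is an assumption: here they are the
# unique entity ids of the positive batch and the positive triplets themselves.
import torch


def uniform_negative_sampling(pos_triplets: torch.Tensor, n_ent: int,
                              negative_rate: int):
    """pos_triplets: (B, 3) long tensor of (head, relation, tail)."""
    size = pos_triplets.size(0)
    # Repeat each positive triplet `negative_rate` times and corrupt the copies
    neg = pos_triplets.repeat(negative_rate, 1).clone()
    corrupt_ent = torch.randint(0, n_ent, (neg.size(0),))
    corrupt_head = torch.rand(neg.size(0)) < 0.5
    # Replace the head for roughly half of the samples and the tail for the rest
    neg[corrupt_head, 0] = corrupt_ent[corrupt_head]
    neg[~corrupt_head, 2] = corrupt_ent[~corrupt_head]

    triplets = torch.cat([pos_triplets, neg], dim=0)
    labels = torch.cat([torch.ones(size), torch.zeros(neg.size(0))])
    nodes = torch.unique(pos_triplets[:, [0, 2]])
    edges = pos_triplets
    return triplets, labels, nodes, edges


# Tiny usage example with 5 entities and 2 relations
pos = torch.tensor([[0, 1, 2], [3, 0, 4]])
t, l, n, e = uniform_negative_sampling(pos, n_ent=5, negative_rate=3)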
def train_net(model: UNet3D,
              device,
              loss_fnc=DiceLoss(sigmoid_normalization=False),
              eval_criterion=MeanIoU(),
              epochs=5,
              batch_size=1,
              learning_rate=0.0002,
              val_percent=0.04,
              test_percent=0.1,
              name='U-Net',
              save_cp=True,
              tests=None):
    data_set = BasicDataset(dir_img, dir_mask, 'T1', device)
    train_loader, val_loader, test_loader = data_set.split_to_loaders(
        val_percent, test_percent, batch_size, test_files=tests)

    writer = SummaryWriter(comment=f'LR_{learning_rate}_BS_{batch_size}')
    global_step = 0

    logging.info(f'''Starting {name} training:
        Epochs:          {epochs}
        Batch size:      {batch_size}
        Learning rate:   {learning_rate}
        Training size:   {len(train_loader)}
        Validation size: {len(val_loader)}
        Testing size:    {len(test_loader)}
        Checkpoints:     {save_cp}
        Device:          {device.type}
    ''')

    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=0.00001)

    losses = []
    val_scores = []
    for epoch in range(epochs):
        epoch_loss = 0
        for batch in train_loader:
            model.train()
            start_time = timeit.default_timer()

            img = batch['image']
            mask = batch['mask']

            masks_pred = model(img)
            loss = loss_fnc(masks_pred, mask)
            epoch_loss += loss.item()
            losses.append(loss.item())
            writer.add_scalar('Loss/train', loss.item(), global_step)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            global_step += 1
            elapsed = timeit.default_timer() - start_time
            logging.info(
                f'I: {global_step}, Loss: {loss.item()} in {elapsed} seconds')

            # Validate roughly five times per epoch; guard against a zero
            # interval when the training loader is very small.
            val_interval = max(1, len(train_loader) // (5 * batch_size))
            if global_step % val_interval == 0:
                val_score = validate(model, val_loader, loss_fnc, eval_criterion)
                val_scores.append(val_score)
                writer.add_scalar('Validation/test', val_score, global_step)

        if save_cp:
            try:
                os.mkdir(dir_checkpoint)
                logging.info('Created checkpoint directory')
            except OSError:
                pass
            torch.save(model.state_dict(),
                       dir_checkpoint + f'{name}_epoch{epoch + 1}.pth')
            logging.info(f'Epoch: {epoch + 1} Loss: {epoch_loss}')
            logging.info(f'Checkpoint {epoch + 1} saved !')

        plot_cost(losses, name='Loss' + str(epoch), model_name=name)
        plot_cost(val_scores, name='Validation' + str(epoch), model_name=name)

    writer.close()
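# The DiceLoss used as the default loss_fnc above is not defined in this file. This is
# a minimal sketch of the standard soft Dice loss, assuming the predictions are already
# probabilities when sigmoid_normalization=False; it illustrates the formulation only
# and is not the project's actual DiceLoss class.
import torch
import torch.nn as nn


class SoftDiceLoss(nn.Module):
    def __init__(self, sigmoid_normalization=True, eps=1e-6):
        super().__init__()
        self.sigmoid_normalization = sigmoid_normalization
        self.eps = eps

    def forward(self, pred, target):
        if self.sigmoid_normalization:
            pred = torch.sigmoid(pred)
        # Flatten everything except the batch dimension
        pred = pred.reshape(pred.size(0), -1)
        target = target.reshape(target.size(0), -1).float()
        intersection = (pred * target).sum(dim=1)
        denominator = pred.sum(dim=1) + target.sum(dim=1)
        dice = (2 * intersection + self.eps) / (denominator + self.eps)
        # Dice ranges over [0, 1]; subtract from 1 so lower is better
        return 1 - dice.mean()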
def train(loaders, dist, args):
    # Use the checkpoint model if no model name is given, otherwise init a new model
    if args.m is None:
        checkpoint = torch.load(args.checkpoint)
        model_name = checkpoint['model_name']
        model = Model(model_name)
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        model_name = args.m
        model = Model(model_name)

    # loss and device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # no epsilon needs to be added, each category has at least one sample
    dist = torch.FloatTensor(dist).to(device)
    if args.wl:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.CrossEntropyLoss(weight=1 / dist)

    data_parallel = False
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
        data_parallel = True
    model.to(device)

    path_to_best_model = ""

    # optimizer
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr, rho=0.95, eps=1e-08)

    best_loss = sys.maxsize
    early_stop = False
    num_checks = 0
    iternum = 1

    # epochs
    for epoch in range(args.epoch_num):
        epoch_loss = 0
        num_corrects = 0
        tbar = tqdm(loaders['train'])

        # iterate through images
        for i, (imgs, labels) in enumerate(tbar):
            model.train()
            imgs, labels = imgs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(imgs)
            _, preds = torch.max(outputs, 1)
            num_corrects += torch.sum(preds == labels.data)
            loss = criterion(outputs, labels)
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()

            # current training accuracy of epoch
            epoch_acc = num_corrects.double() / ((i + 1) * args.batch_size)
            tbar.set_description(
                'Epoch: [{}/{}], Epoch_loss: {:.5f}, Epoch_acc: {:.5f}'.format(
                    epoch + 1, args.epoch_num, epoch_loss / (i + 1), epoch_acc))

            # early stopping
            if iternum % args.num_iter_to_validate == 0:
                print("Validating model ...")
                if epoch > args.num_iter_to_validate:
                    print('Best validation loss: {}'.format(best_loss))
                val_loss, val_acc = validate(loaders['val'], model, device)

                # if we have the best model so far
                if val_loss < best_loss:
                    best_loss = val_loss
                    path_to_checkpoint = os.path.abspath(
                        os.path.join(args.checkpoint,
                                     f'model_{model_name}_epoch_{epoch}.pth'))
                    if path_to_best_model:
                        os.remove(path_to_best_model)
                    path_to_best_model = path_to_checkpoint
                    num_checks = 0
                    state_dict = model.module.state_dict(
                    ) if data_parallel else model.state_dict()
                    torch.save(
                        {
                            'model_state_dict': state_dict,
                            'model_name': model_name
                        }, path_to_checkpoint)
                else:
                    # else we increase patience; if patience reaches the limit we stop
                    num_checks += 1
                    if num_checks >= args.patience:
                        print("Early stopping ...")
                        early_stop = True

                print(
                    'Validation loss: {}\n Validation acc: {}'.format(
                        val_loss, val_acc),
                    'Number of checks: {}'.format(num_checks))

            if early_stop:
                break
            iternum += 1

        # propagate early stopping to the epoch loop as well
        if early_stop:
            break

    return model
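# train() above passes weight=1/dist to nn.CrossEntropyLoss. A small self-contained
# example of how such inverse-frequency weighting behaves: rarer classes get a larger
# weight, so mistakes on them contribute more to the loss. The class counts below are
# made up for illustration; the real `dist` comes from the caller.
import torch
import torch.nn as nn

counts = torch.FloatTensor([500., 50., 5.])   # samples per class (illustrative)
dist = counts / counts.sum()                  # class distribution
weighted_criterion = nn.CrossEntropyLoss(weight=1 / dist)

logits = torch.randn(8, 3)                    # batch of 8 predictions over 3 classes
labels = torch.randint(0, 3, (8,))
loss = weighted_criterion(logits, labels)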
def train(encoder,
          decoder,
          train_loader,
          val_loader,
          optimizer,
          criterion,
          id2word,
          lr_scheduler=None,
          num_epochs=1,
          print_every=100,
          device='cpu',
          early_stop=False):
    """
    Function for training

    Inputs:
    - encoder, decoder
    - train_loader, val_loader: DataLoader for training set and validation set
    - optimizer: a torch.optim optimizer (e.g. torch.optim.Adam(...))
    - criterion: loss function (e.g. nn.CrossEntropyLoss())
    - id2word: id2word for target training set
    - lr_scheduler: learning rate scheduler (e.g. torch.optim.lr_scheduler.StepLR)
    - num_epochs
    - print_every
    - device: 'cpu' or 'cuda'
    """
    encoder.train()
    decoder.train()

    best_bleu = 0
    best_statedict = {
        'encoder': encoder.state_dict(),
        'decoder': decoder.state_dict()
    }

    for epoch in range(num_epochs):
        print('Epoch ', epoch + 1)
        # Restore train mode in case validate() switched the models to eval mode
        encoder.train()
        decoder.train()

        for i, (x, y) in enumerate(train_loader):
            x = x.to(device=device, dtype=torch.long)
            y = y.to(device=device, dtype=torch.long)

            enc_out, enc_hidden = encoder(x)
            dec_hidden = enc_hidden
            dec_input = y[:, 0]

            loss = 0
            optimizer.zero_grad()
            # Teacher forcing: feed the ground-truth token at every step
            for t in range(1, y.size(1)):
                out, dec_hidden = decoder(dec_input, dec_hidden, enc_out)
                dec_input = y[:, t]
                loss += criterion(out.squeeze(1), y[:, t])
            loss.backward()
            optimizer.step()

            if i % print_every == 0:
                print('Iter %d, loss = %f' % (i, loss.item() / y.size(1)))

        if lr_scheduler is not None:
            lr_scheduler.step()

        bleu = validate(val_loader, encoder, decoder, id2word, device)
        print('Validation BLEU score: %f\n' % bleu)
        if bleu > best_bleu:
            best_statedict = {
                'encoder': encoder.state_dict(),
                'decoder': decoder.state_dict()
            }
            best_bleu = bleu
        elif early_stop:
            print('=== BLEU begins to decrease, training exits ===')
            return best_statedict

    return best_statedict
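# The loop above accumulates criterion() over decoding steps 1..T-1 (step 0 is the
# start token) and prints loss.item() / y.size(1). A tiny self-contained check of what
# that number means: each criterion() call is already a batch mean, so the sum divided
# by the number of decoded steps (T-1) is the exact average per-step loss, while
# dividing by T, as the print does, only approximates it. All tensors here are random.
import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
batch, T, vocab = 4, 6, 10
logits = torch.randn(batch, T, vocab)          # stand-in for decoder outputs
y = torch.randint(0, vocab, (batch, T))        # stand-in for target token ids

loss = 0
for t in range(1, T):
    loss = loss + criterion(logits[:, t], y[:, t])

print('printed value      :', loss.item() / T)        # as in the loop above
print('mean per-step loss :', loss.item() / (T - 1))  # exact per-step average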
def train_net(model: UNet3D,
              epochs=5,
              learning_rate=0.0002,
              val_percent=0.1,
              test_percent=0.1,
              name='U-Net',
              tests=None,
              patch_size=16,
              testing_memory=False,
              mask_model=False):
    data_set = BrainDataset(dir_img, 'T1', dir_mask,
                            stack_size=patch_size, mask_net=mask_model)
    loader = BrainLoaders(data_set,
                          ratios=[val_percent, test_percent],
                          files=[None, tests])
    train_loader = loader.train_loader()
    val_loader = loader.validation_loader()
    test_loader = loader.test_loader()

    num_images = data_set.num_files()
    log_interval = len(train_loader) if num_images < 10 else len(
        data_set.slices) * (num_images // 10)
    global_step = 0

    logging.info(f'''Starting {name} training:
        Epochs:          {epochs}
        Learning rate:   {learning_rate}
        Training size:   {len(train_loader)} slices
        Validation size: {len(val_loader)} images
        Testing size:    {len(test_loader)} images
        Log interval:    {log_interval}
    ''')

    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=0.00001)

    losses = []
    val_scores = {}
    for fnc in METRICS:
        val_scores[fnc] = []

    for epoch in range(epochs):
        epoch_loss = 0
        epoch_start_time = timeit.default_timer()
        log_start_time = timeit.default_timer()
        log_loss = RunningAverage()

        for batch in train_loader:
            model.train()
            img = batch['image']
            mask = batch['mask']

            masks_pred = model(img)
            loss = loss_fnc(masks_pred, mask)
            epoch_loss += loss.item()
            log_loss.update(loss.item(), n=1)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if testing_memory:
                # When testing patch sizes only one iteration is enough
                return

            global_step += 1
            if global_step % log_interval == 0:
                elapsed = timeit.default_timer() - log_start_time
                losses.append(log_loss.avg)
                logging.info(
                    f'I: {global_step}, Avg. Loss: {log_loss.avg} in {elapsed} seconds'
                )
                log_start_time = timeit.default_timer()
                log_loss = RunningAverage()

        scores = validate(model, loader, is_validation=True, loss_fnc=loss_fnc)
        for fnc in METRICS:
            val_scores[fnc].append(scores[fnc])

        make_dir(dir_checkpoint)
        torch.save(model.state_dict(),
                   dir_checkpoint + f'{name}_epoch{epoch + 1}.pth')

        elapsed = timeit.default_timer() - epoch_start_time
        logging.info(
            f'Epoch: {epoch + 1} Total Loss: {epoch_loss} in {elapsed} seconds'
        )
        logging.info(f'Checkpoint {epoch + 1} saved !')

        plot_cost(losses, name='Loss', model_name=name + str(epoch) + '_')
        for fnc in METRICS:
            plot_cost(val_scores[fnc],
                      name='Validation_' + type(fnc).__name__,
                      model_name=name + str(epoch) + '_')

    logging.info('Starting Testing')
    validate(model, loader, is_validation=False, loss_fnc=loss_fnc, quiet=False)
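# RunningAverage is used above via update(value, n=1) and .avg but is not defined in
# this file. A minimal sketch of a helper with that interface, written purely from how
# it is used here; the project's own class may differ.
class RunningAverage:
    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, value, n=1):
        # Accumulate `value` observed `n` times
        self.sum += value * n
        self.count += n

    @property
    def avg(self):
        return self.sum / self.count if self.count else 0.0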
def _eval_model(self, model: Model, writer: SummaryWriter, step, t,
                eval_title, results_dict):
    training = model.training
    model.eval()

    if t in self.config['schedule_simple']:
        t_idx = self.config['schedule_simple'].index(t)
    else:
        t_idx = len(self.config['schedule_simple']) - 1

    # for calculating total performance
    targets_total = []
    probs_total = []

    # Accuracy of each subset
    for order_i, t_i in enumerate(self.config['schedule_simple'][:t_idx + 1]):
        subset_name = t_i
        last_id = self.config['schedule_simple'][-1]  # XXX should be -1. -2 for debugging.
        subset = self.subsets[t_i]
        data = DataLoader(
            subset,
            batch_size=self.config['eval_batch_size'],
            num_workers=self.config['eval_num_workers'],
            collate_fn=self.collate_fn,
        )
        # results is a dict: {method: group_averagemeter_object}
        results, targets, probs = validate(subset_name, model, data,
                                           self.category_map, results_dict,
                                           last_id, self.split_cats_dict)
        targets_total.append(targets)
        probs_total.append(probs)

        if subset_name in results_dict:
            results_dict[subset_name].append(results)
        else:
            results_dict[subset_name] = [results]

        for metric in results.keys():
            results[metric].write_to_excel(
                os.path.join(writer.logdir,
                             'results_{}.xlsx'.format(metric)),
                sheet_name='task {}'.format(subset_name),
                column_name='task {}'.format(
                    self.config['schedule_simple'][t_idx]),
                info='avg')

    # =====================================================================
    # calculate scores for the trained tasks.
    prefix = 'tally_'  # prefix for tensorboard plotting and csv filename
    targets_total = torch.cat(targets_total, axis=0)
    probs_total = torch.cat(probs_total, axis=0)
    predicts_total = probs_total > 0.5  # BCE-style predictions

    total_metric = ['CP', 'CR', 'CF1', 'OP', 'OR', 'OF1', 'mAP']
    results = dict()  # reset results
    CP, CR, CF1, OP, OR, OF1, mAP = (AverageMeter()
                                     for _ in range(len(total_metric)))

    # ignore classes that only appear in future tasks
    ncats = targets_total.sum(axis=0)
    cats_in_task_idx = ncats > 0
    cats_in_task_name = self.category_map[cats_in_task_idx].tolist()

    # calculate scores
    precision_pc = torch.mean(
        precision_score_per_class(targets_total[:, cats_in_task_idx],
                                  predicts_total[:, cats_in_task_idx],
                                  zero_division=0))
    recall_pc = torch.mean(
        recall_score_per_class(targets_total[:, cats_in_task_idx],
                               predicts_total[:, cats_in_task_idx],
                               zero_division=0))
    # CF1. Note that CF1 is not a mean of the per-category F1 scores.
    f1_pc = ((2 * precision_pc * recall_pc) /
             (precision_pc + recall_pc)) if (precision_pc +
                                             recall_pc) > 0 else torch.tensor([0.])
    precision_oa = precision_score_overall(
        targets_total[:, cats_in_task_idx],
        predicts_total[:, cats_in_task_idx],
        zero_division=0)
    recall_oa = recall_score_overall(targets_total[:, cats_in_task_idx],
                                     predicts_total[:, cats_in_task_idx],
                                     zero_division=0)
    f1_oa = f1_score_overall(targets_total[:, cats_in_task_idx],
                             predicts_total[:, cats_in_task_idx],
                             zero_division=0)
    map_ = mean_average_precision(targets_total[:, cats_in_task_idx],
                                  probs_total[:, cats_in_task_idx])

    # save to AverageMeter
    CP.update(precision_pc.item())
    CR.update(recall_pc.item())
    CF1.update(f1_pc.item())
    OP.update(precision_oa.item())
    OR.update(recall_oa.item())
    OF1.update(f1_oa.item())
    mAP.update(map_.item())

    results[prefix + 'CP'] = CP
    results[prefix + 'CR'] = CR
    results[prefix + 'CF1'] = CF1
    results[prefix + 'OP'] = OP
    results[prefix + 'OR'] = OR
    results[prefix + 'OF1'] = OF1
    results[prefix + 'mAP'] = mAP

    # for reporting major, moderate, minor category performances
    for report_name in self.split_cats_dict.keys():
        reporter = Group_AverageMeter()

        # get report category indices
        all_cats = self.category_map.tolist()
        task_cats = set(cats_in_task_name)
        report_cats = task_cats & set(self.split_cats_dict[report_name])
        report_cats_idx = torch.tensor(
            [all_cats.index(cat) for cat in report_cats], dtype=torch.long)

        # CP, CR, CF1 performance of the report categories.
        _class_precision = precision_score_per_class(
            targets_total[:, report_cats_idx],
            predicts_total[:, report_cats_idx],
            zero_division=0)
        _class_recall = recall_score_per_class(
            targets_total[:, report_cats_idx],
            predicts_total[:, report_cats_idx],
            zero_division=0)
        _class_precision = torch.mean(_class_precision)
        _class_recall = torch.mean(_class_recall)
        # CF1 bias: note that CF1 is not a mean of the per-category F1 scores.
        _class_f1 = ((2 * _class_precision * _class_recall) /
                     (_class_precision + _class_recall)) \
            if (_class_precision + _class_recall) > 0 else torch.tensor([0.])

        # OP, OR, OF1 performance of the report categories.
        _overall_precision = precision_score_overall(
            targets_total[:, report_cats_idx],
            predicts_total[:, report_cats_idx],
            zero_division=0)
        _overall_recall = recall_score_overall(
            targets_total[:, report_cats_idx],
            predicts_total[:, report_cats_idx],
            zero_division=0)
        _overall_f1 = f1_score_overall(targets_total[:, report_cats_idx],
                                       predicts_total[:, report_cats_idx],
                                       zero_division=0)

        # mAP performance of the report categories.
        _mAP = mean_average_precision(targets_total[:, report_cats_idx],
                                      probs_total[:, report_cats_idx])

        reporter.update(['CP'], [_class_precision.item()], [1])
        reporter.update(['CR'], [_class_recall.item()], [1])
        reporter.update(['CF1'], [_class_f1.item()], [1])
        reporter.update(['OP'], [_overall_precision.item()], [1])
        reporter.update(['OR'], [_overall_recall.item()], [1])
        reporter.update(['OF1'], [_overall_f1.item()], [1])
        reporter.update(['mAP'], [_mAP.item()], [1])
        reporter.total.reset()
        results[prefix + report_name] = reporter

    # write to tensorboard and csv.
    task_len = t_idx + 1
    for metric in results.keys():
        if metric not in [
                prefix + 'CP', prefix + 'CR', prefix + 'OP', prefix + 'OR'
        ]:
            results[metric].write(
                writer,
                '%s/%s/%s/task_len(%d)' %
                (metric, eval_title, self.name, task_len),
                step,
                info='avg')
        results[metric].write_to_excel(
            os.path.join(writer.logdir, 'results_{}.xlsx'.format(metric)),
            sheet_name=prefix,
            column_name='task {}'.format(
                self.config['schedule_simple'][t_idx]),
            info='avg')

    # =====================================================================
    # print performances at the end
    if t_idx == len(self.config['schedule_simple']) - 1:
        src = writer.logdir
        csv_files = ['major', 'moderate', 'minor', 'OF1', 'CF1', 'mAP',
                     prefix + 'major', prefix + 'moderate', prefix + 'minor',
                     prefix + 'CF1', prefix + 'OF1', prefix + 'mAP',
                     'forget']
        for csv_file in csv_files:
            try:
                csv = pd.read_csv(os.path.join(
                    src, 'results_{}.csv'.format(csv_file)),
                                  index_col=0)
                # print performance after training the last task
                pd.set_option('display.max_rows', None)
                print(
                    colorful.bold_green(
                        '\n{:10} result'.format(csv_file)).styled_string)
                print(csv.round(4).iloc[:, -1])

                # save as txt
                with open(os.path.join(src, 'summary.txt'),
                          'a') as summary_txt:
                    summary_txt.write('\n')
                    summary_txt.write('{:10} result\n'.format(csv_file))
                    summary_txt.write(csv.round(4).iloc[:, -1].to_string())
                    summary_txt.write('\n')
            except FileNotFoundError:
                print("This experiment doesn't have a {} file! Continuing.".
                      format(csv_file))
                continue

    model.train(training)
    return results_dict
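# The comments in _eval_model() stress that CF1 is computed from the averaged per-class
# precision and recall, not as the mean of per-class F1 scores. A tiny made-up numeric
# example showing that the two quantities differ:
p = [0.9, 0.1]                      # per-class precision (illustrative)
r = [0.8, 0.5]                      # per-class recall (illustrative)

mean_p = sum(p) / len(p)            # 0.5
mean_r = sum(r) / len(r)            # 0.65
cf1 = 2 * mean_p * mean_r / (mean_p + mean_r)                       # ~0.565
mean_f1 = sum(2 * pi * ri / (pi + ri) for pi, ri in zip(p, r)) / 2  # ~0.507

print(f'CF1 from averaged P/R: {cf1:.3f}, mean of per-class F1: {mean_f1:.3f}')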