def validate(validation_loader, model, loss_fn, device, print_frequency=2, curr_epoch=1, column_split_order=[]):
    history = {
        'loss': [],
        'accuracy': [],
        'batch_time': [],
        'classification_metrics': None,
        'confusion_matrix': None
    }
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    progress = ProgressMeter(
        len(validation_loader),
        [batch_time, data_time, losses, top1],
        prefix="Epoch: [{}]".format(curr_epoch))

    # switch to evaluate mode
    model.eval()
    conf_matrix = None
    if len(column_split_order) > 0:
        conf_matrix = ConfusionMatrix(column_split_order)

    with torch.no_grad():
        # https://github.com/pytorch/pytorch/issues/16417#issuecomment-566654504
        end = time.time()
        for i, (input_ids, attention_mask, labels) in enumerate(validation_loader):
            # measure data loading time
            data_time.update(time.time() - end)
            input_ids = input_ids.to(device, non_blocking=True)
            attention_mask = attention_mask.to(device, non_blocking=True)
            labels = torch.argmax(labels, dim=1).to(device, non_blocking=True)

            # compute output
            output = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(output, labels)

            # measure accuracy and record loss
            acc1 = accuracy(output, labels, conf_matrix=conf_matrix)
            losses.update(loss.item(), input_ids.size(0))
            top1.update(acc1[0].tolist()[0], input_ids.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_frequency == 0:
                progress.display(i)

    history['accuracy'].append(float(top1.avg))
    history['loss'].append(float(losses.avg))
    history['batch_time'].append(float(batch_time.avg))
    if conf_matrix is not None:
        history['classification_metrics'] = conf_matrix.get_all_metrics()
        history['confusion_matrix'] = str(conf_matrix)
    return history
def train(model: nn.Module, dataset: Dataset, validate_data: Dataset = None) -> None:
    loader = DataLoader(dataset, batch_size=dataset.BATCH_SIZE, shuffle=True)
    optimizer = getattr(torch.optim, config.TRAIN.OPTIMIZER)(model.parameters(),
                                                             **config.TRAIN.OPTIM_PARAMS)
    overall_iter = 0
    evaluation = ConfusionMatrix(dataset.get_num_class())

    model.train()
    for epoch in range(config.TRAIN.NUM_EPOCHS):
        total_loss = 0
        for batch_idx, samples in enumerate(loader):
            images, target = device([samples['image'], samples['mask']], gpu=config.USE_GPU)
            outputs = model(images)['out']
            output_mask = outputs.argmax(1)

            batch_loss = Loss.cross_entropy2D(outputs, target, False)
            total_loss += batch_loss.item()
            overall_loss = total_loss / (batch_idx + 1)
            evaluation.update(output_mask, target)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            if batch_idx % config.PRINT_BATCH_FREQ == 0:
                metrics = evaluation()
                logger.info(f'Train Epoch: {epoch}, {batch_idx}')
                logger.info(
                    f'Batch loss: {batch_loss.item():.6f}, Overall loss: {overall_loss:.6f}'
                )
                for met in beautify(metrics[0]):
                    logger.info(f'{met}')
                logger.info('Classwise IoU')
                for met in beautify(metrics[1]):
                    logger.info(f'{met}')
                logger.info("\n")

            overall_iter += 1
            if config.SAVE_ITER_FREQ and overall_iter % config.SAVE_ITER_FREQ == 0:
                torch.save(
                    model.state_dict(),
                    os.path.join(config.LOG_PATH, config.NAME + f"-iter={overall_iter}"))
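# The `device(...)` helper used above is defined elsewhere in that project. A minimal sketch,
# assuming it only moves the given tensors to the GPU when gpu=True and returns them in order
# (name and behavior are assumptions, not the original implementation):
import torch

def device(tensors, gpu=False):
    # Hypothetical helper: move each tensor to CUDA when requested, otherwise keep it on CPU.
    target = torch.device('cuda') if gpu and torch.cuda.is_available() else torch.device('cpu')
    return [t.to(target) for t in tensors]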
def train_dat1():
    # data = getD1Linear()
    # XTrain, YTrain, XTest, YTest = next(data.getStratifiedKFold())
    # AllX, AllY = data.get()
    data = getD2()
    XTrain, YTrain = data.get("train")
    XTest, YTest = data.get("test")
    AllX, AllY = data.get()

    # layers = [Layer(2, 4), Layer(4, 1)]
    # model = Network(layers, [sigmoid, sigmoid], [dSigmoid, dSigmoid], euclideanLoss, dEuclideanLoss)
    layers = [Layer(64, 40), Layer(40, 10)]
    activations = [acti.sigmoid, acti.softmax]
    dActivations = [acti.dSigmoid, acti.dLinearUnit]
    model = Network(layers, activations, dActivations,
                    lossFuncs.crossEntropyLoss,
                    lossFuncs.dCrossEntropyLoss_dSoftmax,
                    lossFuncs.crossEntropyPredict)

    epochs = 10000
    learningRate = 0.01
    regLambda = 0
    loss = {"train": [0.0] * epochs, "test": [0.0] * epochs}
    YTrainOneHot = make_one_hot(YTrain)
    YTestOneHot = make_one_hot(YTest)

    for epoch in tqdm(range(epochs)):
        # model.fit(XTrain, YTrain, learningRate, regLambda)
        model.fit(XTrain, YTrainOneHot, learningRate, regLambda)
        loss["train"][epoch] = model.getLoss(XTrain, YTrainOneHot, regLambda)
        loss["test"][epoch] = model.getLoss(XTest, YTestOneHot, regLambda)
        if (epoch + 1) % 1000 == 0:
            print("epoch {0}: ltrain: {1:0.6f}, ltest: {2:0.6f}".format(
                epoch + 1, loss["train"][epoch], loss["test"][epoch]))

    pred_xtest = model.predict(XTest)
    cm = ConfusionMatrix(YTest, pred_xtest, is_one_hot=False)
    print(cm.output_metrics())

    plt.plot([i for i in range(epochs)], loss["train"], label="train")
    plt.plot([i for i in range(epochs)], loss["test"], label="test")
    plt.legend()
    plt.title("Loss Plot")
    plt.xlabel("Epoch")
    plt.show()
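# `make_one_hot` is defined elsewhere in that project. A minimal sketch, assuming the labels
# are integer class indices and NumPy is in use:
import numpy as np

def make_one_hot(labels, num_classes=None):
    # Hypothetical helper: turn integer class labels into a one-hot matrix.
    labels = np.asarray(labels, dtype=int).ravel()
    if num_classes is None:
        num_classes = int(labels.max()) + 1
    one_hot = np.zeros((labels.size, num_classes))
    one_hot[np.arange(labels.size), labels] = 1.0
    return one_hot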
def build_metrics(self, num_classes: int, class_labels=None, k: int = 5,
                  additional_metrics: list = None) -> MetricsContainer:
    """
    Build different metrics. Metrics that are always included are learning rate, loss and accuracy.

    :param num_classes: Number of classes for the current dataset
    :param class_labels: A list of length 'num_classes' with class labels
    :param k: k for top-k accuracy; if k <= 1: don't include top-k accuracy
    :param additional_metrics: additional metrics in a list
    :return: container that holds all metrics
    """
    # https://medium.com/data-science-in-your-pocket/calculating-precision-recall-for-multi-class-classification-9055931ee229
    # https://towardsdatascience.com/multi-class-metrics-made-simple-part-i-precision-and-recall-9250280bddc2?gi=a28f7efba99e
    is_eval = self.session_type == "evaluation"
    metrics_list = []
    if not is_eval:
        metrics_list.append(Mean("training_loss"))
    metrics_list.append(Mean("validation_loss"))
    if not is_eval:
        metrics_list.append(MultiClassAccuracy("training_accuracy"))
    metrics_list.append(MultiClassAccuracy("validation_accuracy"))
    conf_a = ConfusionMatrix(num_classes, "validation_confusion", class_labels=class_labels)
    conf_b = ConfusionMatrix(num_classes, "training_confusion", class_labels=class_labels)
    conf_a.write_to_summary_interval = 1 if is_eval else 5
    conf_b.write_to_summary_interval = 1 if is_eval else 5
    if not is_eval:
        metrics_list.append(conf_b)
    metrics_list.append(conf_a)
    # if not is_eval:
    #     chart = AccuracyBarChart(num_classes, "train_val_diff", class_labels)
    #     chart.write_to_summary_interval = 5
    #     metrics_list.append(chart)
    if k > 1:
        if not is_eval:
            metrics_list.append(TopKAccuracy(f"training_top{k}_accuracy", k=k))
        metrics_list.append(TopKAccuracy(f"validation_top{k}_accuracy", k=k))
    if additional_metrics:
        metrics_list.extend(additional_metrics)
    if not is_eval:
        metrics_list.append(SimpleMetric("lr"))
    return MetricsContainer(metrics_list)
def _run_evaluate(self, test_data):
    pr_curve_data = []
    cm = ConfusionMatrix(self.classes)
    accuracy_list = []
    # test_iter = self._create_iter(test_data, self.config.wbatchsize,
    #                               random_shuffler=utils.identity_fun)
    test_iter = self.chunks(test_data)
    for test_batch in test_iter:
        test_batch = batch_utils.seq_pad_concat(test_batch, -1)
        pred, acc = self._predict_batch(cm, test_batch)
        accuracy_list.append(acc)
        pr_curve_data.append(
            (F.softmax(pred, -1)[:, 1].data, test_batch.labels.data))
    accuracy = 100 * (sum(accuracy_list) / len(test_data))
    return cm, accuracy, pr_curve_data
def _run_evaluate(self, test_data, addn_test):
    pr_curve_data = []
    cm = ConfusionMatrix(self.classes)
    accuracy_list = []
    # test_iter = self._create_iter(test_data, self.config.wbatchsize,
    #                               random_shuffler=utils.identity_fun)
    test_iter = self.chunks(test_data)
    for batch_index, test_batch in enumerate(test_iter):
        addn_batch = addn_test[batch_index * 15:(batch_index + 1) * 15]
        test_batch = batch_utils.seq_pad_concat(test_batch, -1)
        # print(addn_batch.shape)
        try:
            pred, acc = self._predict_batch(cm, test_batch, addn_batch)
        except Exception:
            # skip batches where prediction fails
            continue
        accuracy_list.append(acc)
        pr_curve_data.append(
            (F.softmax(pred, -1)[:, 1].data, test_batch.labels.data))
    accuracy = 100 * (sum(accuracy_list) / len(test_data))
    return cm, accuracy, pr_curve_data
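# Both `_run_evaluate` variants iterate `self.chunks(test_data)`. A minimal sketch of such a
# helper, assuming fixed-size chunks (the size of 15 is inferred from the slicing of addn_test
# above and is an assumption):
def chunks(self, data, size=15):
    # Hypothetical helper: yield successive fixed-size slices of the evaluation examples.
    for start in range(0, len(data), size):
        yield data[start:start + size]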
def update_settings(self): self.sord = SORDLoss(n_classes=self.hparams.num_classes, masking=self.hparams.masking, ranks=self.hparams.ranks, dist=self.hparams.dist, alpha = self.hparams.dist_alpha) self.ce = nn.CrossEntropyLoss(ignore_index=-1) self.kl = KLLoss(n_classes=self.hparams.num_classes, masking=self.hparams.masking) self.loss = CompareLosses(n_classes=self.hparams.num_classes, masking=self.hparams.masking, ranks=self.hparams.ranks, dist=self.hparams.dist, returnloss="kl") self.dist = Mistakes(ranks=self.hparams.ranks) # self.IoU = IoU(num_classes=self.hparams.num_classes, ignore_index=self.hparams.ignore_index) self.hparams.labels_orig = set(range(self.hparams.num_classes)) self.hparams.labels_orig = list(self.hparams.labels_orig) self.IoU = MaskedIoU(labels=self.hparams.labels_orig) self.num_cls = 3 if self.hparams.mode == "convert" else self.hparams.num_classes self.hparams.labels_conv = set(range(self.num_cls)) self.hparams.labels_conv = list(self.hparams.labels_conv) self.CM = ConfusionMatrix(labels=self.hparams.labels_conv) # self.IoU_conv = IoU(num_classes=self.num_cls, ignore_index=0) self.IoU_conv = MaskedIoU(labels=self.hparams.labels_conv) self.result_folder = f"results/{self.hparams.dataset}/" self.hparams.save_prefix = f"{timestamp}-{self.hparams.dataset}-c{self.hparams.num_classes}-{self.hparams.loss}" if self.hparams.loss == "sord": self.hparams.save_prefix += f'-{",".join([str(r) for r in self.hparams.ranks])}' self.hparams.save_prefix += f'-a{self.hparams.dist_alpha}-{self.hparams.dist}' if self.hparams.loss_weight: self.hparams.save_prefix += "-lw" self.hparams.save_prefix += f'-{",".join(self.hparams.modalities)}' logger.info(self.hparams.save_prefix) if self.hparams.save_xp is None: create_folder(f"{self.result_folder}/viz_per_epoch") create_folder(f"{self.result_folder}/gt") create_folder(f"{self.result_folder}/orig") if self.hparams.loss == "compare": create_folder(f"results/loss_weight/{self.hparams.dataset}")
def _run_epoch(self, train_data, dev_data, unlabel_data, addn_data, addn_data_unlab, addn_dev, ek, ek_t, ek_u, graph_embs, graph_embs_t, graph_embs_u): addn_dev.cuda() ek_t.cuda() graph_embs_t.cuda() report_stats = utils.Statistics() cm = ConfusionMatrix(self.classes) _, seq_data = list(zip(*train_data)) total_seq_words = len(list(itertools.chain.from_iterable(seq_data))) iter_per_epoch = (1.5 * total_seq_words) // self.config.wbatchsize self.encoder.train() self.clf.train() train_iter = self._create_iter(train_data, self.config.wbatchsize) unlabel_iter = self._create_iter(unlabel_data, self.config.wbatchsize_unlabel) sofar = 0 sofar_1 = 0 for batch_index, train_batch_raw in enumerate(train_iter): seq_iter = list(zip(*train_batch_raw))[1] seq_words = len(list(itertools.chain.from_iterable(seq_iter))) report_stats.n_words += seq_words self.global_steps += 1 # self.enc_clf_opt.zero_grad() if self.config.add_noise: train_batch_raw = add_noise(train_batch_raw, self.config.noise_dropout, self.config.random_permutation) train_batch = batch_utils.seq_pad_concat(train_batch_raw, -1) train_embedded = self.embedder(train_batch) memory_bank_train, enc_final_train = self.encoder( train_embedded, train_batch) if self.config.lambda_vat > 0 or self.config.lambda_ae > 0 or self.config.lambda_entropy: try: unlabel_batch_raw = next(unlabel_iter) except StopIteration: unlabel_iter = self._create_iter( unlabel_data, self.config.wbatchsize_unlabel) unlabel_batch_raw = next(unlabel_iter) if self.config.add_noise: unlabel_batch_raw = add_noise( unlabel_batch_raw, self.config.noise_dropout, self.config.random_permutation) unlabel_batch = batch_utils.seq_pad_concat( unlabel_batch_raw, -1) unlabel_embedded = self.embedder(unlabel_batch) memory_bank_unlabel, enc_final_unlabel = self.encoder( unlabel_embedded, unlabel_batch) addn_batch_unlab = retAddnBatch(addn_data_unlab, memory_bank_unlabel.shape[0], sofar_1).cuda() ek_batch_unlab = retAddnBatch(ek_u, memory_bank_unlabel.shape[0], sofar_1).cuda() graph_embs_unlab = retAddnBatch(graph_embs_u, memory_bank_unlabel.shape[0], sofar_1).cuda() sofar_1 += addn_batch_unlab.shape[0] if sofar_1 >= ek_u.shape[0]: sofar_1 = 0 addn_batch = retAddnBatch(addn_data, memory_bank_train.shape[0], sofar).cuda() ek_batch = retAddnBatch(ek, memory_bank_train.shape[0], sofar).cuda() graph_embs_batch = retAddnBatch(graph_embs, memory_bank_train.shape[0], sofar).cuda() sofar += addn_batch.shape[0] if sofar >= ek.shape[0]: sofar = 0 pred = self.clf(memory_bank_train, addn_batch, ek_batch, enc_final_train, graph_embs_batch) accuracy = self.get_accuracy(cm, pred.data, train_batch.labels.data) lclf = self.clf_loss(pred, train_batch.labels) lat = Variable( torch.FloatTensor([-1.]).type(batch_utils.FLOAT_TYPE)) lvat = Variable( torch.FloatTensor([-1.]).type(batch_utils.FLOAT_TYPE)) if self.config.lambda_at > 0: lat = at_loss( self.embedder, self.encoder, self.clf, train_batch, addn_batch, ek_batch, graph_embs_batch, perturb_norm_length=self.config.perturb_norm_length) if self.config.lambda_vat > 0: lvat_train = vat_loss( self.embedder, self.encoder, self.clf, train_batch, addn_batch, ek_batch, graph_embs_batch, p_logit=pred, perturb_norm_length=self.config.perturb_norm_length) if self.config.inc_unlabeled_loss: if memory_bank_unlabel.shape[0] != ek_batch_unlab.shape[0]: print( f'Skipping; Unequal Shapes: {memory_bank_unlabel.shape} and {ek_batch_unlab.shape}' ) continue else: lvat_unlabel = vat_loss( self.embedder, self.encoder, self.clf, unlabel_batch, addn_batch_unlab, ek_batch_unlab, 
graph_embs_unlab, p_logit=self.clf(memory_bank_unlabel, addn_batch_unlab, ek_batch_unlab, enc_final_unlabel, graph_embs_unlab), perturb_norm_length=self.config.perturb_norm_length ) if self.config.unlabeled_loss_type == "AvgTrainUnlabel": lvat = 0.5 * (lvat_train + lvat_unlabel) elif self.config.unlabeled_loss_type == "Unlabel": lvat = lvat_unlabel else: lvat = lvat_train lentropy = Variable( torch.FloatTensor([-1.]).type(batch_utils.FLOAT_TYPE)) if self.config.lambda_entropy > 0: lentropy_train = entropy_loss(pred) if self.config.inc_unlabeled_loss: lentropy_unlabel = entropy_loss( self.clf(memory_bank_unlabel, addn_batch_unlab, ek_batch_unlab, enc_final_unlabel, graph_embs_unlab)) if self.config.unlabeled_loss_type == "AvgTrainUnlabel": lentropy = 0.5 * (lentropy_train + lentropy_unlabel) elif self.config.unlabeled_loss_type == "Unlabel": lentropy = lentropy_unlabel else: lentropy = lentropy_train lae = Variable( torch.FloatTensor([-1.]).type(batch_utils.FLOAT_TYPE)) if self.config.lambda_ae > 0: lae = self.ae(memory_bank_unlabel, enc_final_unlabel, unlabel_batch.sent_len, unlabel_batch_raw) ltotal = (self.config.lambda_clf * lclf) + \ (self.config.lambda_ae * lae) + \ (self.config.lambda_at * lat) + \ (self.config.lambda_vat * lvat) + \ (self.config.lambda_entropy * lentropy) report_stats.clf_loss += lclf.data.cpu().numpy() report_stats.at_loss += lat.data.cpu().numpy() report_stats.vat_loss += lvat.data.cpu().numpy() report_stats.ae_loss += lae.data.cpu().numpy() report_stats.entropy_loss += lentropy.data.cpu().numpy() report_stats.n_sent += len(pred) report_stats.n_correct += accuracy self.enc_clf_opt.zero_grad() ltotal.backward() params_list = self._get_trainabe_modules() # Excluding embedder form norm constraint when AT or VAT if not self.config.normalize_embedding: params_list += list(self.embedder.parameters()) norm = torch.nn.utils.clip_grad_norm(params_list, self.config.max_norm) report_stats.grad_norm += norm self.enc_clf_opt.step() if self.config.scheduler == "ExponentialLR": self.scheduler.step() self.ema_embedder.apply(self.embedder.named_parameters()) self.ema_encoder.apply(self.encoder.named_parameters()) self.ema_clf.apply(self.clf.named_parameters()) report_func(self.epoch, batch_index, iter_per_epoch, self.time_s, report_stats, self.config.report_every, self.logger) if self.global_steps % self.config.eval_steps == 0: cm_, accuracy, prc_dev = self._run_evaluate( dev_data, addn_dev, ek_t, graph_embs_t) self.logger.info( "- dev accuracy {} | best dev accuracy {} ".format( accuracy, self.best_accuracy)) self.writer.add_scalar("Dev_Accuracy", accuracy, self.global_steps) pred_, lab_ = zip(*prc_dev) pred_ = torch.cat(pred_) lab_ = torch.cat(lab_) self.writer.add_pr_curve("Dev PR-Curve", lab_, pred_, self.global_steps) pprint.pprint(cm_) pprint.pprint(cm_.get_all_metrics()) if accuracy > self.best_accuracy: self.logger.info("- new best score!") self.best_accuracy = accuracy self._save_model() if self.config.scheduler == "ReduceLROnPlateau": self.scheduler.step(accuracy) self.encoder.train() # self.embedder.train() self.clf.train() if self.config.weight_decay > 0: print(">> Square Norm: %1.4f " % self._get_l2_norm_loss()) cm, train_accuracy, _ = self._run_evaluate(train_data, addn_data, ek, graph_embs) self.logger.info("- Train accuracy {}".format(train_accuracy)) pprint.pprint(cm.get_all_metrics()) cm, dev_accuracy, _ = self._run_evaluate(dev_data, addn_dev, ek_t, graph_embs_t) self.logger.info("- Dev accuracy {} | best dev accuracy {}".format( dev_accuracy, 
self.best_accuracy)) pprint.pprint(cm.get_all_metrics()) self.writer.add_scalars("Overall_Accuracy", { "Train_Accuracy": train_accuracy, "Dev_Accuracy": dev_accuracy }, self.global_steps) return dev_accuracy
def main(): file_name = sys.argv[1] dirname = os.path.dirname(__file__) file_name = os.path.join(dirname, file_name) d = DataSet(file_name) d.loadDataSet() to_remove = [ d.data_set[0].index('Index'), d.data_set[0].index('First Name'), d.data_set[0].index('Last Name'), d.data_set[0].index('Birthday'), d.data_set[0].index('Best Hand'), d.data_set[0].index('Hogwarts House'), # Tests 7/10/18 d.data_set[0].index('Arithmancy'), d.data_set[0].index('Defense Against the Dark Arts'), d.data_set[0].index('Divination'), d.data_set[0].index('Muggle Studies'), d.data_set[0].index('History of Magic'), d.data_set[0].index('Transfiguration'), d.data_set[0].index('Potions'), d.data_set[0].index('Care of Magical Creatures'), d.data_set[0].index('Charms'), d.data_set[0].index('Flying'), ] X = np.array([[ d.data_set[i][j] for j in range(len(d.data_set[0])) if j not in to_remove ] for i in range(len(d.data_set))]) X = convert_to_float(X[1:, ]) y_col_nb = d.data_set[0].index('Hogwarts House') y = np.array(d.extractColumn(y_col_nb)[1:]) m = MeanImputation(X) m.train() m.transform() sc = Scaling(X) sc.train() sc.transform() sp = SplitTrainTest(X, y) sp.Split() X_train = sp.X_train y_train = sp.y_train X_test = sp.X_test y_test = sp.y_test l = LogisticRegression(X=X_train, y=y_train, optimizer='sgd', optimizer_params={ 'alpha': 0.5, 'n': 5, 'batch_size': 16 }) l.train() y_predicted = l.predict() cm1 = ConfusionMatrix(y_train, y_predicted) cm1.getMatrix() print('\n\n') print( '**************** Confusion Matrix on the training set ****************' ) print('\n') cm1.Print() y_predicted = l.predict(X_test) cm2 = ConfusionMatrix(y_test, y_predicted, cm1.unique_labels) cm2.getMatrix() print('\n\n') print( '**************** Confusion Matrix on the testing set ****************' ) print('\n') cm2.Print()
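# `convert_to_float` is defined elsewhere. Since its output is passed to MeanImputation, a
# reasonable sketch maps empty cells to NaN so they can be imputed afterwards (this is an
# assumption about the original helper):
import numpy as np

def convert_to_float(arr):
    # Hypothetical helper: cast string entries to float, turning empty cells into NaN.
    out = np.empty(arr.shape, dtype=float)
    for idx, value in np.ndenumerate(arr):
        out[idx] = float(value) if value not in ('', None) else np.nan
    return out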
def evaluate(cfg, ckpt_dir=None, use_gpu=False, use_mpio=False, **kwargs): np.set_printoptions(precision=5, suppress=True) startup_prog = fluid.Program() test_prog = fluid.Program() dataset = SegDataset( file_list=cfg.DATASET.VAL_FILE_LIST, mode=ModelPhase.EVAL, data_dir=cfg.DATASET.DATA_DIR) def data_generator(): #TODO: check is batch reader compatitable with Windows if use_mpio: data_gen = dataset.multiprocess_generator( num_processes=cfg.DATALOADER.NUM_WORKERS, max_queue_size=cfg.DATALOADER.BUF_SIZE) else: data_gen = dataset.generator() for b in data_gen: yield b[0], b[1], b[2] data_loader, avg_loss, pred, grts, masks = build_model( test_prog, startup_prog, phase=ModelPhase.EVAL, arch=kwargs['arch']) data_loader.set_sample_generator( data_generator, drop_last=False, batch_size=cfg.BATCH_SIZE) # Get device environment places = fluid.cuda_places() if use_gpu else fluid.cpu_places() place = places[0] dev_count = len(places) print("#Device count: {}".format(dev_count)) exe = fluid.Executor(place) exe.run(startup_prog) test_prog = test_prog.clone(for_test=True) ckpt_dir = cfg.TEST.TEST_MODEL if not ckpt_dir else ckpt_dir if not os.path.exists(ckpt_dir): raise ValueError('The TEST.TEST_MODEL {} is not found'.format(ckpt_dir)) if ckpt_dir is not None: print('load test model:', ckpt_dir) try: fluid.load(test_prog, os.path.join(ckpt_dir, 'model'), exe) except: fluid.io.load_params(exe, ckpt_dir, main_program=test_prog) # Use streaming confusion matrix to calculate mean_iou np.set_printoptions( precision=4, suppress=True, linewidth=160, floatmode="fixed") conf_mat = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True) fetch_list = [avg_loss.name, pred.name, grts.name, masks.name] num_images = 0 step = 0 all_step = cfg.DATASET.TEST_TOTAL_IMAGES // cfg.BATCH_SIZE + 1 timer = Timer() timer.start() data_loader.start() while True: try: step += 1 loss, pred, grts, masks = exe.run( test_prog, fetch_list=fetch_list, return_numpy=True) loss = np.mean(np.array(loss)) num_images += pred.shape[0] conf_mat.calculate(pred, grts, masks) _, iou = conf_mat.mean_iou() _, acc = conf_mat.accuracy() speed = 1.0 / timer.elapsed_time() print( "[EVAL]step={} loss={:.5f} acc={:.4f} IoU={:.4f} step/sec={:.2f} | ETA {}" .format(step, loss, acc, iou, speed, calculate_eta(all_step - step, speed))) timer.restart() sys.stdout.flush() except fluid.core.EOFException: break category_iou, avg_iou = conf_mat.mean_iou() category_acc, avg_acc = conf_mat.accuracy() print("[EVAL]#image={} acc={:.4f} IoU={:.4f}".format( num_images, avg_acc, avg_iou)) print("[EVAL]Category IoU:", category_iou) print("[EVAL]Category Acc:", category_acc) print("[EVAL]Kappa:{:.4f}".format(conf_mat.kappa())) return category_iou, avg_iou, category_acc, avg_acc
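# The PaddleSeg-style snippets above and below log an ETA computed by
# `calculate_eta(remaining_steps, speed)`. A minimal sketch, assuming it simply formats the
# remaining time as hh:mm:ss:
def calculate_eta(remain_step, speed):
    # Hypothetical helper: estimate remaining wall time from steps left and steps per second.
    if speed <= 0:
        return '--:--:--'
    remain_time = int(remain_step / speed)
    hours, rest = divmod(remain_time, 3600)
    minutes, seconds = divmod(rest, 60)
    return '{:0>2}:{:0>2}:{:0>2}'.format(hours, minutes, seconds)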
def train(cfg): startup_prog = fluid.Program() train_prog = fluid.Program() test_prog = fluid.Program() if args.enable_ce: startup_prog.random_seed = 1000 train_prog.random_seed = 1000 drop_last = True dataset = SegDataset( file_list=cfg.DATASET.TRAIN_FILE_LIST, mode=ModelPhase.TRAIN, shuffle=True, data_dir=cfg.DATASET.DATA_DIR) def data_generator(): if args.use_mpio: data_gen = dataset.multiprocess_generator( num_processes=cfg.DATALOADER.NUM_WORKERS, max_queue_size=cfg.DATALOADER.BUF_SIZE) else: data_gen = dataset.generator() batch_data = [] for b in data_gen: batch_data.append(b) if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS): for item in batch_data: yield item[0], item[1], item[2] batch_data = [] # If use sync batch norm strategy, drop last batch if number of samples # in batch_data is less then cfg.BATCH_SIZE to avoid NCCL hang issues if not cfg.TRAIN.SYNC_BATCH_NORM: for item in batch_data: yield item[0], item[1], item[2] # Get device environment gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace() places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places() # Get number of GPU dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places) print_info("#Device count: {}".format(dev_count)) # Make sure BATCH_SIZE can divided by GPU cards assert cfg.BATCH_SIZE % dev_count == 0, ( 'BATCH_SIZE:{} not divisble by number of GPUs:{}'.format( cfg.BATCH_SIZE, dev_count)) # If use multi-gpu training mode, batch data will allocated to each GPU evenly batch_size_per_dev = cfg.BATCH_SIZE // dev_count print_info("batch_size_per_dev: {}".format(batch_size_per_dev)) data_loader, avg_loss, lr, pred, grts, masks = build_model( train_prog, startup_prog, phase=ModelPhase.TRAIN) build_model(test_prog, fluid.Program(), phase=ModelPhase.EVAL) data_loader.set_sample_generator( data_generator, batch_size=batch_size_per_dev, drop_last=drop_last) exe = fluid.Executor(place) exe.run(startup_prog) exec_strategy = fluid.ExecutionStrategy() # Clear temporary variables every 100 iteration if args.use_gpu: exec_strategy.num_threads = fluid.core.get_cuda_device_count() exec_strategy.num_iteration_per_drop_scope = 100 build_strategy = fluid.BuildStrategy() if cfg.NUM_TRAINERS > 1 and args.use_gpu: dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog) exec_strategy.num_threads = 1 if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu: if dev_count > 1: # Apply sync batch norm strategy print_info("Sync BatchNorm strategy is effective.") build_strategy.sync_batch_norm = True else: print_info( "Sync BatchNorm strategy will not be effective if GPU device" " count <= 1") compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel( loss_name=avg_loss.name, exec_strategy=exec_strategy, build_strategy=build_strategy) # Resume training begin_epoch = cfg.SOLVER.BEGIN_EPOCH if cfg.TRAIN.RESUME_MODEL_DIR: begin_epoch = load_checkpoint(exe, train_prog) # Load pretrained model elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR): load_pretrained_weights(exe, train_prog, cfg.TRAIN.PRETRAINED_MODEL_DIR) else: print_info( 'Pretrained model dir {} not exists, training from scratch...'. 
format(cfg.TRAIN.PRETRAINED_MODEL_DIR)) fetch_list = [avg_loss.name, lr.name] if args.debug: # Fetch more variable info and use streaming confusion matrix to # calculate IoU results if in debug mode np.set_printoptions( precision=4, suppress=True, linewidth=160, floatmode="fixed") fetch_list.extend([pred.name, grts.name, masks.name]) cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True) if args.use_vdl: if not args.vdl_log_dir: print_info("Please specify the log directory by --vdl_log_dir.") exit(1) from visualdl import LogWriter log_writer = LogWriter(args.vdl_log_dir) # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0)) # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) step = 0 all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True: all_step += 1 all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1) avg_loss = 0.0 best_mIoU = 0.0 timer = Timer() timer.start() if begin_epoch > cfg.SOLVER.NUM_EPOCHS: raise ValueError( ("begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format( begin_epoch, cfg.SOLVER.NUM_EPOCHS)) if args.use_mpio: print_info("Use multiprocess reader") else: print_info("Use multi-thread reader") for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1): data_loader.start() while True: try: if args.debug: # Print category IoU and accuracy to check whether the # traning process is corresponed to expectation loss, lr, pred, grts, masks = exe.run( program=compiled_train_prog, fetch_list=fetch_list, return_numpy=True) cm.calculate(pred, grts, masks) avg_loss += np.mean(np.array(loss)) step += 1 if step % args.log_steps == 0: speed = args.log_steps / timer.elapsed_time() avg_loss /= args.log_steps category_acc, mean_acc = cm.accuracy() category_iou, mean_iou = cm.mean_iou() print_info(( "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}" ).format(epoch, step, lr[0], avg_loss, mean_acc, mean_iou, speed, calculate_eta(all_step - step, speed))) print_info("Category IoU: ", category_iou) print_info("Category Acc: ", category_acc) if args.use_vdl: log_writer.add_scalar('Train/mean_iou', mean_iou, step) log_writer.add_scalar('Train/mean_acc', mean_acc, step) log_writer.add_scalar('Train/loss', avg_loss, step) log_writer.add_scalar('Train/lr', lr[0], step) log_writer.add_scalar('Train/step/sec', speed, step) sys.stdout.flush() avg_loss = 0.0 cm.zero_matrix() timer.restart() else: # If not in debug mode, avoid unnessary log and calculate loss, lr = exe.run( program=compiled_train_prog, fetch_list=fetch_list, return_numpy=True) avg_loss += np.mean(np.array(loss)) step += 1 if step % args.log_steps == 0 and cfg.TRAINER_ID == 0: avg_loss /= args.log_steps speed = args.log_steps / timer.elapsed_time() print(( "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}" ).format(epoch, step, lr[0], avg_loss, speed, calculate_eta(all_step - step, speed))) if args.use_vdl: log_writer.add_scalar('Train/loss', avg_loss, step) log_writer.add_scalar('Train/lr', lr[0], step) log_writer.add_scalar('Train/speed', speed, step) sys.stdout.flush() avg_loss = 0.0 timer.restart() # NOTE : used for benchmark, profiler tools if args.is_profiler and epoch == 1 and step == args.log_steps: profiler.start_profiler("All") elif args.is_profiler and epoch == 1 and step == args.log_steps + 5: profiler.stop_profiler("total", args.profiler_path) return except fluid.core.EOFException: data_loader.reset() break except Exception as e: print(e) if (epoch % 
cfg.TRAIN.SNAPSHOT_EPOCH == 0 or epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0: ckpt_dir = save_checkpoint(train_prog, epoch) save_infer_program(test_prog, ckpt_dir) if args.do_eval: print("Evaluation start") _, mean_iou, _, mean_acc = evaluate( cfg=cfg, ckpt_dir=ckpt_dir, use_gpu=args.use_gpu, use_mpio=args.use_mpio) if args.use_vdl: log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step) log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step) if mean_iou > best_mIoU: best_mIoU = mean_iou update_best_model(ckpt_dir) print_info("Save best model {} to {}, mIoU = {:.4f}".format( ckpt_dir, os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'), mean_iou)) # Use VisualDL to visualize results if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None: visualize( cfg=cfg, use_gpu=args.use_gpu, vis_file_list=cfg.DATASET.VIS_FILE_LIST, vis_dir="visual", ckpt_dir=ckpt_dir, log_writer=log_writer) # save final model if cfg.TRAINER_ID == 0: ckpt_dir = save_checkpoint(train_prog, 'final') save_infer_program(test_prog, ckpt_dir)
def evaluate(ckpt_dir=None): np.set_printoptions(precision=5, suppress=True) startup_prog = fluid.Program() test_prog = fluid.Program() dataset = SegDataset(file_list=cfg["val_list"], mode=ModelPhase.EVAL, data_dir=cfg["data_dir"]) def data_generator(): data_gen = dataset.generator() for b in data_gen: yield b[0], b[1], b[2] data_loader, avg_loss, pred, grts, masks = build_model( test_prog, startup_prog, phase=ModelPhase.EVAL) data_loader.set_sample_generator(data_generator, drop_last=False, batch_size=cfg["batch_size"]) places = fluid.cuda_places() place = places[0] dev_count = len(places) print("#Device count: {}".format(dev_count)) exe = fluid.Executor(place) exe.run(startup_prog) test_prog = test_prog.clone(for_test=True) fluid.load(test_prog, os.path.join(ckpt_dir, 'model'), exe) #fluid.io.load_params(exe, ckpt_dir, main_program=test_prog) np.set_printoptions(precision=4, suppress=True, linewidth=160, floatmode="fixed") conf_mat = ConfusionMatrix(20, streaming=True) fetch_list = [avg_loss.name, pred.name, grts.name, masks.name] num_images = 0 step = 0 all_step = cfg["test_images"] // cfg["batch_size"] + 1 timer = Timer() timer.start() data_loader.start() while True: try: step += 1 loss, pred, grts, masks = exe.run(test_prog, fetch_list=fetch_list, return_numpy=True) loss = np.mean(np.array(loss)) num_images += pred.shape[0] conf_mat.calculate(pred, grts, masks) _, iou = conf_mat.mean_iou() _, acc = conf_mat.accuracy() speed = 1.0 / timer.elapsed_time() print( "[EVAL]step={} loss={:.5f} acc={:.4f} IoU={:.4f} step/sec={:.2f} | ETA {}" .format(step, loss, acc, iou, speed, calculate_eta(all_step - step, speed))) timer.restart() sys.stdout.flush() except fluid.core.EOFException: break category_iou, avg_iou = conf_mat.mean_iou() category_acc, avg_acc = conf_mat.accuracy() print("[EVAL]#image={} acc={:.4f} IoU={:.4f}".format( num_images, avg_acc, avg_iou)) print("[EVAL]Category IoU:", category_iou) print("[EVAL]Category Acc:", category_acc) print("[EVAL]Kappa:{:.4f}".format(conf_mat.kappa())) return category_iou, avg_iou, category_acc, avg_acc
def evaluate(cfg, ckpt_dir=None, use_gpu=False, vis=False, vis_dir='vis_out/test_public', use_mpio=False, **kwargs): np.set_printoptions(precision=5, suppress=True) startup_prog = fluid.Program() test_prog = fluid.Program() dataset = SegDataset( file_list=cfg.DATASET.VAL_FILE_LIST, mode=ModelPhase.EVAL, data_dir=cfg.DATASET.DATA_DIR) fls = [] with open(cfg.DATASET.VAL_FILE_LIST) as fr: for line in fr.readlines(): fls.append(line.strip().split(' ')[0]) if vis: assert cfg.VIS.VISINEVAL is True if not os.path.exists(vis_dir): os.makedirs(vis_dir) def data_generator(): #TODO: check is batch reader compatitable with Windows if use_mpio: data_gen = dataset.multiprocess_generator( num_processes=cfg.DATALOADER.NUM_WORKERS, max_queue_size=cfg.DATALOADER.BUF_SIZE) else: data_gen = dataset.generator() for b in data_gen: if cfg.DATASET.INPUT_IMAGE_NUM == 1: yield b[0], b[1], b[2] else: yield b[0], b[1], b[2], b[3] data_loader, avg_loss, pred, grts, masks = build_model( test_prog, startup_prog, phase=ModelPhase.EVAL) data_loader.set_sample_generator( data_generator, drop_last=False, batch_size=cfg.BATCH_SIZE) # Get device environment places = fluid.cuda_places() if use_gpu else fluid.cpu_places() place = places[0] dev_count = len(places) print("#Device count: {}".format(dev_count)) exe = fluid.Executor(place) exe.run(startup_prog) test_prog = test_prog.clone(for_test=True) ckpt_dir = cfg.TEST.TEST_MODEL if not ckpt_dir else ckpt_dir if not os.path.exists(ckpt_dir): raise ValueError('The TEST.TEST_MODEL {} is not found'.format(ckpt_dir)) if ckpt_dir is not None: print('load test model:', ckpt_dir) try: fluid.load(test_prog, os.path.join(ckpt_dir, 'model'), exe) except: fluid.io.load_params(exe, ckpt_dir, main_program=test_prog) # Use streaming confusion matrix to calculate mean_iou np.set_printoptions( precision=4, suppress=True, linewidth=160, floatmode="fixed") class_num = cfg.DATASET.NUM_CLASSES conf_mat = ConfusionMatrix(class_num, streaming=True) fetch_list = [avg_loss.name, pred.name, grts.name, masks.name] num_images = 0 step = 0 all_step = cfg.DATASET.TEST_TOTAL_IMAGES // cfg.BATCH_SIZE + 1 timer = Timer() timer.start() data_loader.start() cnt = 0 while True: try: step += 1 loss, pred, grts, masks = exe.run( test_prog, fetch_list=fetch_list, return_numpy=True) if vis: preds = np.array(pred, dtype=np.float32) for j in range(preds.shape[0]): if cnt > len(fls): continue name = fls[cnt].split('/')[-1].split('.')[0] p = np.squeeze(preds[j]) np.save(os.path.join(vis_dir, name + '.npy'), p) cnt += 1 print('vis %d npy... (%d tif sample)' % (cnt, cnt//36)) continue loss = np.mean(np.array(loss)) num_images += pred.shape[0] conf_mat.calculate(pred, grts, masks) _, iou = conf_mat.mean_iou() _, acc = conf_mat.accuracy() fwiou = conf_mat.frequency_weighted_iou() speed = 1.0 / timer.elapsed_time() print( "[EVAL]step={} loss={:.5f} acc={:.4f} IoU={:.4f} FWIoU={:.4f} step/sec={:.2f} | ETA {}" .format(step, loss, acc, iou, fwiou, speed, calculate_eta(all_step - step, speed))) timer.restart() sys.stdout.flush() except fluid.core.EOFException: break if vis: return category_iou, avg_iou = conf_mat.mean_iou() category_acc, avg_acc = conf_mat.accuracy() fwiou = conf_mat.frequency_weighted_iou() print("[EVAL]#image={} acc={:.4f} IoU={:.4f} FWIoU={:.4f}".format( num_images, avg_acc, avg_iou, fwiou)) print("[EVAL]Category Acc:", category_acc) print("[EVAL]Category IoU:", category_iou) print("[EVAL]Kappa: {:.4f}".format(conf_mat.kappa())) return category_iou, avg_iou, category_acc, avg_acc
from unet_rcnn import UnetRCNN
from metrics import Accuracy, Precision, Recall, AvgMeterWrapper, ConfusionMatrix, MetricWrapper, Auc, MultiLabelIoU, MultiLabelAccuracy
from loss import RgLoss
from dataset import RgLoader

__all__ = ['model_dict', 'loader_dict', 'loss_dict', 'metric_dict']

model_dict = {
    'UnetRCNN': UnetRCNN,
}

loader_dict = {'RgLoader': RgLoader}

loss_dict = {'RgLoss': RgLoss}

metric_dict = {
    'cls': {
        'Accuracy': AvgMeterWrapper(Accuracy()),
        'Specificity & Recall': AvgMeterWrapper(Recall()),
        'NPV & Precision': AvgMeterWrapper(Precision()),
        'AUC': MetricWrapper(metric=Auc(), idx=1),
        'Confusion_Matrix': ConfusionMatrix(2)
    },
    'seg': {
        'IoU': AvgMeterWrapper(MultiLabelIoU())
    },
    'kf': {
        'Localization_Accuracy': AvgMeterWrapper(MultiLabelAccuracy())
    },
}
def predict_worker(proc_id, output_file, classes, model_params, batch_size, que, lock,
                   status_que, gpu_id=0, evaluate=True, framework='mxnet'):
    """
    Get data from the batch loader and make predictions; predictions are saved in output_file.
    If evaluate is set, recall, precision, f1_score and recall_top5 are computed.
    """
    logging.info('Predictor #{}: Loading model...'.format(proc_id))
    model = load_model(proc_id, model_params, batch_size, classes, gpu_id, framework=framework)
    if model is None:
        status_que.put('Error')
        raise ValueError('No model created! Exit')
    logging.info('Predictor #{}: Model loaded'.format(proc_id))
    status_que.put('OK')

    if evaluate:
        from metrics import F1, ConfusionMatrix, MisClassified, RecallTopK
        evaluator = F1(len(classes))
        misclassified = MisClassified(len(classes))
        cm = ConfusionMatrix(classes)
        recall_topk = RecallTopK(len(classes), top_k=5)

    f = open(output_file, 'w')
    batch_idx = 0
    logging.info('Predictor #{} starts'.format(proc_id))
    start = time.time()
    while True:
        # get a batch from data loader via a queue
        lock.acquire()
        batch = que.get()
        lock.release()
        if batch == 'FINISH':
            logging.info('Predictor #{} has received all batches, exit'.format(proc_id))
            break

        # predict
        im_names, batch, gt_list = batch
        logging.debug('Predictor #{}: predict'.format(proc_id))
        pred, prob = model.predict(batch)
        pred_labels, top_probs = model.get_label_prob(top_k=5)

        # write prediction to file
        for im_name, label, top_prob in zip(im_names, pred_labels, top_probs):
            if im_name is None:
                continue
            top_prob = [str(p) for p in top_prob]
            f.write('{} labels:{} prob:{}\n'.format(im_name, ','.join(label), ','.join(top_prob)))

        # update metrics if evaluation mode is set
        if evaluate:
            assert gt_list is not None and gt_list != [] and gt_list[0] is not None
            top1_int = [p[0] for p in pred]
            assert len(top1_int) == len(gt_list), '{} != {}'.format(len(top1_int), len(gt_list))
            evaluator.update(top1_int, gt_list)
            misclassified.update(top1_int, gt_list, prob, im_names)
            cm.update(top1_int, gt_list)
            top5_int = [p[:5] for p in pred]
            assert len(top5_int) == len(gt_list), '{} != {}'.format(len(top5_int), len(gt_list))
            recall_topk.update(top5_int, gt_list)

        batch_idx += 1
        if batch_idx % 50 == 0 and batch_idx != 0:
            elapsed = time.time() - start
            logging.info('Predictor #{}: Tested {} batches of {} images, elapsed {}s'.format(
                proc_id, batch_idx, batch_size, elapsed))

    # evaluation after prediction if set
    if evaluate:
        logging.info('Evaluating...')
        recall, precision, f1_score = evaluator.get()
        for rec, prec, f1, cls in zip(recall, precision, f1_score, classes):
            print('Class {:<20}: recall: {:<12}, precision: {:<12}, f1 score: {:<12}'.format(
                cls, rec, prec, f1))
            f.write('Class {:<20}: recall: {:<12}, precision: {:<12}, f1 score: {:<12}\n'.format(
                cls, rec, prec, f1))
        topk_recall = recall_topk.get()
        for rec, cls in zip(topk_recall, classes):
            print('Class {:<20}: recall-top-5: {:<12}'.format(cls, rec))
            f.write('Class {:<20}: recall-top-5: {:<12}\n'.format(cls, rec))

        # write false positives and false negatives to separate files
        fp_images, fn_images = misclassified.get()
        g = open(output_file + '.fp', 'w')
        for cls, fp_cls in zip(classes, fp_images):
            for fp in fp_cls:
                g.write('{} pred:{} prob:{} gt:{} prob:{}\n'.format(
                    fp[0], cls, fp[2], classes[fp[1]], fp[3]))
        g.close()
        g = open(output_file + '.fn', 'w')
        for cls, fn_cls in zip(classes, fn_images):
            for fn in fn_cls:
                g.write('{} gt:{} prob:{} pred:{} prob:{}\n'.format(
                    fn[0], cls, fn[3], classes[fn[1]], fn[2]))
        g.close()

        cm.normalize()
        plt_name = output_file + '_cm.jpg'
        cm.draw(plt_name)
    f.close()
def train(cfg): # startup_prog = fluid.Program() # train_prog = fluid.Program() drop_last = True dataset = SegDataset( file_list=cfg.DATASET.TRAIN_FILE_LIST, mode=ModelPhase.TRAIN, shuffle=True, data_dir=cfg.DATASET.DATA_DIR) def data_generator(): if args.use_mpio: data_gen = dataset.multiprocess_generator( num_processes=cfg.DATALOADER.NUM_WORKERS, max_queue_size=cfg.DATALOADER.BUF_SIZE) else: data_gen = dataset.generator() batch_data = [] for b in data_gen: batch_data.append(b) if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS): for item in batch_data: yield item[0], item[1], item[2] batch_data = [] # If use sync batch norm strategy, drop last batch if number of samples # in batch_data is less then cfg.BATCH_SIZE to avoid NCCL hang issues if not cfg.TRAIN.SYNC_BATCH_NORM: for item in batch_data: yield item[0], item[1], item[2] # Get device environment # places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places() # place = places[0] gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace() places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places() # Get number of GPU dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places) print_info("#Device count: {}".format(dev_count)) # Make sure BATCH_SIZE can divided by GPU cards assert cfg.BATCH_SIZE % dev_count == 0, ( 'BATCH_SIZE:{} not divisble by number of GPUs:{}'.format( cfg.BATCH_SIZE, dev_count)) # If use multi-gpu training mode, batch data will allocated to each GPU evenly batch_size_per_dev = cfg.BATCH_SIZE // dev_count print_info("batch_size_per_dev: {}".format(batch_size_per_dev)) data_loader, loss, lr, pred, grts, masks, image = build_model( phase=ModelPhase.TRAIN) data_loader.set_sample_generator( data_generator, batch_size=batch_size_per_dev, drop_last=drop_last) exe = fluid.Executor(place) cfg.update_from_file(args.teacher_cfg_file) # teacher_arch = teacher_cfg.architecture teacher_program = fluid.Program() teacher_startup_program = fluid.Program() with fluid.program_guard(teacher_program, teacher_startup_program): with fluid.unique_name.guard(): _, teacher_loss, _, _, _, _, _ = build_model( teacher_program, teacher_startup_program, phase=ModelPhase.TRAIN, image=image, label=grts, mask=masks) exe.run(teacher_startup_program) teacher_program = teacher_program.clone(for_test=True) ckpt_dir = cfg.SLIM.KNOWLEDGE_DISTILL_TEACHER_MODEL_DIR assert ckpt_dir is not None print('load teacher model:', ckpt_dir) if os.path.exists(ckpt_dir): try: fluid.load(teacher_program, os.path.join(ckpt_dir, 'model'), exe) except: fluid.io.load_params(exe, ckpt_dir, main_program=teacher_program) # cfg = load_config(FLAGS.config) cfg.update_from_file(args.cfg_file) data_name_map = { 'image': 'image', 'label': 'label', 'mask': 'mask', } merge(teacher_program, fluid.default_main_program(), data_name_map, place) distill_pairs = [[ 'teacher_bilinear_interp_2.tmp_0', 'bilinear_interp_0.tmp_0' ]] def distill(pairs, weight): """ Add 3 pairs of distillation losses, each pair of feature maps is the input of teacher and student's yolov3_loss respectively """ loss = l2_loss(pairs[0][0], pairs[0][1]) weighted_loss = loss * weight return weighted_loss distill_loss = distill(distill_pairs, 0.1) cfg.update_from_file(args.cfg_file) optimizer = solver.Solver(None, None) all_loss = loss + distill_loss lr = optimizer.optimise(all_loss) exe.run(fluid.default_startup_program()) exec_strategy = fluid.ExecutionStrategy() # Clear temporary variables every 100 iteration if 
args.use_gpu: exec_strategy.num_threads = fluid.core.get_cuda_device_count() exec_strategy.num_iteration_per_drop_scope = 100 build_strategy = fluid.BuildStrategy() build_strategy.fuse_all_reduce_ops = False build_strategy.fuse_all_optimizer_ops = False build_strategy.fuse_elewise_add_act_ops = True if cfg.NUM_TRAINERS > 1 and args.use_gpu: dist_utils.prepare_for_multi_process(exe, build_strategy, fluid.default_main_program()) exec_strategy.num_threads = 1 if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu: if dev_count > 1: # Apply sync batch norm strategy print_info("Sync BatchNorm strategy is effective.") build_strategy.sync_batch_norm = True else: print_info( "Sync BatchNorm strategy will not be effective if GPU device" " count <= 1") compiled_train_prog = fluid.CompiledProgram( fluid.default_main_program()).with_data_parallel( loss_name=all_loss.name, exec_strategy=exec_strategy, build_strategy=build_strategy) # Resume training begin_epoch = cfg.SOLVER.BEGIN_EPOCH if cfg.TRAIN.RESUME_MODEL_DIR: begin_epoch = load_checkpoint(exe, fluid.default_main_program()) # Load pretrained model elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR): load_pretrained_weights(exe, fluid.default_main_program(), cfg.TRAIN.PRETRAINED_MODEL_DIR) else: print_info( 'Pretrained model dir {} not exists, training from scratch...'. format(cfg.TRAIN.PRETRAINED_MODEL_DIR)) #fetch_list = [avg_loss.name, lr.name] fetch_list = [ loss.name, 'teacher_' + teacher_loss.name, distill_loss.name, lr.name ] if args.debug: # Fetch more variable info and use streaming confusion matrix to # calculate IoU results if in debug mode np.set_printoptions( precision=4, suppress=True, linewidth=160, floatmode="fixed") fetch_list.extend([pred.name, grts.name, masks.name]) cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True) if args.use_vdl: if not args.vdl_log_dir: print_info("Please specify the log directory by --vdl_log_dir.") exit(1) from visualdl import LogWriter log_writer = LogWriter(args.vdl_log_dir) # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0)) # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) step = 0 all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True: all_step += 1 all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1) avg_loss = 0.0 avg_t_loss = 0.0 avg_d_loss = 0.0 best_mIoU = 0.0 timer = Timer() timer.start() if begin_epoch > cfg.SOLVER.NUM_EPOCHS: raise ValueError( ("begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format( begin_epoch, cfg.SOLVER.NUM_EPOCHS)) if args.use_mpio: print_info("Use multiprocess reader") else: print_info("Use multi-thread reader") for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1): data_loader.start() while True: try: if args.debug: # Print category IoU and accuracy to check whether the # traning process is corresponed to expectation loss, lr, pred, grts, masks = exe.run( program=compiled_train_prog, fetch_list=fetch_list, return_numpy=True) cm.calculate(pred, grts, masks) avg_loss += np.mean(np.array(loss)) step += 1 if step % args.log_steps == 0: speed = args.log_steps / timer.elapsed_time() avg_loss /= args.log_steps category_acc, mean_acc = cm.accuracy() category_iou, mean_iou = cm.mean_iou() print_info(( "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}" ).format(epoch, step, lr[0], avg_loss, mean_acc, mean_iou, speed, calculate_eta(all_step - step, speed))) print_info("Category IoU: ", category_iou) print_info("Category Acc: ", 
category_acc) if args.use_vdl: log_writer.add_scalar('Train/mean_iou', mean_iou, step) log_writer.add_scalar('Train/mean_acc', mean_acc, step) log_writer.add_scalar('Train/loss', avg_loss, step) log_writer.add_scalar('Train/lr', lr[0], step) log_writer.add_scalar('Train/step/sec', speed, step) sys.stdout.flush() avg_loss = 0.0 cm.zero_matrix() timer.restart() else: # If not in debug mode, avoid unnessary log and calculate loss, t_loss, d_loss, lr = exe.run( program=compiled_train_prog, fetch_list=fetch_list, return_numpy=True) avg_loss += np.mean(np.array(loss)) avg_t_loss += np.mean(np.array(t_loss)) avg_d_loss += np.mean(np.array(d_loss)) step += 1 if step % args.log_steps == 0 and cfg.TRAINER_ID == 0: avg_loss /= args.log_steps avg_t_loss /= args.log_steps avg_d_loss /= args.log_steps speed = args.log_steps / timer.elapsed_time() print(( "epoch={} step={} lr={:.5f} loss={:.4f} teacher loss={:.4f} distill loss={:.4f} step/sec={:.3f} | ETA {}" ).format(epoch, step, lr[0], avg_loss, avg_t_loss, avg_d_loss, speed, calculate_eta(all_step - step, speed))) if args.use_vdl: log_writer.add_scalar('Train/loss', avg_loss, step) log_writer.add_scalar('Train/lr', lr[0], step) log_writer.add_scalar('Train/speed', speed, step) sys.stdout.flush() avg_loss = 0.0 avg_t_loss = 0.0 avg_d_loss = 0.0 timer.restart() except fluid.core.EOFException: data_loader.reset() break except Exception as e: print(e) if (epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0 or epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0: ckpt_dir = save_checkpoint(fluid.default_main_program(), epoch) if args.do_eval: print("Evaluation start") _, mean_iou, _, mean_acc = evaluate( cfg=cfg, ckpt_dir=ckpt_dir, use_gpu=args.use_gpu, use_mpio=args.use_mpio) if args.use_vdl: log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step) log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step) if mean_iou > best_mIoU: best_mIoU = mean_iou update_best_model(ckpt_dir) print_info("Save best model {} to {}, mIoU = {:.4f}".format( ckpt_dir, os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'), mean_iou)) # Use VisualDL to visualize results if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None: visualize( cfg=cfg, use_gpu=args.use_gpu, vis_file_list=cfg.DATASET.VIS_FILE_LIST, vis_dir="visual", ckpt_dir=ckpt_dir, log_writer=log_writer) if cfg.TRAINER_ID == 0: ckpt_dir = save_checkpoint(fluid.default_main_program(), epoch) # save final model if cfg.TRAINER_ID == 0: save_checkpoint(fluid.default_main_program(), 'final')
def train(hparams, num_epoch, tuning): log_dir = './results/' test_batch_size = 8 # Load dataset training_set, valid_set = make_dataset(BATCH_SIZE=hparams['HP_BS'], file_name='train_tf_record', split=True) test_set = make_dataset(BATCH_SIZE=test_batch_size, file_name='test_tf_record', split=False) class_names = ['NRDR', 'RDR'] # Model model = ResNet() # set optimizer optimizer = tf.keras.optimizers.Adam(learning_rate=hparams['HP_LR']) # set metrics train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy() valid_accuracy = tf.keras.metrics.Accuracy() valid_con_mat = ConfusionMatrix(num_class=2) test_accuracy = tf.keras.metrics.Accuracy() test_con_mat = ConfusionMatrix(num_class=2) # Save Checkpoint if not tuning: ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=optimizer, net=model) manager = tf.train.CheckpointManager(ckpt, './tf_ckpts', max_to_keep=5) # Set up summary writers current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") tb_log_dir = log_dir + current_time + '/train' summary_writer = tf.summary.create_file_writer(tb_log_dir) # Restore Checkpoint if not tuning: ckpt.restore(manager.latest_checkpoint) if manager.latest_checkpoint: logging.info('Restored from {}'.format(manager.latest_checkpoint)) else: logging.info('Initializing from scratch.') @tf.function def train_step(train_img, train_label): # Optimize the model loss_value, grads = grad(model, train_img, train_label) optimizer.apply_gradients(zip(grads, model.trainable_variables)) train_pred, _ = model(train_img) train_label = tf.expand_dims(train_label, axis=1) train_accuracy.update_state(train_label, train_pred) for epoch in range(num_epoch): begin = time() # Training loop for train_img, train_label, train_name in training_set: train_img = data_augmentation(train_img) train_step(train_img, train_label) with summary_writer.as_default(): tf.summary.scalar('Train Accuracy', train_accuracy.result(), step=epoch) for valid_img, valid_label, _ in valid_set: valid_img = tf.cast(valid_img, tf.float32) valid_img = valid_img / 255.0 valid_pred, _ = model(valid_img, training=False) valid_pred = tf.cast(tf.argmax(valid_pred, axis=1), dtype=tf.int64) valid_con_mat.update_state(valid_label, valid_pred) valid_accuracy.update_state(valid_label, valid_pred) # Log the confusion matrix as an image summary cm_valid = valid_con_mat.result() figure = plot_confusion_matrix(cm_valid, class_names=class_names) cm_valid_image = plot_to_image(figure) with summary_writer.as_default(): tf.summary.scalar('Valid Accuracy', valid_accuracy.result(), step=epoch) tf.summary.image('Valid ConfusionMatrix', cm_valid_image, step=epoch) end = time() logging.info( "Epoch {:d} Training Accuracy: {:.3%} Validation Accuracy: {:.3%} Time:{:.5}s" .format(epoch + 1, train_accuracy.result(), valid_accuracy.result(), (end - begin))) train_accuracy.reset_states() valid_accuracy.reset_states() valid_con_mat.reset_states() if not tuning: if int(ckpt.step) % 5 == 0: save_path = manager.save() logging.info('Saved checkpoint for epoch {}: {}'.format( int(ckpt.step), save_path)) ckpt.step.assign_add(1) for test_img, test_label, _ in test_set: test_img = tf.cast(test_img, tf.float32) test_img = test_img / 255.0 test_pred, _ = model(test_img, training=False) test_pred = tf.cast(tf.argmax(test_pred, axis=1), dtype=tf.int64) test_accuracy.update_state(test_label, test_pred) test_con_mat.update_state(test_label, test_pred) cm_test = test_con_mat.result() # Log the confusion matrix as an image summary figure = plot_confusion_matrix(cm_test, class_names=class_names) 
cm_test_image = plot_to_image(figure) with summary_writer.as_default(): tf.summary.scalar('Test Accuracy', test_accuracy.result(), step=epoch) tf.summary.image('Test ConfusionMatrix', cm_test_image, step=epoch) logging.info("Trained finished. Final Accuracy in test set: {:.3%}".format( test_accuracy.result())) # Visualization if not tuning: for vis_img, vis_label, vis_name in test_set: vis_label = vis_label[0] vis_name = vis_name[0] vis_img = tf.cast(vis_img[0], tf.float32) vis_img = tf.expand_dims(vis_img, axis=0) vis_img = vis_img / 255.0 with tf.GradientTape() as tape: vis_pred, conv_output = model(vis_img, training=False) pred_label = tf.argmax(vis_pred, axis=-1) vis_pred = tf.reduce_max(vis_pred, axis=-1) grad_1 = tape.gradient(vis_pred, conv_output) weight = tf.reduce_mean(grad_1, axis=[1, 2]) / grad_1.shape[1] act_map0 = tf.nn.relu( tf.reduce_sum(weight * conv_output, axis=-1)) act_map0 = tf.squeeze(tf.image.resize(tf.expand_dims(act_map0, axis=-1), (256, 256), antialias=True), axis=-1) plot_map(vis_img, act_map0, vis_pred, pred_label, vis_label, vis_name) break return test_accuracy.result()
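# `plot_confusion_matrix` and `plot_to_image` follow the common TensorBoard pattern of rendering
# a matplotlib figure and logging it with tf.summary.image. A minimal sketch of `plot_to_image`
# under that assumption:
import io
import matplotlib.pyplot as plt
import tensorflow as tf

def plot_to_image(figure):
    # Hypothetical helper: render a matplotlib figure to a PNG image tensor for tf.summary.image.
    buf = io.BytesIO()
    figure.savefig(buf, format='png')
    plt.close(figure)
    buf.seek(0)
    image = tf.image.decode_png(buf.getvalue(), channels=4)
    return tf.expand_dims(image, 0)  # add a batch dimension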
            trees_number, m=attributes_in_division, pool=pool, seed=seed)
    end = time.time()
    if verbose > 1:
        print("=" * 50)
        print("Forest {} generation time: {}s".format(
            i + 1, "{0:.3f}".format(end - start)))
    results = forest.predict_df(test)
    total_results.append(results)
    if verbose > 1:
        confusion_matrix = ConfusionMatrix(results)
        confusion_matrix.show(verbose=(verbose > 2))

final_confusion_matrix = ConfusionMatrix(pd.concat(total_results))
total_end = time.time()

print("=" * 50)
print(f"Results for {data_path.replace('.csv', '')}:")
print(
    f"Params: k_folds: {k_folds_number}; ntree: {trees_number}; m: {attributes_in_division}; seed: {seed}"
)
final_confusion_matrix.show(verbose=(verbose > 0))

execution_time = total_end - total_start
print(f"Total processing time: {execution_time:0.3f}s")
def train(cfg):
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    drop_last = True

    dataset = SegDataset(file_list=cfg.DATASET.TRAIN_FILE_LIST,
                         mode=ModelPhase.TRAIN,
                         shuffle=True,
                         data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If using the sync batch norm strategy, drop the last batch when the
        # number of samples in batch_data is less than cfg.BATCH_SIZE to avoid
        # NCCL hang issues
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    # places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    # place = places[0]
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Get number of GPUs
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#Device count: {}".format(dev_count))

    # Make sure BATCH_SIZE is divisible by the number of GPU cards
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisible by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))
    # In multi-GPU training mode, batch data is allocated to each GPU evenly
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    print_info("batch_size_per_dev: {}".format(batch_size_per_dev))

    py_reader, avg_loss, lr, pred, grts, masks = build_model(
        train_prog, startup_prog, phase=ModelPhase.TRAIN)
    py_reader.decorate_sample_generator(data_generator,
                                        batch_size=batch_size_per_dev,
                                        drop_last=drop_last)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # Clear temporary variables every 100 iterations
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()

    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply sync batch norm strategy
            print_info("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print_info("Sync BatchNorm strategy will not be effective if GPU device"
                       " count <= 1")
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, train_prog)
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        print_info('Pretrained model dir: ', cfg.TRAIN.PRETRAINED_MODEL_DIR)
        load_vars = []
        load_fail_vars = []

        def var_shape_matched(var, shape):
            """
            Check whether the persistable variable's shape matches the
            current network.
            """
            var_exist = os.path.exists(
                os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name))
            if var_exist:
                var_shape = parse_shape_from_file(
                    os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name))
                return var_shape == shape
            return False

        for x in train_prog.list_vars():
            if isinstance(x, fluid.framework.Parameter):
                shape = tuple(fluid.global_scope().find_var(
                    x.name).get_tensor().shape())
                if var_shape_matched(x, shape):
                    load_vars.append(x)
                else:
                    load_fail_vars.append(x)

        fluid.io.load_vars(exe,
                           dirname=cfg.TRAIN.PRETRAINED_MODEL_DIR,
                           vars=load_vars)
        for var in load_vars:
            print_info("Parameter[{}] loaded successfully!".format(var.name))
        for var in load_fail_vars:
            print_info(
                "Parameter[{}] doesn't exist or its shape does not match the"
                " current network; skipping it.".format(var.name))
        print_info("{}/{} pretrained parameters loaded successfully!".format(
            len(load_vars), len(load_vars) + len(load_fail_vars)))
    else:
        print_info(
            'Pretrained model dir {} does not exist, training from scratch...'.
            format(cfg.TRAIN.PRETRAINED_MODEL_DIR))

    fetch_list = [avg_loss.name, lr.name]
    if args.debug:
        # Fetch more variable info and use a streaming confusion matrix to
        # calculate IoU results when in debug mode
        np.set_printoptions(precision=4,
                            suppress=True,
                            linewidth=160,
                            floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    if args.use_tb:
        if not args.tb_log_dir:
            print_info("Please specify the log directory by --tb_log_dir.")
            exit(1)
        from tb_paddle import SummaryWriter
        log_writer = SummaryWriter(args.tb_log_dir)

    # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    global_step = 0
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and not drop_last:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    avg_loss = 0.0
    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError(
            ("begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        py_reader.start()
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # training process matches expectations
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    global_step += 1

                    if global_step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print_info((
                            "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, global_step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - global_step, speed)))
                        print_info("Category IoU: ", category_iou)
                        print_info("Category Acc: ", category_acc)
                        if args.use_tb:
                            log_writer.add_scalar('Train/mean_iou', mean_iou,
                                                  global_step)
                            log_writer.add_scalar('Train/mean_acc', mean_acc,
                                                  global_step)
                            log_writer.add_scalar('Train/loss', avg_loss,
                                                  global_step)
                            log_writer.add_scalar('Train/lr', lr[0],
                                                  global_step)
                            log_writer.add_scalar('Train/step/sec', speed,
                                                  global_step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnecessary logging and calculation
                    loss, lr = exe.run(program=compiled_train_prog,
                                       fetch_list=fetch_list,
                                       return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    global_step += 1

                    if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                        avg_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, global_step, lr[0], avg_loss, speed,
                                 calculate_eta(all_step - global_step, speed)))
                        if args.use_tb:
                            log_writer.add_scalar('Train/loss', avg_loss,
                                                  global_step)
                            log_writer.add_scalar('Train/lr', lr[0],
                                                  global_step)
                            log_writer.add_scalar('Train/speed', speed,
                                                  global_step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        timer.restart()

            except fluid.core.EOFException:
                py_reader.reset()
                break
            except Exception as e:
                print(e)

        if epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0 and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(exe, train_prog, epoch)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(cfg=cfg,
                                                    ckpt_dir=ckpt_dir,
                                                    use_gpu=args.use_gpu,
                                                    use_mpio=args.use_mpio)
                if args.use_tb:
                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou,
                                          global_step)
                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc,
                                          global_step)

            # Use Tensorboard to visualize results
            if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(cfg=cfg,
                          use_gpu=args.use_gpu,
                          vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                          vis_dir="visual",
                          ckpt_dir=ckpt_dir,
                          log_writer=log_writer)

    # Save final model
    if cfg.TRAINER_ID == 0:
        save_checkpoint(exe, train_prog, 'final')
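# NOTE (added example): the debug branch above relies on a streaming confusion
# matrix exposing calculate(), accuracy(), mean_iou() and zero_matrix(). Below is a
# minimal NumPy sketch of that interface, showing how per-category accuracy and IoU
# fall out of an accumulated confusion matrix; it is an illustration under those
# assumptions, not the actual PaddleSeg implementation.
import numpy as np

class StreamingConfusionMatrix:
    def __init__(self, num_classes):
        self.num_classes = num_classes
        self.cm = np.zeros((num_classes, num_classes), dtype=np.int64)

    def calculate(self, pred, label, mask):
        # Accumulate counts for valid (unmasked) pixels only.
        valid = mask.flatten().astype(bool)
        pred = pred.flatten()[valid].astype(np.int64)
        label = label.flatten()[valid].astype(np.int64)
        idx = label * self.num_classes + pred
        self.cm += np.bincount(idx, minlength=self.num_classes ** 2).reshape(
            self.num_classes, self.num_classes)

    def accuracy(self):
        # Per-category accuracy (recall) and overall pixel accuracy.
        category_acc = np.diag(self.cm) / np.maximum(self.cm.sum(axis=1), 1)
        mean_acc = np.diag(self.cm).sum() / np.maximum(self.cm.sum(), 1)
        return category_acc, mean_acc

    def mean_iou(self):
        # IoU_c = TP_c / (TP_c + FP_c + FN_c); mean IoU is the class average.
        tp = np.diag(self.cm)
        union = self.cm.sum(axis=0) + self.cm.sum(axis=1) - tp
        category_iou = tp / np.maximum(union, 1)
        return category_iou, category_iou.mean()

    def zero_matrix(self):
        self.cm[:] = 0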
    torch.load(
        "/home/chenwy/GV/suvr/jobs/saved_models/clf_ad1nl0_mri50_hippo30_lrflip_lenet_10.17.18.focal5.lr1e4.best.pth"
    ))
criterion = FocalLoss(gamma=5)
# criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(),
                             lr=learning_rate,
                             weight_decay=1e-5)
if not evaluation:
    writer = SummaryWriter(log_dir=os.path.join(log_path, task_name))
    f_log = open(log_path + task_name + ".log", 'w')
# metrics = ConfusionMatrix(3)
metrics = ConfusionMatrix(2)
best_pred = 0
best_pred_acc = None

for epoch in range(num_epochs):
    classifier.train()
    # The latest model uses the setting below: mri + age + gender + edu + apoe as input
    ######################
    for i_batch, sample_batched in enumerate(tqdm(dataloader_train)):
        if evaluation:
            break
        images, lefts, rights, ages, genders, edus, apoes, labels = Variable(
            sample_batched['mri']).cuda(), Variable(
                sample_batched['left']).cuda(), Variable(
                    sample_batched['right']).cuda(), Variable(
                        sample_batched['age']).cuda(), Variable(
                            sample_batched['gender']).cuda(), Variable(
                                sample_batched['edu']).cuda(), Variable(
                                    sample_batched['apoe']).cuda(), Variable(
def main():
    '''
    Use this script to run experiments and fine-tune the algorithms
    '''
    # Load the dataset
    file_name = sys.argv[1]
    dirname = os.path.dirname(__file__)
    file_name = os.path.join(dirname, file_name)
    d = DataSet(file_name)
    d.loadDataSet()

    # Remove useless features (non-numeric + bad regressors).
    to_remove = [
        d.data_set[0].index('Index'),
        d.data_set[0].index('First Name'),
        d.data_set[0].index('Last Name'),
        d.data_set[0].index('Birthday'),
        d.data_set[0].index('Best Hand'),
        d.data_set[0].index('Hogwarts House'),
        # Tests 7/10/18
        d.data_set[0].index('Arithmancy'),
        d.data_set[0].index('Defense Against the Dark Arts'),
        d.data_set[0].index('Divination'),
        d.data_set[0].index('Muggle Studies'),
        d.data_set[0].index('History of Magic'),
        d.data_set[0].index('Transfiguration'),
        d.data_set[0].index('Potions'),
        d.data_set[0].index('Care of Magical Creatures'),
        d.data_set[0].index('Charms'),
        d.data_set[0].index('Flying'),
    ]
    X = np.array([[
        d.data_set[i][j] for j in range(len(d.data_set[0]))
        if j not in to_remove
    ] for i in range(len(d.data_set))])
    X = convert_to_float(X[1:, ])
    y_col_nb = d.data_set[0].index('Hogwarts House')
    y = np.array(d.extractColumn(y_col_nb)[1:])

    # Impute missing values
    m = MeanImputation(X)
    m.train()
    m.transform()

    # Scale the variables
    sc = Scaling(X)
    sc.train()
    sc.transform()

    # Split the dataset into a training and a testing set
    sp = SplitTrainTest(X, y)
    sp.Split()
    X_train = sp.X_train
    y_train = sp.y_train
    X_test = sp.X_test
    y_test = sp.y_test

    # Train a logistic regression model
    l = LogisticRegression(X=X_train, y=y_train)
    l.train()

    # Compute the confusion matrix over the training set
    y_predicted = l.predict()
    cm1 = ConfusionMatrix(y_train, y_predicted)
    cm1.getMatrix()
    print('\n\n')
    print('**************** Confusion Matrix on the training set ****************')
    print('\n')
    cm1.Print()

    # Compute the confusion matrix over the testing set
    y_predicted = l.predict(X_test)
    cm2 = ConfusionMatrix(y_test, y_predicted, cm1.unique_labels)
    cm2.getMatrix()
    print('\n\n')
    print('**************** Confusion Matrix on the testing set ****************')
    print('\n')
    cm2.Print()
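# NOTE (added example): the two ConfusionMatrix objects above are built from raw
# label vectors. For reference, a minimal sketch of computing such a matrix and
# per-class precision/recall with NumPy follows; it only assumes y_true/y_pred are
# 1-D arrays of class labels and is not the project's own ConfusionMatrix class.
import numpy as np

def confusion_matrix_from_labels(y_true, y_pred):
    labels = sorted(set(y_true) | set(y_pred))
    index = {label: i for i, label in enumerate(labels)}
    cm = np.zeros((len(labels), len(labels)), dtype=np.int64)
    for t, p in zip(y_true, y_pred):
        cm[index[t], index[p]] += 1  # rows: true class, columns: predicted class
    return labels, cm

def precision_recall(cm):
    tp = np.diag(cm).astype(float)
    precision = tp / np.maximum(cm.sum(axis=0), 1)  # per predicted class
    recall = tp / np.maximum(cm.sum(axis=1), 1)     # per true class
    return precision, recall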
def train(unit, dropout, learning_rate, num_epoch, tuning=True):
    num_epoch = int(num_epoch)
    log_dir = './results/'

    # Load dataset
    path = os.getcwd()
    train_file = path + '/hapt_tfrecords/hapt_train.tfrecords'
    val_file = path + '/hapt_tfrecords/hapt_val.tfrecords'
    test_file = path + '/hapt_tfrecords/hapt_test.tfrecords'
    train_dataset = make_dataset(train_file, overlap=True)
    val_dataset = make_dataset(val_file)
    test_dataset = make_dataset(test_file)
    class_names = [
        'WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING',
        'STANDING', 'LAYING', 'STAND_TO_SIT', 'SIT_TO_STAND', 'SIT_TO_LIE',
        'LIE_TO_SIT', 'STAND_TO_LIE', 'LIE_TO_STAND'
    ]

    # Pick a random batch index to visualize the result on the test dataset.
    len_test = len(list(test_dataset))
    show_index = random.randint(10, len_test)

    # Model
    model = Lstm(unit=unit, drop_out=dropout)

    # Set optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # Set metrics
    train_accuracy = tf.keras.metrics.CategoricalAccuracy()
    val_accuracy = tf.keras.metrics.Accuracy()
    val_con_mat = ConfusionMatrix(num_class=13)
    test_accuracy = tf.keras.metrics.Accuracy()
    test_con_mat = ConfusionMatrix(num_class=13)

    # Save checkpoint
    if not tuning:
        ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=optimizer, net=model)
        manager = tf.train.CheckpointManager(ckpt, './tf_ckpts', max_to_keep=5)

    # Set up summary writers
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tb_log_dir = log_dir + current_time
    summary_writer = tf.summary.create_file_writer(tb_log_dir)

    # Restore checkpoint
    if not tuning:
        ckpt.restore(manager.latest_checkpoint)
        if manager.latest_checkpoint:
            logging.info('Restored from {}'.format(manager.latest_checkpoint))
        else:
            logging.info('Initializing from scratch.')

    # Calculate losses, update the network and the metrics.
    @tf.function
    def train_step(inputs, label):
        # Optimize the model
        loss_value, grads = grad(model, inputs, label)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        train_pred = model(inputs, training=True)
        train_pred = tf.squeeze(train_pred)
        label = tf.squeeze(label)
        train_accuracy.update_state(label, train_pred, sample_weight=sample_weight)

    for epoch in range(num_epoch):
        begin = time()

        # Training loop
        for exp_num, index, label, train_inputs in train_dataset:
            train_inputs = tf.expand_dims(train_inputs, axis=0)
            # One-hot encoding is applied.
            label = label - 1
            sample_weight = tf.cast(tf.math.not_equal(label, -1), tf.int64)
            label = tf.expand_dims(tf.one_hot(label, depth=12), axis=0)
            train_step(train_inputs, label)

        # Validation loop
        for exp_num, index, label, val_inputs in val_dataset:
            val_inputs = tf.expand_dims(val_inputs, axis=0)
            sample_weight = tf.cast(
                tf.math.not_equal(label, tf.constant(0, dtype=tf.int64)), tf.int64)
            val_pred = model(val_inputs, training=False)
            val_pred = tf.squeeze(val_pred)
            val_pred = tf.cast(tf.argmax(val_pred, axis=1), dtype=tf.int64) + 1
            val_con_mat.update_state(label, val_pred, sample_weight=sample_weight)
            val_accuracy.update_state(label, val_pred, sample_weight=sample_weight)

        # Log the confusion matrix as an image summary
        cm_valid = val_con_mat.result()
        figure = plot_confusion_matrix(cm_valid, class_names=class_names)
        cm_valid_image = plot_to_image(figure)

        with summary_writer.as_default():
            tf.summary.scalar('Train Accuracy', train_accuracy.result(), step=epoch)
            tf.summary.scalar('Valid Accuracy', val_accuracy.result(), step=epoch)
            tf.summary.image('Valid ConfusionMatrix', cm_valid_image, step=epoch)

        end = time()
        logging.info(
            "Epoch {:d} Training Accuracy: {:.3%} Validation Accuracy: {:.3%} Time:{:.5}s"
            .format(epoch + 1, train_accuracy.result(), val_accuracy.result(),
                    (end - begin)))
        train_accuracy.reset_states()
        val_accuracy.reset_states()
        val_con_mat.reset_states()

        if not tuning:
            if int(ckpt.step) % 5 == 0:
                save_path = manager.save()
                logging.info('Saved checkpoint for epoch {}: {}'.format(
                    int(ckpt.step), save_path))
            ckpt.step.assign_add(1)

    # Test loop
    i = 0
    for exp_num, index, label, test_inputs in test_dataset:
        test_inputs = tf.expand_dims(test_inputs, axis=0)
        sample_weight = tf.cast(
            tf.math.not_equal(label, tf.constant(0, dtype=tf.int64)), tf.int64)
        test_pred = model(test_inputs, training=False)
        test_pred = tf.cast(tf.argmax(test_pred, axis=2), dtype=tf.int64)
        test_pred = tf.squeeze(test_pred, axis=0) + 1
        test_accuracy.update_state(label, test_pred, sample_weight=sample_weight)
        test_con_mat.update_state(label, test_pred, sample_weight=sample_weight)
        i += 1
        # Visualize the result
        if i == show_index:
            if not tuning:
                visualization_path = path + '/visualization/'
                image_path = visualization_path + current_time + '.png'
                inputs = tf.squeeze(test_inputs)
                show(index, label, inputs, test_pred, image_path)

    # Log the confusion matrix as an image summary
    cm_test = test_con_mat.result()
    figure = plot_confusion_matrix(cm_test, class_names=class_names)
    cm_test_image = plot_to_image(figure)
    with summary_writer.as_default():
        tf.summary.scalar('Test Accuracy', test_accuracy.result(), step=epoch)
        tf.summary.image('Test ConfusionMatrix', cm_test_image, step=epoch)

    logging.info("Training finished. Final accuracy on the test set: {:.3%}".format(
        test_accuracy.result()))
    return test_accuracy.result()
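# NOTE (added example): the loop above logs the confusion matrix as an image summary
# through plot_confusion_matrix() and plot_to_image(). A minimal sketch of
# plot_to_image(), in the style of the TensorFlow image-summaries tutorial, is shown
# below for reference; it is an assumed implementation, not necessarily the one used
# in this project.
import io
import matplotlib.pyplot as plt
import tensorflow as tf

def plot_to_image(figure):
    # Render the matplotlib figure to a PNG in memory, then decode it into a
    # 4-D uint8 tensor ([1, height, width, channels]) that tf.summary.image accepts.
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    plt.close(figure)
    buf.seek(0)
    image = tf.image.decode_png(buf.getvalue(), channels=4)
    image = tf.expand_dims(image, 0)
    return image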