def validate(validation_loader, model, loss_fn, device, print_frequency=2, curr_epoch=1, column_split_order=[]):
    history = {
        'loss': [],
        'accuracy':[],
        'batch_time':[],
        'classification_metrics':None,
        'confusion_matrix':None
    }
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    progress = ProgressMeter(
        len(validation_loader),
        [batch_time, data_time, losses, top1],
        prefix="Epoch: [{}]".format(curr_epoch))

    # switch to evaluate mode
    model.eval()
    conf_matrix = None
    if len(column_split_order) > 0:
        conf_matrix = ConfusionMatrix(column_split_order)

    with torch.no_grad():
        # https://github.com/pytorch/pytorch/issues/16417#issuecomment-566654504
        end = time.time()
        for i, (input_ids, attention_mask, labels) in enumerate(validation_loader):
            # measure data loading time
            data_time.update(time.time() - end)

            input_ids = input_ids.to(device, non_blocking=True)
            attention_mask = attention_mask.to(device, non_blocking=True)
            labels = torch.argmax(labels, dim=1).to(device, non_blocking=True)
            # compute output
            output = model(input_ids, attention_mask=attention_mask)

            loss = loss_fn(output, labels)

            # measure accuracy and record loss
            acc1 = accuracy(output, labels, conf_matrix=conf_matrix)

            losses.update(loss.item(), input_ids.size(0))
            top1.update(acc1[0].tolist()[0], input_ids.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if i % print_frequency == 0:
                progress.display(i)
        
        history['accuracy'].append(float(top1.avg))
        history['loss'].append(float(losses.avg))
        history['batch_time'].append(float(batch_time.avg))
        if conf_matrix is not None:
            history['classification_metrics'] = conf_matrix.get_all_metrics()
            history['confusion_matrix'] = str(conf_matrix)

    return history
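A note on the interface this example assumes: `accuracy(...)` fills a label-indexed ConfusionMatrix, and the matrix exposes get_all_metrics() and a string rendering. A minimal numpy sketch of such a class (illustrative only, not the original implementation):

import numpy as np

class ConfusionMatrix:
    """Illustrative sketch: rows are true labels, columns are predictions."""

    def __init__(self, labels):
        self.labels = list(labels)
        n = len(self.labels)
        self.matrix = np.zeros((n, n), dtype=np.int64)

    def update(self, preds, targets):
        # preds/targets: iterables of integer class indices
        for p, t in zip(preds, targets):
            self.matrix[t, p] += 1

    def get_all_metrics(self):
        tp = np.diag(self.matrix).astype(float)
        support = self.matrix.sum(axis=1)      # true counts per class
        predicted = self.matrix.sum(axis=0)    # predicted counts per class
        with np.errstate(divide='ignore', invalid='ignore'):
            recall = np.where(support > 0, tp / support, 0.0)
            precision = np.where(predicted > 0, tp / predicted, 0.0)
        return {'precision': precision.tolist(), 'recall': recall.tolist()}

    def __str__(self):
        return str(self.matrix)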
Example #2
def train(model: nn.Module,
          dataset: Dataset,
          validate_data: Dataset = None) -> None:
    loader = DataLoader(dataset,
                        batch_size=dataset.BATCH_SIZE,
                        shuffle=True)  # shuffle expects a bool, not the batch size

    optimizer = getattr(torch.optim,
                        config.TRAIN.OPTIMIZER)(model.parameters(),
                                                **config.TRAIN.OPTIM_PARAMS)
    overall_iter = 0
    evaluation = ConfusionMatrix(dataset.get_num_class())

    model.train()
    for epoch in range(config.TRAIN.NUM_EPOCHS):
        total_loss = 0
        for batch_idx, samples in enumerate(loader):
            images, target = device([samples['image'], samples['mask']],
                                    gpu=config.USE_GPU)
            outputs = model(images)['out']
            output_mask = outputs.argmax(1)

            batch_loss = Loss.cross_entropy2D(outputs, target, False)
            total_loss += batch_loss.item()
            overall_loss = total_loss / (batch_idx + 1)
            evaluation.update(output_mask, target)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            if batch_idx % config.PRINT_BATCH_FREQ == 0:
                metrics = evaluation()
                logger.info(f'Train Epoch: {epoch}, {batch_idx}')
                logger.info(
                    f'Batch loss: {batch_loss.item():.6f}, Overall loss: {overall_loss:.6f}'
                )
                for met in beautify(metrics[0]):
                    logger.info(f'{met}')
                logger.info('Classwise IoU')
                for met in beautify(metrics[1]):
                    logger.info(f'{met}')
                logger.info("\n")

            overall_iter += 1
            if config.SAVE_ITER_FREQ and overall_iter % config.SAVE_ITER_FREQ == 0:
                torch.save(
                    model.state_dict(),
                    os.path.join(config.LOG_PATH,
                                 config.NAME + f"-iter={overall_iter}"))
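The `device(...)` helper used above is not shown; a plausible sketch inferred from the call site (name and behavior are assumptions):

def device(tensors, gpu=False):
    # Move each tensor in the list to the GPU when `gpu` is set, inferred
    # from `device([samples['image'], samples['mask']], gpu=config.USE_GPU)`.
    return [t.cuda() if gpu else t for t in tensors]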
Example #3
def train_dat1():
    # data = getD1Linear()
    # XTrain, YTrain, XTest, YTest = next(data.getStratifiedKFold())
    # AllX, AllY = data.get()
    data = getD2()
    XTrain, YTrain = data.get("train")
    XTest, YTest = data.get("test")
    AllX, AllY = data.get()

    # layers = [Layer(2, 4), Layer(4, 1)]
    # model = Network(layers, [sigmoid, sigmoid], [dSigmoid, dSigmoid], euclideanLoss, dEuclideanLoss)
    layers = [Layer(64, 40), Layer(40, 10)]
    activations = [acti.sigmoid, acti.softmax]
    dActivations = [acti.dSigmoid, acti.dLinearUnit]
    model = Network(layers, activations, dActivations,
                    lossFuncs.crossEntropyLoss, lossFuncs.dCrossEntropyLoss_dSoftmax,
                    lossFuncs.crossEntropyPredict)

    epochs = 10000
    learningRate = 0.01
    regLambda = 0

    loss = {"train": [0.0] * epochs, "test": [0.0] * epochs}

    YTrainOneHot = make_one_hot(YTrain)
    YTestOneHot = make_one_hot(YTest)
    for epoch in tqdm(range(epochs)):
        # model.fit(XTrain, YTrain, learningRate, regLambda)
        model.fit(XTrain, YTrainOneHot, learningRate, regLambda)
        loss["train"][epoch] = model.getLoss(XTrain, YTrainOneHot, regLambda)
        loss["test"][epoch] = model.getLoss(XTest, YTestOneHot, regLambda)
        if (epoch + 1) % 1000 == 0:
            print("epoch {0}: ltrain: {1:0.6f}, ltest: {2:0.6f}".format(epoch + 1, loss["train"][epoch], loss["test"][epoch]))
            pred_xtest = model.predict(XTest)
            cm = ConfusionMatrix(YTest, pred_xtest, is_one_hot=False)
            print(cm.output_metrics())

    plt.plot([i for i in range(epochs)], loss["train"], label="train")
    plt.plot([i for i in range(epochs)], loss["test"], label="test")
    plt.legend()
    plt.title("Loss Plot")
    plt.xlabel("Epoch")
    plt.show()
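`make_one_hot` is used here but not defined; a minimal sketch assuming integer class labels:

import numpy as np

def make_one_hot(y, num_classes=None):
    # y: 1-D array-like of integer class labels
    y = np.asarray(y, dtype=int).ravel()
    if num_classes is None:
        num_classes = int(y.max()) + 1
    one_hot = np.zeros((y.size, num_classes))
    one_hot[np.arange(y.size), y] = 1.0
    return one_hot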
Example #4
    def build_metrics(self,
                      num_classes: int,
                      class_labels=None,
                      k: int = 5,
                      additional_metrics: list = None) -> MetricsContainer:
        """
        Build different metrics. Metrics that are always included are learning rate, loss and accuracy.

        :param num_classes: Number of classes for the current dataset
        :param class_labels: A list of length 'num_classes' with class labels
        :param k: k for top-k accuracy; if k <= 1: don't include top-k accuracy
        :param additional_metrics: additional metrics in a list
        :return: container that holds all metrics
        """
        # https://medium.com/data-science-in-your-pocket/calculating-precision-recall-for-multi-class-classification-9055931ee229
        # https://towardsdatascience.com/multi-class-metrics-made-simple-part-i-precision-and-recall-9250280bddc2?gi=a28f7efba99e

        is_eval = self.session_type == "evaluation"
        metrics_list = []

        if not is_eval:
            metrics_list.append(Mean("training_loss"))
        metrics_list.append(Mean("validation_loss"))

        if not is_eval:
            metrics_list.append(MultiClassAccuracy("training_accuracy"))

        metrics_list.append(MultiClassAccuracy("validation_accuracy"))

        conf_a = ConfusionMatrix(num_classes,
                                 "validation_confusion",
                                 class_labels=class_labels)
        conf_b = ConfusionMatrix(num_classes,
                                 "training_confusion",
                                 class_labels=class_labels)
        conf_a.write_to_summary_interval = 1 if is_eval else 5
        conf_b.write_to_summary_interval = 1 if is_eval else 5

        if not is_eval:
            metrics_list.append(conf_b)
        metrics_list.append(conf_a)

        # if not is_eval:
        #     chart = AccuracyBarChart(num_classes, "train_val_diff", class_labels)
        #     chart.write_to_summary_interval = 5
        #     metrics_list.append(chart)

        if k > 1:
            if not is_eval:
                metrics_list.append(
                    TopKAccuracy(f"training_top{k}_accuracy", k=k))
            metrics_list.append(
                TopKAccuracy(f"validation_top{k}_accuracy", k=k))

        if additional_metrics:
            metrics_list.extend(additional_metrics)

        if not is_eval:
            metrics_list.append(SimpleMetric("lr"))
        return MetricsContainer(metrics_list)
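The articles linked in the docstring derive per-class precision and recall from a confusion matrix; a compact numpy illustration of those formulas (standalone, not tied to the metric classes above):

import numpy as np

cm = np.array([[5, 1, 0],     # rows = true class
               [2, 3, 1],     # cols = predicted class
               [0, 0, 8]])
tp = np.diag(cm).astype(float)
precision = tp / cm.sum(axis=0)   # per predicted class
recall = tp / cm.sum(axis=1)      # per true class
print(precision, recall, precision.mean(), recall.mean())  # macro averages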
Example #5
    def _run_evaluate(self, test_data):
        pr_curve_data = []
        cm = ConfusionMatrix(self.classes)
        accuracy_list = []
        # test_iter = self._create_iter(test_data, self.config.wbatchsize,
        #                               random_shuffler=utils.identity_fun)
        test_iter = self.chunks(test_data)
        for test_batch in test_iter:
            test_batch = batch_utils.seq_pad_concat(test_batch, -1)
            pred, acc = self._predict_batch(cm, test_batch)
            accuracy_list.append(acc)
            pr_curve_data.append(
                (F.softmax(pred, -1)[:, 1].data, test_batch.labels.data))
        accuracy = 100 * (sum(accuracy_list) / len(test_data))
        return cm, accuracy, pr_curve_data
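`self.chunks` is referenced but not shown; a plausible fixed-size batching generator (the batch size is hypothetical):

def chunks(data, batch_size=32):
    # Yield consecutive slices of `data` of at most `batch_size` items.
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]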
Example #6
    def _run_evaluate(self, test_data, addn_test):
        pr_curve_data = []
        cm = ConfusionMatrix(self.classes)
        accuracy_list = []
        # test_iter = self._create_iter(test_data, self.config.wbatchsize,
        #                               random_shuffler=utils.identity_fun)
        test_iter = self.chunks(test_data)

        for batch_index, test_batch in enumerate(test_iter):
            addn_batch = addn_test[batch_index * 15:(batch_index + 1) * 15]
            test_batch = batch_utils.seq_pad_concat(test_batch, -1)
            # print(addn_batch.shape)
            try:
                pred, acc = self._predict_batch(cm, test_batch, addn_batch)
            except Exception:
                # skip batches where prediction fails (e.g. shape mismatch)
                continue
            accuracy_list.append(acc)
            pr_curve_data.append(
                (F.softmax(pred, -1)[:, 1].data, test_batch.labels.data))
        accuracy = 100 * (sum(accuracy_list) / len(test_data))
        return cm, accuracy, pr_curve_data
Example #7
    def update_settings(self):

        self.sord = SORDLoss(n_classes=self.hparams.num_classes, masking=self.hparams.masking, ranks=self.hparams.ranks, dist=self.hparams.dist, alpha=self.hparams.dist_alpha)
        self.ce = nn.CrossEntropyLoss(ignore_index=-1)
        self.kl = KLLoss(n_classes=self.hparams.num_classes, masking=self.hparams.masking)
        self.loss = CompareLosses(n_classes=self.hparams.num_classes, masking=self.hparams.masking, ranks=self.hparams.ranks, dist=self.hparams.dist, returnloss="kl")
        self.dist = Mistakes(ranks=self.hparams.ranks)
        # self.IoU = IoU(num_classes=self.hparams.num_classes, ignore_index=self.hparams.ignore_index)
        self.hparams.labels_orig = list(range(self.hparams.num_classes))
        self.IoU = MaskedIoU(labels=self.hparams.labels_orig)

        self.num_cls = 3 if self.hparams.mode == "convert" else self.hparams.num_classes
        self.hparams.labels_conv = list(range(self.num_cls))

        self.CM = ConfusionMatrix(labels=self.hparams.labels_conv)
        # self.IoU_conv = IoU(num_classes=self.num_cls, ignore_index=0)
        self.IoU_conv = MaskedIoU(labels=self.hparams.labels_conv)

        self.result_folder = f"results/{self.hparams.dataset}/"
        self.hparams.save_prefix = f"{timestamp}-{self.hparams.dataset}-c{self.hparams.num_classes}-{self.hparams.loss}"
        if self.hparams.loss == "sord":
            self.hparams.save_prefix += f'-{",".join([str(r) for r in self.hparams.ranks])}'
            self.hparams.save_prefix += f'-a{self.hparams.dist_alpha}-{self.hparams.dist}'
        if self.hparams.loss_weight:
            self.hparams.save_prefix += "-lw"
        self.hparams.save_prefix += f'-{",".join(self.hparams.modalities)}'
        logger.info(self.hparams.save_prefix)
        if self.hparams.save_xp is None:
            create_folder(f"{self.result_folder}/viz_per_epoch")
            create_folder(f"{self.result_folder}/gt")
            create_folder(f"{self.result_folder}/orig")

        if self.hparams.loss == "compare":
            create_folder(f"results/loss_weight/{self.hparams.dataset}")
Example #8
    def _run_epoch(self, train_data, dev_data, unlabel_data, addn_data,
                   addn_data_unlab, addn_dev, ek, ek_t, ek_u, graph_embs,
                   graph_embs_t, graph_embs_u):
        addn_dev.cuda()
        ek_t.cuda()
        graph_embs_t.cuda()
        report_stats = utils.Statistics()
        cm = ConfusionMatrix(self.classes)
        _, seq_data = list(zip(*train_data))
        total_seq_words = len(list(itertools.chain.from_iterable(seq_data)))
        iter_per_epoch = (1.5 * total_seq_words) // self.config.wbatchsize

        self.encoder.train()
        self.clf.train()

        train_iter = self._create_iter(train_data, self.config.wbatchsize)

        unlabel_iter = self._create_iter(unlabel_data,
                                         self.config.wbatchsize_unlabel)

        sofar = 0
        sofar_1 = 0
        for batch_index, train_batch_raw in enumerate(train_iter):
            seq_iter = list(zip(*train_batch_raw))[1]
            seq_words = len(list(itertools.chain.from_iterable(seq_iter)))
            report_stats.n_words += seq_words
            self.global_steps += 1

            # self.enc_clf_opt.zero_grad()
            if self.config.add_noise:
                train_batch_raw = add_noise(train_batch_raw,
                                            self.config.noise_dropout,
                                            self.config.random_permutation)
            train_batch = batch_utils.seq_pad_concat(train_batch_raw, -1)

            train_embedded = self.embedder(train_batch)

            memory_bank_train, enc_final_train = self.encoder(
                train_embedded, train_batch)

            if self.config.lambda_vat > 0 or self.config.lambda_ae > 0 or self.config.lambda_entropy > 0:
                try:
                    unlabel_batch_raw = next(unlabel_iter)
                except StopIteration:
                    unlabel_iter = self._create_iter(
                        unlabel_data, self.config.wbatchsize_unlabel)
                    unlabel_batch_raw = next(unlabel_iter)

                if self.config.add_noise:
                    unlabel_batch_raw = add_noise(
                        unlabel_batch_raw, self.config.noise_dropout,
                        self.config.random_permutation)
                unlabel_batch = batch_utils.seq_pad_concat(
                    unlabel_batch_raw, -1)
                unlabel_embedded = self.embedder(unlabel_batch)
                memory_bank_unlabel, enc_final_unlabel = self.encoder(
                    unlabel_embedded, unlabel_batch)
                addn_batch_unlab = retAddnBatch(addn_data_unlab,
                                                memory_bank_unlabel.shape[0],
                                                sofar_1).cuda()
                ek_batch_unlab = retAddnBatch(ek_u,
                                              memory_bank_unlabel.shape[0],
                                              sofar_1).cuda()
                graph_embs_unlab = retAddnBatch(graph_embs_u,
                                                memory_bank_unlabel.shape[0],
                                                sofar_1).cuda()
                sofar_1 += addn_batch_unlab.shape[0]
                if sofar_1 >= ek_u.shape[0]:
                    sofar_1 = 0
            addn_batch = retAddnBatch(addn_data, memory_bank_train.shape[0],
                                      sofar).cuda()
            ek_batch = retAddnBatch(ek, memory_bank_train.shape[0],
                                    sofar).cuda()
            graph_embs_batch = retAddnBatch(graph_embs,
                                            memory_bank_train.shape[0],
                                            sofar).cuda()
            sofar += addn_batch.shape[0]
            if sofar >= ek.shape[0]:
                sofar = 0
            pred = self.clf(memory_bank_train, addn_batch, ek_batch,
                            enc_final_train, graph_embs_batch)
            accuracy = self.get_accuracy(cm, pred.data,
                                         train_batch.labels.data)
            lclf = self.clf_loss(pred, train_batch.labels)

            lat = Variable(
                torch.FloatTensor([-1.]).type(batch_utils.FLOAT_TYPE))
            lvat = Variable(
                torch.FloatTensor([-1.]).type(batch_utils.FLOAT_TYPE))
            if self.config.lambda_at > 0:
                lat = at_loss(
                    self.embedder,
                    self.encoder,
                    self.clf,
                    train_batch,
                    addn_batch,
                    ek_batch,
                    graph_embs_batch,
                    perturb_norm_length=self.config.perturb_norm_length)

            if self.config.lambda_vat > 0:
                lvat_train = vat_loss(
                    self.embedder,
                    self.encoder,
                    self.clf,
                    train_batch,
                    addn_batch,
                    ek_batch,
                    graph_embs_batch,
                    p_logit=pred,
                    perturb_norm_length=self.config.perturb_norm_length)
                if self.config.inc_unlabeled_loss:
                    if memory_bank_unlabel.shape[0] != ek_batch_unlab.shape[0]:
                        print(
                            f'Skipping; Unequal Shapes: {memory_bank_unlabel.shape} and {ek_batch_unlab.shape}'
                        )
                        continue
                    else:
                        lvat_unlabel = vat_loss(
                            self.embedder,
                            self.encoder,
                            self.clf,
                            unlabel_batch,
                            addn_batch_unlab,
                            ek_batch_unlab,
                            graph_embs_unlab,
                            p_logit=self.clf(memory_bank_unlabel,
                                             addn_batch_unlab, ek_batch_unlab,
                                             enc_final_unlabel,
                                             graph_embs_unlab),
                            perturb_norm_length=self.config.perturb_norm_length
                        )
                    if self.config.unlabeled_loss_type == "AvgTrainUnlabel":
                        lvat = 0.5 * (lvat_train + lvat_unlabel)
                    elif self.config.unlabeled_loss_type == "Unlabel":
                        lvat = lvat_unlabel
                else:
                    lvat = lvat_train

            lentropy = Variable(
                torch.FloatTensor([-1.]).type(batch_utils.FLOAT_TYPE))
            if self.config.lambda_entropy > 0:
                lentropy_train = entropy_loss(pred)
                if self.config.inc_unlabeled_loss:
                    lentropy_unlabel = entropy_loss(
                        self.clf(memory_bank_unlabel, addn_batch_unlab,
                                 ek_batch_unlab, enc_final_unlabel,
                                 graph_embs_unlab))
                    if self.config.unlabeled_loss_type == "AvgTrainUnlabel":
                        lentropy = 0.5 * (lentropy_train + lentropy_unlabel)
                    elif self.config.unlabeled_loss_type == "Unlabel":
                        lentropy = lentropy_unlabel
                else:
                    lentropy = lentropy_train

            lae = Variable(
                torch.FloatTensor([-1.]).type(batch_utils.FLOAT_TYPE))
            if self.config.lambda_ae > 0:
                lae = self.ae(memory_bank_unlabel, enc_final_unlabel,
                              unlabel_batch.sent_len, unlabel_batch_raw)

            ltotal = (self.config.lambda_clf * lclf) + \
                     (self.config.lambda_ae * lae) + \
                     (self.config.lambda_at * lat) + \
                     (self.config.lambda_vat * lvat) + \
                     (self.config.lambda_entropy * lentropy)

            report_stats.clf_loss += lclf.data.cpu().numpy()
            report_stats.at_loss += lat.data.cpu().numpy()
            report_stats.vat_loss += lvat.data.cpu().numpy()
            report_stats.ae_loss += lae.data.cpu().numpy()
            report_stats.entropy_loss += lentropy.data.cpu().numpy()
            report_stats.n_sent += len(pred)
            report_stats.n_correct += accuracy
            self.enc_clf_opt.zero_grad()
            ltotal.backward()

            params_list = self._get_trainabe_modules()
            # Exclude the embedder from the norm constraint when using AT or VAT
            if not self.config.normalize_embedding:
                params_list += list(self.embedder.parameters())

            norm = torch.nn.utils.clip_grad_norm(params_list,
                                                 self.config.max_norm)
            report_stats.grad_norm += norm
            self.enc_clf_opt.step()
            if self.config.scheduler == "ExponentialLR":
                self.scheduler.step()
            self.ema_embedder.apply(self.embedder.named_parameters())
            self.ema_encoder.apply(self.encoder.named_parameters())
            self.ema_clf.apply(self.clf.named_parameters())

            report_func(self.epoch, batch_index, iter_per_epoch, self.time_s,
                        report_stats, self.config.report_every, self.logger)

            if self.global_steps % self.config.eval_steps == 0:
                cm_, accuracy, prc_dev = self._run_evaluate(
                    dev_data, addn_dev, ek_t, graph_embs_t)
                self.logger.info(
                    "- dev accuracy {} | best dev accuracy {} ".format(
                        accuracy, self.best_accuracy))
                self.writer.add_scalar("Dev_Accuracy", accuracy,
                                       self.global_steps)
                pred_, lab_ = zip(*prc_dev)
                pred_ = torch.cat(pred_)
                lab_ = torch.cat(lab_)
                self.writer.add_pr_curve("Dev PR-Curve", lab_, pred_,
                                         self.global_steps)
                pprint.pprint(cm_)
                pprint.pprint(cm_.get_all_metrics())
                if accuracy > self.best_accuracy:
                    self.logger.info("- new best score!")
                    self.best_accuracy = accuracy
                    self._save_model()
                if self.config.scheduler == "ReduceLROnPlateau":
                    self.scheduler.step(accuracy)
                self.encoder.train()
                #                 self.embedder.train()
                self.clf.train()

                if self.config.weight_decay > 0:
                    print(">> Square Norm: %1.4f " % self._get_l2_norm_loss())

        cm, train_accuracy, _ = self._run_evaluate(train_data, addn_data, ek,
                                                   graph_embs)
        self.logger.info("- Train accuracy  {}".format(train_accuracy))
        pprint.pprint(cm.get_all_metrics())

        cm, dev_accuracy, _ = self._run_evaluate(dev_data, addn_dev, ek_t,
                                                 graph_embs_t)
        self.logger.info("- Dev accuracy  {} | best dev accuracy {}".format(
            dev_accuracy, self.best_accuracy))
        pprint.pprint(cm.get_all_metrics())
        self.writer.add_scalars("Overall_Accuracy", {
            "Train_Accuracy": train_accuracy,
            "Dev_Accuracy": dev_accuracy
        }, self.global_steps)
        return dev_accuracy
Example #9
def main():

    file_name = sys.argv[1]
    dirname = os.path.dirname(__file__)
    file_name = os.path.join(dirname, file_name)

    d = DataSet(file_name)
    d.loadDataSet()

    to_remove = [
        d.data_set[0].index('Index'),
        d.data_set[0].index('First Name'),
        d.data_set[0].index('Last Name'),
        d.data_set[0].index('Birthday'),
        d.data_set[0].index('Best Hand'),
        d.data_set[0].index('Hogwarts House'),

        # Tests 7/10/18
        d.data_set[0].index('Arithmancy'),
        d.data_set[0].index('Defense Against the Dark Arts'),
        d.data_set[0].index('Divination'),
        d.data_set[0].index('Muggle Studies'),
        d.data_set[0].index('History of Magic'),
        d.data_set[0].index('Transfiguration'),
        d.data_set[0].index('Potions'),
        d.data_set[0].index('Care of Magical Creatures'),
        d.data_set[0].index('Charms'),
        d.data_set[0].index('Flying'),
    ]

    X = np.array([[
        d.data_set[i][j] for j in range(len(d.data_set[0]))
        if j not in to_remove
    ] for i in range(len(d.data_set))])
    X = convert_to_float(X[1:, ])

    y_col_nb = d.data_set[0].index('Hogwarts House')
    y = np.array(d.extractColumn(y_col_nb)[1:])

    m = MeanImputation(X)
    m.train()
    m.transform()

    sc = Scaling(X)
    sc.train()
    sc.transform()

    sp = SplitTrainTest(X, y)
    sp.Split()
    X_train = sp.X_train
    y_train = sp.y_train
    X_test = sp.X_test
    y_test = sp.y_test

    clf = LogisticRegression(X=X_train,
                             y=y_train,
                             optimizer='sgd',
                             optimizer_params={
                                 'alpha': 0.5,
                                 'n': 5,
                                 'batch_size': 16
                             })
    clf.train()

    y_predicted = clf.predict()

    cm1 = ConfusionMatrix(y_train, y_predicted)
    cm1.getMatrix()
    print('\n\n')
    print(
        '**************** Confusion Matrix on the training set ****************'
    )
    print('\n')
    cm1.Print()

    y_predicted = clf.predict(X_test)

    cm2 = ConfusionMatrix(y_test, y_predicted, cm1.unique_labels)
    cm2.getMatrix()
    print('\n\n')
    print(
        '**************** Confusion Matrix on the testing set ****************'
    )
    print('\n')
    cm2.Print()
Example #10
def evaluate(cfg, ckpt_dir=None, use_gpu=False, use_mpio=False, **kwargs):
    np.set_printoptions(precision=5, suppress=True)

    startup_prog = fluid.Program()
    test_prog = fluid.Program()
    dataset = SegDataset(
        file_list=cfg.DATASET.VAL_FILE_LIST,
        mode=ModelPhase.EVAL,
        data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        #TODO: check whether the batch reader is compatible with Windows
        if use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        for b in data_gen:
            yield b[0], b[1], b[2]

    data_loader, avg_loss, pred, grts, masks = build_model(
        test_prog, startup_prog, phase=ModelPhase.EVAL, arch=kwargs['arch'])

    data_loader.set_sample_generator(
        data_generator, drop_last=False, batch_size=cfg.BATCH_SIZE)

    # Get device environment
    places = fluid.cuda_places() if use_gpu else fluid.cpu_places()
    place = places[0]
    dev_count = len(places)
    print("#Device count: {}".format(dev_count))

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    test_prog = test_prog.clone(for_test=True)

    ckpt_dir = cfg.TEST.TEST_MODEL if not ckpt_dir else ckpt_dir

    if not os.path.exists(ckpt_dir):
        raise ValueError('The TEST.TEST_MODEL {} is not found'.format(ckpt_dir))

    if ckpt_dir is not None:
        print('load test model:', ckpt_dir)
        try:
            fluid.load(test_prog, os.path.join(ckpt_dir, 'model'), exe)
        except Exception:
            fluid.io.load_params(exe, ckpt_dir, main_program=test_prog)

    # Use streaming confusion matrix to calculate mean_iou
    np.set_printoptions(
        precision=4, suppress=True, linewidth=160, floatmode="fixed")
    conf_mat = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)
    fetch_list = [avg_loss.name, pred.name, grts.name, masks.name]
    num_images = 0
    step = 0
    all_step = cfg.DATASET.TEST_TOTAL_IMAGES // cfg.BATCH_SIZE + 1
    timer = Timer()
    timer.start()
    data_loader.start()
    while True:
        try:
            step += 1
            loss, pred, grts, masks = exe.run(
                test_prog, fetch_list=fetch_list, return_numpy=True)

            loss = np.mean(np.array(loss))

            num_images += pred.shape[0]
            conf_mat.calculate(pred, grts, masks)
            _, iou = conf_mat.mean_iou()
            _, acc = conf_mat.accuracy()

            speed = 1.0 / timer.elapsed_time()

            print(
                "[EVAL]step={} loss={:.5f} acc={:.4f} IoU={:.4f} step/sec={:.2f} | ETA {}"
                .format(step, loss, acc, iou, speed,
                        calculate_eta(all_step - step, speed)))
            timer.restart()
            sys.stdout.flush()
        except fluid.core.EOFException:
            break

    category_iou, avg_iou = conf_mat.mean_iou()
    category_acc, avg_acc = conf_mat.accuracy()
    print("[EVAL]#image={} acc={:.4f} IoU={:.4f}".format(
        num_images, avg_acc, avg_iou))
    print("[EVAL]Category IoU:", category_iou)
    print("[EVAL]Category Acc:", category_acc)
    print("[EVAL]Kappa:{:.4f}".format(conf_mat.kappa()))

    return category_iou, avg_iou, category_acc, avg_acc
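The streaming confusion matrix accumulates pixel counts over batches, and mean IoU and accuracy fall out of the accumulated matrix. A numpy illustration of the standard formulas (not the library's internals):

import numpy as np

cm = np.array([[50, 2, 3],    # rows = ground truth
               [4, 40, 1],    # cols = prediction
               [2, 0, 48]], dtype=float)
tp = np.diag(cm)
iou = tp / (cm.sum(axis=0) + cm.sum(axis=1) - tp)  # per-class IoU
print(iou, iou.mean())                             # category IoU, mean IoU
print(tp.sum() / cm.sum())                         # overall pixel accuracy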
Example #11
def train(cfg):
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    drop_last = True

    dataset = SegDataset(
        file_list=cfg.DATASET.TRAIN_FILE_LIST,
        mode=ModelPhase.TRAIN,
        shuffle=True,
        data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If using the sync batch norm strategy, drop the last batch when the
        # number of samples in batch_data is less than cfg.BATCH_SIZE to avoid
        # NCCL hang issues
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Get number of GPU
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#Device count: {}".format(dev_count))

    # Make sure BATCH_SIZE is divisible by the number of devices
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisible by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))
    # In multi-GPU training mode, batch data is allocated to each GPU evenly
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    print_info("batch_size_per_dev: {}".format(batch_size_per_dev))

    data_loader, avg_loss, lr, pred, grts, masks = build_model(
        train_prog, startup_prog, phase=ModelPhase.TRAIN)
    build_model(test_prog, fluid.Program(), phase=ModelPhase.EVAL)
    data_loader.set_sample_generator(
        data_generator, batch_size=batch_size_per_dev, drop_last=drop_last)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # Clear temporary variables every 100 iterations
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()

    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply sync batch norm strategy
            print_info("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print_info(
                "Sync BatchNorm strategy will not be effective if GPU device"
                " count <= 1")
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, train_prog)
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        load_pretrained_weights(exe, train_prog, cfg.TRAIN.PRETRAINED_MODEL_DIR)
    else:
        print_info(
            'Pretrained model dir {} not exists, training from scratch...'.
            format(cfg.TRAIN.PRETRAINED_MODEL_DIR))

    fetch_list = [avg_loss.name, lr.name]
    if args.debug:
        # Fetch more variable info and use streaming confusion matrix to
        # calculate IoU results if in debug mode
        np.set_printoptions(
            precision=4, suppress=True, linewidth=160, floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    if args.use_vdl:
        if not args.vdl_log_dir:
            print_info("Please specify the log directory by --vdl_log_dir.")
            exit(1)

        from visualdl import LogWriter
        log_writer = LogWriter(args.vdl_log_dir)

    # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    step = 0
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and not drop_last:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    avg_loss = 0.0
    best_mIoU = 0.0

    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError(
            ("begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        data_loader.start()
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # training process matches expectations
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print_info((
                            "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - step, speed)))
                        print_info("Category IoU: ", category_iou)
                        print_info("Category Acc: ", category_acc)
                        if args.use_vdl:
                            log_writer.add_scalar('Train/mean_iou', mean_iou,
                                                  step)
                            log_writer.add_scalar('Train/mean_acc', mean_acc,
                                                  step)
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/step/sec', speed, step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnecessary logging and computation
                    loss, lr = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                        avg_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, speed,
                                 calculate_eta(all_step - step, speed)))
                        if args.use_vdl:
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/speed', speed, step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        timer.restart()

                    # NOTE : used for benchmark, profiler tools
                    if args.is_profiler and epoch == 1 and step == args.log_steps:
                        profiler.start_profiler("All")
                    elif args.is_profiler and epoch == 1 and step == args.log_steps + 5:
                        profiler.stop_profiler("total", args.profiler_path)
                        return

            except fluid.core.EOFException:
                data_loader.reset()
                break
            except Exception as e:
                print(e)

        if (epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0
                or epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(train_prog, epoch)
            save_infer_program(test_prog, ckpt_dir)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(
                    cfg=cfg,
                    ckpt_dir=ckpt_dir,
                    use_gpu=args.use_gpu,
                    use_mpio=args.use_mpio)
                if args.use_vdl:
                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step)
                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step)

                if mean_iou > best_mIoU:
                    best_mIoU = mean_iou
                    update_best_model(ckpt_dir)
                    print_info("Save best model {} to {}, mIoU = {:.4f}".format(
                        ckpt_dir,
                        os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'),
                        mean_iou))

            # Use VisualDL to visualize results
            if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(
                    cfg=cfg,
                    use_gpu=args.use_gpu,
                    vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                    vis_dir="visual",
                    ckpt_dir=ckpt_dir,
                    log_writer=log_writer)

    # save final model
    if cfg.TRAINER_ID == 0:
        ckpt_dir = save_checkpoint(train_prog, 'final')
        save_infer_program(test_prog, ckpt_dir)
Example #12
def evaluate(ckpt_dir=None):
    np.set_printoptions(precision=5, suppress=True)
    startup_prog = fluid.Program()
    test_prog = fluid.Program()
    dataset = SegDataset(file_list=cfg["val_list"],
                         mode=ModelPhase.EVAL,
                         data_dir=cfg["data_dir"])

    def data_generator():
        data_gen = dataset.generator()
        for b in data_gen:
            yield b[0], b[1], b[2]

    data_loader, avg_loss, pred, grts, masks = build_model(
        test_prog, startup_prog, phase=ModelPhase.EVAL)
    data_loader.set_sample_generator(data_generator,
                                     drop_last=False,
                                     batch_size=cfg["batch_size"])

    places = fluid.cuda_places()
    place = places[0]
    dev_count = len(places)
    print("#Device count: {}".format(dev_count))
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    test_prog = test_prog.clone(for_test=True)
    fluid.load(test_prog, os.path.join(ckpt_dir, 'model'), exe)
    #fluid.io.load_params(exe, ckpt_dir, main_program=test_prog)
    np.set_printoptions(precision=4,
                        suppress=True,
                        linewidth=160,
                        floatmode="fixed")
    conf_mat = ConfusionMatrix(20, streaming=True)
    fetch_list = [avg_loss.name, pred.name, grts.name, masks.name]
    num_images = 0
    step = 0
    all_step = cfg["test_images"] // cfg["batch_size"] + 1
    timer = Timer()
    timer.start()
    data_loader.start()
    while True:
        try:
            step += 1
            loss, pred, grts, masks = exe.run(test_prog,
                                              fetch_list=fetch_list,
                                              return_numpy=True)
            loss = np.mean(np.array(loss))
            num_images += pred.shape[0]
            conf_mat.calculate(pred, grts, masks)
            _, iou = conf_mat.mean_iou()
            _, acc = conf_mat.accuracy()
            speed = 1.0 / timer.elapsed_time()
            print(
                "[EVAL]step={} loss={:.5f} acc={:.4f} IoU={:.4f} step/sec={:.2f} | ETA {}"
                .format(step, loss, acc, iou, speed,
                        calculate_eta(all_step - step, speed)))
            timer.restart()
            sys.stdout.flush()
        except fluid.core.EOFException:
            break
    category_iou, avg_iou = conf_mat.mean_iou()
    category_acc, avg_acc = conf_mat.accuracy()
    print("[EVAL]#image={} acc={:.4f} IoU={:.4f}".format(
        num_images, avg_acc, avg_iou))
    print("[EVAL]Category IoU:", category_iou)
    print("[EVAL]Category Acc:", category_acc)
    print("[EVAL]Kappa:{:.4f}".format(conf_mat.kappa()))

    return category_iou, avg_iou, category_acc, avg_acc
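conf_mat.kappa() reports Cohen's kappa, which can be computed from the accumulated matrix; a standalone numpy sketch of the standard formula:

import numpy as np

def kappa_from_cm(cm):
    # Cohen's kappa: (observed agreement - chance agreement) / (1 - chance)
    cm = np.asarray(cm, dtype=float)
    n = cm.sum()
    po = np.trace(cm) / n
    pe = (cm.sum(axis=0) * cm.sum(axis=1)).sum() / n ** 2
    return (po - pe) / (1 - pe)

print(kappa_from_cm([[50, 5], [10, 35]]))  # ~0.694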
Example #13
def evaluate(cfg, ckpt_dir=None, use_gpu=False, vis=False, vis_dir='vis_out/test_public', use_mpio=False, **kwargs):
    np.set_printoptions(precision=5, suppress=True)

    startup_prog = fluid.Program()
    test_prog = fluid.Program()
    dataset = SegDataset(
        file_list=cfg.DATASET.VAL_FILE_LIST,
        mode=ModelPhase.EVAL,
        data_dir=cfg.DATASET.DATA_DIR)

    fls = []
    with open(cfg.DATASET.VAL_FILE_LIST) as fr:
        for line in fr.readlines():
            fls.append(line.strip().split(' ')[0])
    if vis:
        assert cfg.VIS.VISINEVAL is True
        if not os.path.exists(vis_dir):
            os.makedirs(vis_dir)

    def data_generator():
        #TODO: check whether the batch reader is compatible with Windows
        if use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        for b in data_gen:
            if cfg.DATASET.INPUT_IMAGE_NUM == 1:
                yield b[0], b[1], b[2]
            else:
                yield b[0], b[1], b[2], b[3]

    data_loader, avg_loss, pred, grts, masks = build_model(
        test_prog, startup_prog, phase=ModelPhase.EVAL)

    data_loader.set_sample_generator(
        data_generator, drop_last=False, batch_size=cfg.BATCH_SIZE)

    # Get device environment
    places = fluid.cuda_places() if use_gpu else fluid.cpu_places()
    place = places[0]
    dev_count = len(places)
    print("#Device count: {}".format(dev_count))

    exe = fluid.Executor(place)
    exe.run(startup_prog)
    test_prog = test_prog.clone(for_test=True)

    ckpt_dir = cfg.TEST.TEST_MODEL if not ckpt_dir else ckpt_dir

    if not os.path.exists(ckpt_dir):
        raise ValueError('The TEST.TEST_MODEL {} is not found'.format(ckpt_dir))

    if ckpt_dir is not None:
        print('load test model:', ckpt_dir)
        try:
            fluid.load(test_prog, os.path.join(ckpt_dir, 'model'), exe)
        except Exception:
            fluid.io.load_params(exe, ckpt_dir, main_program=test_prog)

    # Use streaming confusion matrix to calculate mean_iou
    np.set_printoptions(
        precision=4, suppress=True, linewidth=160, floatmode="fixed")
    class_num = cfg.DATASET.NUM_CLASSES
    conf_mat = ConfusionMatrix(class_num, streaming=True)
    fetch_list = [avg_loss.name, pred.name, grts.name, masks.name]
    num_images = 0
    step = 0
    all_step = cfg.DATASET.TEST_TOTAL_IMAGES // cfg.BATCH_SIZE + 1
    timer = Timer()
    timer.start()
    data_loader.start()
    cnt = 0
    while True:
        try:
            step += 1
            loss, pred, grts, masks = exe.run(
                test_prog, fetch_list=fetch_list, return_numpy=True)
            if vis:
                preds = np.array(pred, dtype=np.float32)
                for j in range(preds.shape[0]):
                    if cnt >= len(fls):  # don't index past the file list
                        continue
                    name = fls[cnt].split('/')[-1].split('.')[0]
                    p = np.squeeze(preds[j])
                    np.save(os.path.join(vis_dir, name + '.npy'), p)
                    cnt += 1
                print('vis %d npy... (%d tif sample)' % (cnt, cnt//36))
                continue

            loss = np.mean(np.array(loss))

            num_images += pred.shape[0]
            conf_mat.calculate(pred, grts, masks)
            _, iou = conf_mat.mean_iou()
            _, acc = conf_mat.accuracy()
            fwiou = conf_mat.frequency_weighted_iou()

            speed = 1.0 / timer.elapsed_time()

            print(
                "[EVAL]step={} loss={:.5f} acc={:.4f} IoU={:.4f} FWIoU={:.4f} step/sec={:.2f} | ETA {}"
                .format(step, loss, acc, iou, fwiou, speed, calculate_eta(all_step - step, speed)))
            timer.restart()
            sys.stdout.flush()
        except fluid.core.EOFException:
            break

    if vis:
        return

    category_iou, avg_iou = conf_mat.mean_iou()
    category_acc, avg_acc = conf_mat.accuracy()
    fwiou = conf_mat.frequency_weighted_iou()
    print("[EVAL]#image={} acc={:.4f} IoU={:.4f} FWIoU={:.4f}".format(
        num_images, avg_acc, avg_iou, fwiou))
    print("[EVAL]Category Acc:", category_acc)
    print("[EVAL]Category IoU:", category_iou)
    print("[EVAL]Kappa: {:.4f}".format(conf_mat.kappa()))

    return category_iou, avg_iou, category_acc, avg_acc
Example #14
from unet_rcnn import UnetRCNN
from metrics import Accuracy, Precision, Recall, AvgMeterWrapper, ConfusionMatrix, MetricWrapper, Auc, MultiLabelIoU, MultiLabelAccuracy
from loss import RgLoss
from dataset import RgLoader
__all__ = ['model_dict', 'loader_dict', 'loss_dict', 'metric_dict']

model_dict = {
    'UnetRCNN': UnetRCNN,
}

loader_dict = {'RgLoader': RgLoader}

loss_dict = {'RgLoss': RgLoss}

metric_dict = {
    'cls': {
        'Accuracy': AvgMeterWrapper(Accuracy()),
        'Specificity & Recall': AvgMeterWrapper(Recall()),
        'NPV & Precision': AvgMeterWrapper(Precision()),
        'AUC': MetricWrapper(metric=Auc(), idx=1),
        'Confusion_Matrix': ConfusionMatrix(2)
    },
    'seg': {
        'IoU': AvgMeterWrapper(MultiLabelIoU())
    },
    'kf': {
        'Localization_Accuracy': AvgMeterWrapper(MultiLabelAccuracy())
    },
}
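These dicts act as name-to-component registries so a config can select implementations by string; a hypothetical lookup (the config keys are assumptions):

config = {'model': 'UnetRCNN', 'loader': 'RgLoader', 'loss': 'RgLoss'}
model_cls = model_dict[config['model']]
loader_cls = loader_dict[config['loader']]
loss_cls = loss_dict[config['loss']]
cls_metrics = metric_dict['cls']   # e.g. cls_metrics['Confusion_Matrix']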
Example #15
def predict_worker(proc_id,
                   output_file,
                   classes,
                   model_params,
                   batch_size,
                   que,
                   lock,
                   status_que,
                   gpu_id=0,
                   evaluate=True,
                   framework='mxnet'):
    """ get data from batch loader and make predictions, predictions will be saved in output_file
        if evaluate, will evaluate recall, precision, f1_score and recall_top5 """

    logging.info('Predictor #{}: Loading model...'.format(proc_id))
    model = load_model(proc_id,
                       model_params,
                       batch_size,
                       classes,
                       gpu_id,
                       framework=framework)
    if model is None:
        status_que.put('Error')
        raise ValueError('No model created! Exit')
    logging.info('Predictor #{}: Model loaded'.format(proc_id))
    status_que.put('OK')

    if evaluate:
        from metrics import F1, ConfusionMatrix, MisClassified, RecallTopK
        evaluator = F1(len(classes))
        misclassified = MisClassified(len(classes))
        cm = ConfusionMatrix(classes)
        recall_topk = RecallTopK(len(classes), top_k=5)

    f = open(output_file, 'w')
    batch_idx = 0
    logging.info('Predictor #{} starts'.format(proc_id))
    start = time.time()
    while True:
        # get a batch from data loader via a queue
        lock.acquire()
        batch = que.get()
        lock.release()
        if batch == 'FINISH':
            logging.info(
                'Predictor #{} has received all batches, exit'.format(proc_id))
            break

        # predict
        im_names, batch, gt_list = batch
        logging.debug('Predictor #{}: predict'.format(proc_id))
        pred, prob = model.predict(batch)
        pred_labels, top_probs = model.get_label_prob(top_k=5)

        # write prediction to file
        for im_name, label, top_prob in zip(im_names, pred_labels, top_probs):
            if im_name is None:
                continue
            top_prob = [str(p) for p in top_prob]
            f.write('{} labels:{} prob:{}\n'.format(im_name, ','.join(label),
                                                    ','.join(top_prob)))

        # update metrics if evaluation mode is set
        if evaluate:
            assert (gt_list is not None and gt_list != []
                    and gt_list[0] is not None)
            top1_int = [p[0] for p in pred]
            assert len(top1_int) == len(gt_list), '{} != {}'.format(
                len(top1_int), len(gt_list))
            evaluator.update(top1_int, gt_list)
            misclassified.update(top1_int, gt_list, prob, im_names)
            cm.update(top1_int, gt_list)

            top5_int = [p[:5] for p in pred]
            assert len(top5_int) == len(gt_list), '{} != {}'.format(
                len(top5_int), len(gt_list))
            recall_topk.update(top5_int, gt_list)

        batch_idx += 1
        if batch_idx % 50 == 0 and batch_idx != 0:
            elapsed = time.time() - start
            logging.info(
                'Predictor #{}: Tested {} batches of {} images, elapsed {}s'.
                format(proc_id, batch_idx, batch_size, elapsed))

    # evaluation after prediction if set
    if evaluate:
        logging.info('Evaluating...')
        recall, precision, f1_score = evaluator.get()
        for rec, prec, f1, cls in zip(recall, precision, f1_score, classes):
            print(
                'Class {:<20}: recall: {:<12}, precision: {:<12}, f1 score: {:<12}'
                .format(cls, rec, prec, f1))
            f.write(
                'Class {:<20}: recall: {:<12}, precision: {:<12}, f1 score: {:<12}\n'
                .format(cls, rec, prec, f1))
        topk_recall = recall_topk.get()
        for rec, cls in zip(topk_recall, classes):
            print('Class {:<20}: recall-top-5: {:<12}'.format(cls, rec))
            f.write('Class {:<20}: recall-top-5: {:<12}\n'.format(cls, rec))

        fp_images, fn_images = misclassified.get()
        g = open(output_file + '.fp', 'w')
        for cls, fp_cls in zip(classes, fp_images):
            for fp in fp_cls:
                g.write('{} pred:{} prob:{} gt:{} prob:{}\n'.format(
                    fp[0], cls, fp[2], classes[fp[1]], fp[3]))
        g.close()
        g = open(output_file + '.fn', 'w')
        for cls, fn_cls in zip(classes, fn_images):
            for fn in fn_cls:
                g.write('{} gt:{} prob:{} pred:{} prob:{}\n'.format(
                    fn[0], cls, fn[3], classes[fn[1]], fn[2]))
        g.close()

        cm.normalize()
        plt_name = output_file + '_cm.jpg'
        cm.draw(plt_name)
    f.close()
Example #16
def train(cfg):
    # startup_prog = fluid.Program()
    # train_prog = fluid.Program()

    drop_last = True

    dataset = SegDataset(
        file_list=cfg.DATASET.TRAIN_FILE_LIST,
        mode=ModelPhase.TRAIN,
        shuffle=True,
        data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If using the sync batch norm strategy, drop the last batch when the
        # number of samples in batch_data is less than cfg.BATCH_SIZE to avoid
        # NCCL hang issues
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    # places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    # place = places[0]
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Get number of GPU
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#Device count: {}".format(dev_count))

    # Make sure BATCH_SIZE is divisible by the number of devices
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisible by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))
    # In multi-GPU training mode, batch data is allocated to each GPU evenly
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    print_info("batch_size_per_dev: {}".format(batch_size_per_dev))

    data_loader, loss, lr, pred, grts, masks, image = build_model(
        phase=ModelPhase.TRAIN)
    data_loader.set_sample_generator(
        data_generator, batch_size=batch_size_per_dev, drop_last=drop_last)

    exe = fluid.Executor(place)

    cfg.update_from_file(args.teacher_cfg_file)
    # teacher_arch = teacher_cfg.architecture
    teacher_program = fluid.Program()
    teacher_startup_program = fluid.Program()

    with fluid.program_guard(teacher_program, teacher_startup_program):
        with fluid.unique_name.guard():
            _, teacher_loss, _, _, _, _, _ = build_model(
                teacher_program,
                teacher_startup_program,
                phase=ModelPhase.TRAIN,
                image=image,
                label=grts,
                mask=masks)

    exe.run(teacher_startup_program)

    teacher_program = teacher_program.clone(for_test=True)
    ckpt_dir = cfg.SLIM.KNOWLEDGE_DISTILL_TEACHER_MODEL_DIR
    assert ckpt_dir is not None
    print('load teacher model:', ckpt_dir)
    if os.path.exists(ckpt_dir):
        try:
            fluid.load(teacher_program, os.path.join(ckpt_dir, 'model'), exe)
        except Exception:
            fluid.io.load_params(exe, ckpt_dir, main_program=teacher_program)

    # cfg = load_config(FLAGS.config)
    cfg.update_from_file(args.cfg_file)
    data_name_map = {
        'image': 'image',
        'label': 'label',
        'mask': 'mask',
    }
    merge(teacher_program, fluid.default_main_program(), data_name_map, place)
    distill_pairs = [[
        'teacher_bilinear_interp_2.tmp_0', 'bilinear_interp_0.tmp_0'
    ]]

    def distill(pairs, weight):
        """
        Compute a weighted L2 distillation loss between one pair of teacher
        and student feature maps (the bilinear_interp outputs paired above)
        """
        loss = l2_loss(pairs[0][0], pairs[0][1])
        weighted_loss = loss * weight
        return weighted_loss

    distill_loss = distill(distill_pairs, 0.1)
    cfg.update_from_file(args.cfg_file)
    optimizer = solver.Solver(None, None)
    all_loss = loss + distill_loss
    lr = optimizer.optimise(all_loss)

    exe.run(fluid.default_startup_program())

    exec_strategy = fluid.ExecutionStrategy()
    # Clear temporary variables every 100 iterations
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False
    build_strategy.fuse_all_optimizer_ops = False
    build_strategy.fuse_elewise_add_act_ops = True
    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy,
                                             fluid.default_main_program())
        exec_strategy.num_threads = 1

    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply sync batch norm strategy
            print_info("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print_info(
                "Sync BatchNorm strategy will not be effective if GPU device"
                " count <= 1")
    compiled_train_prog = fluid.CompiledProgram(
        fluid.default_main_program()).with_data_parallel(
            loss_name=all_loss.name,
            exec_strategy=exec_strategy,
            build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, fluid.default_main_program())
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        load_pretrained_weights(exe, fluid.default_main_program(),
                                cfg.TRAIN.PRETRAINED_MODEL_DIR)
    else:
        print_info(
            'Pretrained model dir {} does not exist; training from scratch...'.
            format(cfg.TRAIN.PRETRAINED_MODEL_DIR))

    #fetch_list = [avg_loss.name, lr.name]
    fetch_list = [
        loss.name, 'teacher_' + teacher_loss.name, distill_loss.name, lr.name
    ]

    if args.debug:
        # Fetch more variable info and use streaming confusion matrix to
        # calculate IoU results if in debug mode
        np.set_printoptions(
            precision=4, suppress=True, linewidth=160, floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    if args.use_vdl:
        if not args.vdl_log_dir:
            print_info("Please specify the log directory by --vdl_log_dir.")
            exit(1)

        from visualdl import LogWriter
        log_writer = LogWriter(args.vdl_log_dir)

    # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    step = 0
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and not drop_last:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    avg_loss = 0.0
    avg_t_loss = 0.0
    avg_d_loss = 0.0
    best_mIoU = 0.0

    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError(
            ("begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        data_loader.start()
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # training process matches expectations
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print_info((
                            "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - step, speed)))
                        print_info("Category IoU: ", category_iou)
                        print_info("Category Acc: ", category_acc)
                        if args.use_vdl:
                            log_writer.add_scalar('Train/mean_iou', mean_iou,
                                                  step)
                            log_writer.add_scalar('Train/mean_acc', mean_acc,
                                                  step)
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/step/sec', speed, step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnecessary logging and computation
                    loss, t_loss, d_loss, lr = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    avg_t_loss += np.mean(np.array(t_loss))
                    avg_d_loss += np.mean(np.array(d_loss))
                    step += 1

                    if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                        avg_loss /= args.log_steps
                        avg_t_loss /= args.log_steps
                        avg_d_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} teacher loss={:.4f} distill loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, avg_t_loss,
                                 avg_d_loss, speed,
                                 calculate_eta(all_step - step, speed)))
                        if args.use_vdl:
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/speed', speed, step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        avg_t_loss = 0.0
                        avg_d_loss = 0.0
                        timer.restart()

            except fluid.core.EOFException:
                data_loader.reset()
                break
            except Exception as e:
                print(e)

        if (epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0
                or epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(fluid.default_main_program(), epoch)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(
                    cfg=cfg,
                    ckpt_dir=ckpt_dir,
                    use_gpu=args.use_gpu,
                    use_mpio=args.use_mpio)
                if args.use_vdl:
                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step)
                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step)

                if mean_iou > best_mIoU:
                    best_mIoU = mean_iou
                    update_best_model(ckpt_dir)
                    print_info("Save best model {} to {}, mIoU = {:.4f}".format(
                        ckpt_dir,
                        os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'),
                        mean_iou))

            # Use VisualDL to visualize results
            if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(
                    cfg=cfg,
                    use_gpu=args.use_gpu,
                    vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                    vis_dir="visual",
                    ckpt_dir=ckpt_dir,
                    log_writer=log_writer)
        if cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(fluid.default_main_program(), epoch)

    # save final model
    if cfg.TRAINER_ID == 0:
        save_checkpoint(fluid.default_main_program(), 'final')
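Stripped of the fluid plumbing, the distillation objective in this example is a weighted L2 penalty between one teacher feature map and the matching student feature map, added to the student's ordinary task loss. A numpy sketch of that arithmetic (shapes and values are illustrative only):

import numpy as np

def l2_distill_loss(teacher_feat, student_feat, weight=0.1):
    # mean squared difference between matching feature maps
    return weight * np.mean((teacher_feat - student_feat) ** 2)

# stand-ins for the paired bilinear_interp outputs (NCHW)
t = np.random.rand(2, 19, 64, 64).astype(np.float32)
s = np.random.rand(2, 19, 64, 64).astype(np.float32)
task_loss = 0.8  # stand-in for the student's segmentation loss
total_loss = task_loss + l2_distill_loss(t, s, weight=0.1)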
Example #17
def train(hparams, num_epoch, tuning):

    log_dir = './results/'
    test_batch_size = 8
    # Load dataset
    training_set, valid_set = make_dataset(BATCH_SIZE=hparams['HP_BS'],
                                           file_name='train_tf_record',
                                           split=True)
    test_set = make_dataset(BATCH_SIZE=test_batch_size,
                            file_name='test_tf_record',
                            split=False)
    class_names = ['NRDR', 'RDR']

    # Model
    model = ResNet()

    # set optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=hparams['HP_LR'])
    # set metrics
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
    valid_accuracy = tf.keras.metrics.Accuracy()
    valid_con_mat = ConfusionMatrix(num_class=2)
    test_accuracy = tf.keras.metrics.Accuracy()
    test_con_mat = ConfusionMatrix(num_class=2)

    # Save Checkpoint
    if not tuning:
        ckpt = tf.train.Checkpoint(step=tf.Variable(1),
                                   optimizer=optimizer,
                                   net=model)
        manager = tf.train.CheckpointManager(ckpt, './tf_ckpts', max_to_keep=5)

    # Set up summary writers
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tb_log_dir = log_dir + current_time + '/train'
    summary_writer = tf.summary.create_file_writer(tb_log_dir)

    # Restore Checkpoint
    if not tuning:
        ckpt.restore(manager.latest_checkpoint)
        if manager.latest_checkpoint:
            logging.info('Restored from {}'.format(manager.latest_checkpoint))
        else:
            logging.info('Initializing from scratch.')

    @tf.function
    def train_step(train_img, train_label):
        # Optimize the model
        loss_value, grads = grad(model, train_img, train_label)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        train_pred, _ = model(train_img)
        train_label = tf.expand_dims(train_label, axis=1)
        train_accuracy.update_state(train_label, train_pred)

    for epoch in range(num_epoch):

        begin = time()

        # Training loop
        for train_img, train_label, train_name in training_set:
            train_img = data_augmentation(train_img)
            train_step(train_img, train_label)

        with summary_writer.as_default():
            tf.summary.scalar('Train Accuracy',
                              train_accuracy.result(),
                              step=epoch)

        for valid_img, valid_label, _ in valid_set:
            valid_img = tf.cast(valid_img, tf.float32)
            valid_img = valid_img / 255.0
            valid_pred, _ = model(valid_img, training=False)
            valid_pred = tf.cast(tf.argmax(valid_pred, axis=1), dtype=tf.int64)
            valid_con_mat.update_state(valid_label, valid_pred)
            valid_accuracy.update_state(valid_label, valid_pred)

        # Log the confusion matrix as an image summary
        cm_valid = valid_con_mat.result()
        figure = plot_confusion_matrix(cm_valid, class_names=class_names)
        cm_valid_image = plot_to_image(figure)

        with summary_writer.as_default():
            tf.summary.scalar('Valid Accuracy',
                              valid_accuracy.result(),
                              step=epoch)
            tf.summary.image('Valid ConfusionMatrix',
                             cm_valid_image,
                             step=epoch)

        end = time()
        logging.info(
            "Epoch {:d} Training Accuracy: {:.3%} Validation Accuracy: {:.3%} Time:{:.5}s"
            .format(epoch + 1, train_accuracy.result(),
                    valid_accuracy.result(), (end - begin)))
        train_accuracy.reset_states()
        valid_accuracy.reset_states()
        valid_con_mat.reset_states()
        if not tuning:
            if int(ckpt.step) % 5 == 0:
                save_path = manager.save()
                logging.info('Saved checkpoint for epoch {}: {}'.format(
                    int(ckpt.step), save_path))
            ckpt.step.assign_add(1)

    for test_img, test_label, _ in test_set:
        test_img = tf.cast(test_img, tf.float32)
        test_img = test_img / 255.0
        test_pred, _ = model(test_img, training=False)
        test_pred = tf.cast(tf.argmax(test_pred, axis=1), dtype=tf.int64)
        test_accuracy.update_state(test_label, test_pred)
        test_con_mat.update_state(test_label, test_pred)

    cm_test = test_con_mat.result()
    # Log the confusion matrix as an image summary
    figure = plot_confusion_matrix(cm_test, class_names=class_names)
    cm_test_image = plot_to_image(figure)
    with summary_writer.as_default():
        tf.summary.scalar('Test Accuracy', test_accuracy.result(), step=epoch)
        tf.summary.image('Test ConfusionMatrix', cm_test_image, step=epoch)

    logging.info("Trained finished. Final Accuracy in test set: {:.3%}".format(
        test_accuracy.result()))

    # Visualization
    if not tuning:
        for vis_img, vis_label, vis_name in test_set:
            vis_label = vis_label[0]
            vis_name = vis_name[0]
            vis_img = tf.cast(vis_img[0], tf.float32)
            vis_img = tf.expand_dims(vis_img, axis=0)
            vis_img = vis_img / 255.0
            with tf.GradientTape() as tape:
                vis_pred, conv_output = model(vis_img, training=False)
                pred_label = tf.argmax(vis_pred, axis=-1)
                vis_pred = tf.reduce_max(vis_pred, axis=-1)
                grad_1 = tape.gradient(vis_pred, conv_output)
                weight = tf.reduce_mean(grad_1, axis=[1, 2]) / grad_1.shape[1]
                act_map0 = tf.nn.relu(
                    tf.reduce_sum(weight * conv_output, axis=-1))
                act_map0 = tf.squeeze(tf.image.resize(tf.expand_dims(act_map0,
                                                                     axis=-1),
                                                      (256, 256),
                                                      antialias=True),
                                      axis=-1)
                plot_map(vis_img, act_map0, vis_pred, pred_label, vis_label,
                         vis_name)
            break

    return test_accuracy.result()
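The visualization loop above is a Grad-CAM-style computation: gradients of the winning class score with respect to the last convolutional output are pooled into per-channel weights, and the ReLU of the weighted channel sum becomes the activation map. The same idea in isolation, as a numpy sketch with assumed (H, W, C) inputs:

import numpy as np

def grad_cam(conv_out, grads):
    # conv_out, grads: (H, W, C) activations and d(score)/d(activation)
    weights = grads.mean(axis=(0, 1))                          # one weight per channel
    cam = np.maximum((conv_out * weights).sum(axis=-1), 0.0)   # weighted sum + ReLU
    return cam / (cam.max() + 1e-12)                           # normalize to [0, 1]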
Example #18
                                 trees_number,
                                 m=attributes_in_division,
                                 pool=pool,
                                 seed=seed)
        end = time.time()

        if verbose > 1:
            print("=" * 50)
            print("Forest {} generation time: {}s".format(
                i + 1, "{0:.3f}".format(end - start)))

        results = forest.predict_df(test)
        total_results.append(results)

        if verbose > 1:
            confusion_matrix = ConfusionMatrix(results)
            confusion_matrix.show(verbose=(verbose > 2))

    final_confusion_matrix = ConfusionMatrix(pd.concat(total_results))
    total_end = time.time()

    print("=" * 50)
    print(f"Results for {data_path.replace('.csv', '')}:")
    print(
        f"Params: k_folds: {k_folds_number}; ntree: {trees_number}; m: {attributes_in_division}; seed: {seed}"
    )

    final_confusion_matrix.show(verbose=(verbose > 0))
    execution_time = total_end - total_start
    print(f"Total processing time: {execution_time:0.3f}s")
Example #19
def train(cfg):
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    drop_last = True

    dataset = SegDataset(file_list=cfg.DATASET.TRAIN_FILE_LIST,
                         mode=ModelPhase.TRAIN,
                         shuffle=True,
                         data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If the sync batch norm strategy is used, drop the last batch when the
        # number of samples in batch_data is less than cfg.BATCH_SIZE, to avoid
        # NCCL hang issues
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    # places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    # place = places[0]
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Get the number of GPUs
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#Device count: {}".format(dev_count))

    # Make sure BATCH_SIZE is divisible by the number of GPU cards
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisible by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))
    # In multi-GPU training mode, batch data is allocated evenly to each GPU
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    print_info("batch_size_per_dev: {}".format(batch_size_per_dev))

    py_reader, avg_loss, lr, pred, grts, masks = build_model(
        train_prog, startup_prog, phase=ModelPhase.TRAIN)
    py_reader.decorate_sample_generator(data_generator,
                                        batch_size=batch_size_per_dev,
                                        drop_last=drop_last)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # Clear temporary variables every 100 iterations
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()

    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply sync batch norm strategy
            print_info("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print_info(
                "Sync BatchNorm strategy will not be effective if GPU device"
                " count <= 1")
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, train_prog)
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        print_info('Pretrained model dir: ', cfg.TRAIN.PRETRAINED_MODEL_DIR)
        load_vars = []
        load_fail_vars = []

        def var_shape_matched(var, shape):
            """
            Check whether the persistable variable's shape matches the current network
            """
            var_exist = os.path.exists(
                os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name))
            if var_exist:
                var_shape = parse_shape_from_file(
                    os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name))
                return var_shape == shape
            return False

        for x in train_prog.list_vars():
            if isinstance(x, fluid.framework.Parameter):
                shape = tuple(fluid.global_scope().find_var(
                    x.name).get_tensor().shape())
                if var_shape_matched(x, shape):
                    load_vars.append(x)
                else:
                    load_fail_vars.append(x)

        fluid.io.load_vars(exe,
                           dirname=cfg.TRAIN.PRETRAINED_MODEL_DIR,
                           vars=load_vars)
        for var in load_vars:
            print_info("Parameter[{}] loaded sucessfully!".format(var.name))
        for var in load_fail_vars:
            print_info(
                "Parameter[{}] does not exist or its shape does not match the"
                " current network; skipping it.".format(var.name))
        print_info("{}/{} pretrained parameters loaded successfully!".format(
            len(load_vars),
            len(load_vars) + len(load_fail_vars)))
    else:
        print_info(
            'Pretrained model dir {} does not exist; training from scratch...'.
            format(cfg.TRAIN.PRETRAINED_MODEL_DIR))

    fetch_list = [avg_loss.name, lr.name]
    if args.debug:
        # Fetch more variable info and use streaming confusion matrix to
        # calculate IoU results if in debug mode
        np.set_printoptions(precision=4,
                            suppress=True,
                            linewidth=160,
                            floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    if args.use_tb:
        if not args.tb_log_dir:
            print_info("Please specify the log directory by --tb_log_dir.")
            exit(1)

        from tb_paddle import SummaryWriter
        log_writer = SummaryWriter(args.tb_log_dir)

    # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    global_step = 0
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and not drop_last:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    avg_loss = 0.0
    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError((
            "begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        py_reader.start()
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # training process matches expectations
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    global_step += 1

                    if global_step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print_info((
                            "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, global_step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - global_step, speed)))
                        print_info("Category IoU: ", category_iou)
                        print_info("Category Acc: ", category_acc)
                        if args.use_tb:
                            log_writer.add_scalar('Train/mean_iou', mean_iou,
                                                  global_step)
                            log_writer.add_scalar('Train/mean_acc', mean_acc,
                                                  global_step)
                            log_writer.add_scalar('Train/loss', avg_loss,
                                                  global_step)
                            log_writer.add_scalar('Train/lr', lr[0],
                                                  global_step)
                            log_writer.add_scalar('Train/step/sec', speed,
                                                  global_step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnecessary logging and computation
                    loss, lr = exe.run(program=compiled_train_prog,
                                       fetch_list=fetch_list,
                                       return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    global_step += 1

                    if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                        avg_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, global_step, lr[0], avg_loss, speed,
                                 calculate_eta(all_step - global_step, speed)))
                        if args.use_tb:
                            log_writer.add_scalar('Train/loss', avg_loss,
                                                  global_step)
                            log_writer.add_scalar('Train/lr', lr[0],
                                                  global_step)
                            log_writer.add_scalar('Train/speed', speed,
                                                  global_step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        timer.restart()

            except fluid.core.EOFException:
                py_reader.reset()
                break
            except Exception as e:
                print(e)

        if epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0 and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(exe, train_prog, epoch)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(cfg=cfg,
                                                    ckpt_dir=ckpt_dir,
                                                    use_gpu=args.use_gpu,
                                                    use_mpio=args.use_mpio)
                if args.use_tb:
                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou,
                                          global_step)
                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc,
                                          global_step)

            # Use Tensorboard to visualize results
            if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(cfg=cfg,
                          use_gpu=args.use_gpu,
                          vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                          vis_dir="visual",
                          ckpt_dir=ckpt_dir,
                          log_writer=log_writer)

    # save final model
    if cfg.TRAINER_ID == 0:
        save_checkpoint(exe, train_prog, 'final')
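calculate_eta in the logging code above turns the remaining step count and the measured steps/sec into a human-readable ETA. A plausible implementation, assuming an h:mm:ss string is wanted (the real helper may format differently):

def calculate_eta(remaining_steps, speed):
    # speed is steps per second; guard against division by zero
    seconds = int(remaining_steps / max(speed, 1e-12))
    h, rem = divmod(seconds, 3600)
    m, s = divmod(rem, 60)
    return '{}:{:02d}:{:02d}'.format(h, m, s)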
Example #20
        torch.load(
            "/home/chenwy/GV/suvr/jobs/saved_models/clf_ad1nl0_mri50_hippo30_lrflip_lenet_10.17.18.focal5.lr1e4.best.pth"
        ))

criterion = FocalLoss(gamma=5)
# criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(),
                             lr=learning_rate,
                             weight_decay=1e-5)

if not evaluation:
    writer = SummaryWriter(log_dir=os.path.join(log_path, task_name))
    f_log = open(log_path + task_name + ".log", 'w')

# metrics = ConfusionMatrix(3)
metrics = ConfusionMatrix(2)
best_pred = 0
best_pred_acc = None
for epoch in range(num_epochs):
    classifier.train()
    # The latest model uses the setting below: mri + age + gender + edu + apoe as input
    for i_batch, sample_batched in enumerate(tqdm(dataloader_train)):
        if evaluation: break
        images, lefts, rights, ages, genders, edus, apoes, labels = Variable(
            sample_batched['mri']).cuda(), Variable(
                sample_batched['left']).cuda(), Variable(
                    sample_batched['right']).cuda(), Variable(
                        sample_batched['age']).cuda(), Variable(
                            sample_batched['gender']).cuda(), Variable(
                                sample_batched['edu']).cuda(), Variable(
                                    sample_batched['apoe']).cuda(), Variable(
                                        sample_batched['label']).cuda()  # 'label' assumed; the source is truncated here
Example #21
def main():
    '''
    Use this script to run experiments and fine-tune the algorithms
    '''

    # Load the dataset
    file_name = sys.argv[1]
    dirname = os.path.dirname(__file__)
    file_name = os.path.join(dirname, file_name)

    d = DataSet(file_name)
    d.loadDataSet()

    # Remove useless features (not numeric + bad regressors).
    to_remove = [
        d.data_set[0].index('Index'),
        d.data_set[0].index('First Name'),
        d.data_set[0].index('Last Name'),
        d.data_set[0].index('Birthday'),
        d.data_set[0].index('Best Hand'),
        d.data_set[0].index('Hogwarts House'),

        # Tests 7/10/18
        d.data_set[0].index('Arithmancy'),
        d.data_set[0].index('Defense Against the Dark Arts'),
        d.data_set[0].index('Divination'),
        d.data_set[0].index('Muggle Studies'),
        d.data_set[0].index('History of Magic'),
        d.data_set[0].index('Transfiguration'),
        d.data_set[0].index('Potions'),
        d.data_set[0].index('Care of Magical Creatures'),
        d.data_set[0].index('Charms'),
        d.data_set[0].index('Flying'),
    ]

    X = np.array([[
        d.data_set[i][j] for j in range(len(d.data_set[0]))
        if j not in to_remove
    ] for i in range(len(d.data_set))])
    X = convert_to_float(X[1:, ])

    y_col_nb = d.data_set[0].index('Hogwarts House')
    y = np.array(d.extractColumn(y_col_nb)[1:])

    # Impute missing values
    m = MeanImputation(X)
    m.train()
    m.transform()

    # Scale the variables
    sc = Scaling(X)
    sc.train()
    sc.transform()

    # Split the dataset in a training and testing set
    sp = SplitTrainTest(X, y)
    sp.Split()
    X_train = sp.X_train
    y_train = sp.y_train
    X_test = sp.X_test
    y_test = sp.y_test

    # Train a logistic regression model
    l = LogisticRegression(X=X_train, y=y_train)
    l.train()

    # Compute the confusion matrix over the training set
    y_predicted = l.predict()

    cm1 = ConfusionMatrix(y_train, y_predicted)
    cm1.getMatrix()
    print('\n\n')
    print(
        '**************** Confusion Matrix on the training set ****************'
    )
    print('\n')
    cm1.Print()

    # Compute the confusion matrix over the testing set
    y_predicted = l.predict(X_test)

    cm2 = ConfusionMatrix(y_test, y_predicted, cm1.unique_labels)
    cm2.getMatrix()
    print('\n\n')
    print(
        '**************** Confusion Matrix on the testing set ****************'
    )
    print('\n')
    cm2.Print()
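cm1.unique_labels is handed to the second ConfusionMatrix so both matrices share one row/column ordering. The counting logic at the heart of such a class amounts to a few lines of numpy; this sketch is illustrative, not the class used above:

import numpy as np

def confusion_matrix(y_true, y_pred, labels=None):
    labels = list(labels) if labels is not None else sorted(set(y_true) | set(y_pred))
    index = {lab: i for i, lab in enumerate(labels)}
    mat = np.zeros((len(labels), len(labels)), dtype=int)
    for t, p in zip(y_true, y_pred):
        mat[index[t], index[p]] += 1  # rows: ground truth, cols: prediction
    return mat, labels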
Example #22
def train(unit, dropout, learning_rate, num_epoch, tuning=True):

    num_epoch = int(num_epoch)
    log_dir = './results/'

    # Load dataset
    path = os.getcwd()
    train_file = path + '/hapt_tfrecords/hapt_train.tfrecords'
    val_file = path + '/hapt_tfrecords/hapt_val.tfrecords'
    test_file = path + '/hapt_tfrecords/hapt_test.tfrecords'

    train_dataset = make_dataset(train_file, overlap=True)
    val_dataset = make_dataset(val_file)
    test_dataset = make_dataset(test_file)
    class_names = [
        'WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING',
        'STANDING', 'LAYING', 'STAND_TO_SIT', 'SIT_TO_STAND', 'SIT_TO_LIE',
        'LIE_TO_SIT', 'STAND_TO_LIE', 'LIE_TO_STAND'
    ]

    # Set a random batch index to visualize a result from the test dataset.
    len_test = len(list(test_dataset))
    show_index = random.randint(10, len_test)

    # Model
    model = Lstm(unit=unit, drop_out=dropout)

    # set optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # set Metrics
    train_accuracy = tf.keras.metrics.CategoricalAccuracy()
    val_accuracy = tf.keras.metrics.Accuracy()
    val_con_mat = ConfusionMatrix(num_class=13)
    test_accuracy = tf.keras.metrics.Accuracy()
    test_con_mat = ConfusionMatrix(num_class=13)

    # Save Checkpoint
    if not tuning:
        ckpt = tf.train.Checkpoint(step=tf.Variable(1),
                                   optimizer=optimizer,
                                   net=model)
        manager = tf.train.CheckpointManager(ckpt, './tf_ckpts', max_to_keep=5)

    # Set up summary writers
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tb_log_dir = log_dir + current_time
    summary_writer = tf.summary.create_file_writer(tb_log_dir)

    # Restore Checkpoint
    if not tuning:
        ckpt.restore(manager.latest_checkpoint)
        if manager.latest_checkpoint:
            logging.info('Restored from {}'.format(manager.latest_checkpoint))
        else:
            logging.info('Initializing from scratch.')

    # calculate losses, update network and metrics.
    @tf.function
    def train_step(inputs, label):
        # Optimize the model
        loss_value, grads = grad(model, inputs, label)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        train_pred = model(inputs, training=True)
        train_pred = tf.squeeze(train_pred)
        label = tf.squeeze(label)
        train_accuracy.update_state(label,
                                    train_pred,
                                    sample_weight=sample_weight)

    for epoch in range(num_epoch):
        begin = time()

        # Training loop
        for exp_num, index, label, train_inputs in train_dataset:
            train_inputs = tf.expand_dims(train_inputs, axis=0)
            # One-hot encoding is applied.
            label = label - 1
            sample_weight = tf.cast(tf.math.not_equal(label, -1), tf.int64)
            label = tf.expand_dims(tf.one_hot(label, depth=12), axis=0)
            train_step(train_inputs, label)

        for exp_num, index, label, val_inputs in val_dataset:
            val_inputs = tf.expand_dims(val_inputs, axis=0)
            sample_weight = tf.cast(
                tf.math.not_equal(label, tf.constant(0, dtype=tf.int64)),
                tf.int64)
            val_pred = model(val_inputs, training=False)
            val_pred = tf.squeeze(val_pred)
            val_pred = tf.cast(tf.argmax(val_pred, axis=1), dtype=tf.int64) + 1
            val_con_mat.update_state(label,
                                     val_pred,
                                     sample_weight=sample_weight)
            val_accuracy.update_state(label,
                                      val_pred,
                                      sample_weight=sample_weight)
        # Log the confusion matrix as an image summary
        cm_valid = val_con_mat.result()
        figure = plot_confusion_matrix(cm_valid, class_names=class_names)
        cm_valid_image = plot_to_image(figure)

        with summary_writer.as_default():
            tf.summary.scalar('Train Accuracy',
                              train_accuracy.result(),
                              step=epoch)
            tf.summary.scalar('Valid Accuracy',
                              val_accuracy.result(),
                              step=epoch)
            tf.summary.image('Valid ConfusionMatrix',
                             cm_valid_image,
                             step=epoch)
        end = time()
        logging.info(
            "Epoch {:d} Training Accuracy: {:.3%} Validation Accuracy: {:.3%} Time:{:.5}s"
            .format(epoch + 1, train_accuracy.result(), val_accuracy.result(),
                    (end - begin)))

        train_accuracy.reset_states()
        val_accuracy.reset_states()
        val_con_mat.reset_states()

        if not tuning:
            if int(ckpt.step) % 5 == 0:
                save_path = manager.save()
                logging.info('Saved checkpoint for epoch {}: {}'.format(
                    int(ckpt.step), save_path))
            ckpt.step.assign_add(1)

    i = 0
    for exp_num, index, label, test_inputs in test_dataset:
        test_inputs = tf.expand_dims(test_inputs, axis=0)
        sample_weight = tf.cast(
            tf.math.not_equal(label, tf.constant(0, dtype=tf.int64)), tf.int64)
        test_pred = model(test_inputs, training=False)
        test_pred = tf.cast(tf.argmax(test_pred, axis=2), dtype=tf.int64)
        test_pred = tf.squeeze(test_pred, axis=0) + 1
        test_accuracy.update_state(label,
                                   test_pred,
                                   sample_weight=sample_weight)
        test_con_mat.update_state(label,
                                  test_pred,
                                  sample_weight=sample_weight)
        i += 1

        # visualize the result
        if i == show_index:
            if not tuning:
                visualization_path = path + '/visualization/'
                image_path = visualization_path + current_time + '.png'
                inputs = tf.squeeze(test_inputs)
                show(index, label, inputs, test_pred, image_path)

    # Log the confusion matrix as an image summary
    cm_test = test_con_mat.result()
    figure = plot_confusion_matrix(cm_test, class_names=class_names)
    cm_test_image = plot_to_image(figure)

    with summary_writer.as_default():
        tf.summary.scalar('Test Accuracy', test_accuracy.result(), step=epoch)
        tf.summary.image('Test ConfusionMatrix', cm_test_image, step=epoch)

    logging.info("Trained finished. Final Accuracy in test set: {:.3%}".format(
        test_accuracy.result()))

    return test_accuracy.result()
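Throughout this example, sample_weight zeroes out padded time steps (label 0) so they contribute nothing to the accuracy metrics or the confusion matrix. The masking pattern in isolation, using standard tf.keras metrics (values are illustrative):

import tensorflow as tf

labels = tf.constant([3, 1, 0, 2, 0], dtype=tf.int64)  # 0 marks padding
preds = tf.constant([3, 2, 1, 2, 3], dtype=tf.int64)
mask = tf.cast(tf.math.not_equal(labels, 0), tf.int64)

acc = tf.keras.metrics.Accuracy()
acc.update_state(labels, preds, sample_weight=mask)    # padded steps ignored
print(float(acc.result()))  # 2 correct of 3 real steps -> ~0.667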