Beispiel #1
0
def training(session,
             training_op,
             cost,
             train_data,
             valid_data,
             y_upper_bound=None,
             seed=0,
             n_epochs=10,
             batch_size=128):
    """Run a minibatch training loop on a TensorFlow v1 session.

    Both ``train_data`` and ``valid_data`` are dictionaries mapping
    TensorFlow placeholders to full numpy arrays; minibatches are sliced
    out of the arrays on every step.

    Training batches are drawn with random indices; validation batches
    are taken as contiguous, non-overlapping slices.  Progress and a
    learning curve are rendered through a fastprogress master bar.
    """
    np.random.seed(seed)

    # Batches per epoch (remainder samples beyond a full batch are dropped).
    n_samples_train = len(next(iter(train_data.values())))
    n_samples_valid = len(next(iter(valid_data.values())))
    n_batches_train = n_samples_train // batch_size
    n_batches_valid = n_samples_valid // batch_size
    mb = master_bar(range(n_epochs))

    # Per-epoch history used to draw the learning curve.
    train_costs_lst = []
    valid_costs_lst = []
    x_bounds = [0, n_epochs]
    y_bounds = None
    for epoch in mb:
        # --- Training pass: random minibatches ---
        train_costs = []
        for _ in progress_bar(range(n_batches_train), parent=mb):
            batch_idx = np.random.randint(n_samples_train, size=batch_size)

            # Feed the sampled rows for every placeholder.
            feed_dict = {ph: arr[batch_idx] for ph, arr in train_data.items()}

            _, train_cost = session.run([training_op, cost],
                                        feed_dict=feed_dict)
            train_costs.append(train_cost)

        # --- Validation pass: sequential slices ---
        valid_costs = []
        for i in range(n_batches_valid):
            start = i * batch_size
            end = start + batch_size

            # Feed the slice [start:end) for every placeholder.
            feed_dict = {ph: arr[start:end] for ph, arr in valid_data.items()}

            valid_cost = session.run(cost, feed_dict=feed_dict)
            valid_costs.append(valid_cost)

        # Mean losses for this epoch.
        train_costs_mean = np.mean(train_costs)
        valid_costs_mean = np.mean(valid_costs)
        train_costs_lst.append(train_costs_mean)
        valid_costs_lst.append(valid_costs_mean)

        # Fix the y-axis range after the first epoch only.
        if y_bounds is None:
            upper = (train_costs_mean * 1.1
                     if y_upper_bound is None else y_upper_bound)
            y_bounds = [0, upper]

        # Redraw the learning curve with both series.
        t = np.arange(len(train_costs_lst))
        graphs = [[t, train_costs_lst], [t, valid_costs_lst]]
        mb.update_graph(graphs, x_bounds, y_bounds)

        # Log this epoch's numbers under the bar.
        mb.write(
            'EPOCH: {0:02d}, Training cost: {1:10.5f}, Validation cost: {2:10.5f}'
            .format(epoch + 1, train_costs_mean, valid_costs_mean))
Beispiel #2
0
def main():
    """End-to-end experiment driver.

    Loads hyperparameters from ``../params/exp{EXP_NO}.yaml``, trains one
    model per cross-validation fold (with per-fold logging, TensorBoard
    output, and best-checkpoint saving), predicts on the test set per
    fold, ensembles the per-fold logits, and writes a zipped submission
    CSV plus a local-CV summary log.

    Bug fix: the ``'adam'`` optimizer branch previously constructed
    ``torch.optim.SGD``, so selecting Adam silently trained with SGD.
    """
    now = datetime.datetime.now()
    now_date = '{}-{:0>2d}-{:0>2d}_{:0>2d}-{:0>2d}-{:0>2d}'.format(now.year, now.month, now.day, now.hour, now.minute, now.second)

    # set logger
    logger = logging.getLogger("Log")
    logger.setLevel(logging.DEBUG)

    handler_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    # stream_handler = logging.StreamHandler()
    # stream_handler.setLevel(logging.DEBUG)
    # stream_handler.setFormatter(handler_format)
    # logger.addHandler(stream_handler)


    print('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day, now.hour, now.minute, now.second))
    with open('../params/exp{}.yaml'.format(EXP_NO), "r+") as f:
        param = yaml.load(f, Loader=yaml.FullLoader)
    param['date'] = now_date
    # seed set
    seed_setting(param['seed'])
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True

    local_cv = dict()

    for fold in param['fold']:
        # /mnt/hdd1/alcon2019/ + exp0/ + 2019-mm-dd_hh-mm-ss/ + foldN
        outdir = os.path.join(param['save path'], EXP_NAME ,now_date, 'fold{}'.format(fold))
        if os.path.exists(param['save path']):
            os.makedirs(outdir, exist_ok=True)
        else:
            print("Not find {}".format(param['save path']))
            raise FileNotFoundError


        # Each fold logs to its own file; the handler is removed at the end
        # of the fold so messages do not leak into the next fold's log.
        file_handler = logging.FileHandler(os.path.join(outdir, 'experiment.log'))
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(handler_format)
        logger.addHandler(file_handler)

        logger.debug('=============   FOLD  {}  ============='.format(fold))
        logger.debug('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day, now.hour, now.minute, now.second))


        # Dataset

        # NOTE(review): this runs once per fold, so with GPU > 1 the batch
        # size compounds across folds — confirm whether it should be hoisted
        # out of the fold loop.
        param['batch size'] = max(param['batch size'], param['batch size'] * param['GPU'])
        if param['debug']:
            train_dataset = AlconDataset(df=get_train_df(param['tabledir']).query('valid != @fold').iloc[:param['batch size']],
                                         augmentation=get_train_augmentation(*get_resolution(param['resolution'])),
                                         datadir=os.path.join(param['dataroot'],'train','imgs'), mode='train')

            valid_dataset = AlconDataset(df=get_train_df(param['tabledir']).query('valid == @fold').iloc[:param['batch size']],
                                         augmentation=get_test_augmentation(*get_resolution(param['resolution'])),
                                         datadir=os.path.join(param['dataroot'],'train','imgs'), mode='valid')
        else:
            train_dataset = AlconDataset(df=get_train_df(param['tabledir']).query('valid != @fold'),
                                         augmentation=get_train_augmentation(*get_resolution(param['resolution'])),
                                         datadir=os.path.join(param['dataroot'], 'train', 'imgs'), mode='train',
                                         margin_augmentation=True)

            valid_dataset = AlconDataset(df=get_train_df(param['tabledir']).query('valid == @fold'),
                                         augmentation=get_test_augmentation(*get_resolution(param['resolution'])),
                                         datadir=os.path.join(param['dataroot'], 'train', 'imgs'), mode='valid',
                                         margin_augmentation=False)

        logger.debug('train dataset size: {}'.format(len(train_dataset)))
        logger.debug('valid dataset size: {}'.format(len(valid_dataset)))

        # Dataloader


        train_dataloader = DataLoader(train_dataset, batch_size=param['batch size'], num_workers=param['thread'],
                                      pin_memory=False, drop_last=False)
        valid_dataloader = DataLoader(valid_dataset, batch_size=param['batch size'], num_workers=param['thread'],
                                      pin_memory=False, drop_last=False)

        logger.debug('train loader size: {}'.format(len(train_dataloader)))
        logger.debug('valid loader size: {}'.format(len(valid_dataloader)))

        # model
        # model = ResNetGRU(num_classes=48, hidden_size=512, bidirectional=True, load_weight=param['load weight'], dropout=param['dropout'])
        # model = OctResNetGRU2(num_classes=48, hidden_size=512, bidirectional=True, load_weight=None, dropout=param['dropout'])
        # model = ResNetGRU3(num_classes=48, hidden_size=512, bidirectional=True, load_weight=param['load weight'], dropout=param['dropout'])
        # model = ResNetLSTM(num_classes=48, hidden_size=512, bidirectional=True, load_weight=param['load weight'], dropout=param['dropout'])
        # model = ResNetResLSTM_MLP(num_classes=48, hidden_size=512, bidirectional=True, load_weight=param['load weight'], dropout=param['dropout'])
        model = SEResNeXtGRU2(num_classes=48, hidden_size=512, bidirectional=True, load_weight=None, dropout=param['dropout'])



        param['model'] = model.__class__.__name__

        # optim
        if param['optim'].lower() == 'sgd':
            optimizer = torch.optim.SGD(model.parameters(), lr=param['lr'], momentum=0.9,
                                        weight_decay=1e-5, nesterov=False)
        elif param['optim'].lower() == 'adam':
            # Bug fix: previously created torch.optim.SGD here.
            optimizer = torch.optim.Adam(model.parameters(), lr=param['lr'])
        else:
            raise NotImplementedError

        # scheduler
        # NOTE(review): eval() of a config string is unsafe if the YAML is
        # untrusted — consider a whitelist of scheduler constructors.
        scheduler = eval(param['scheduler'])


        model = model.to(param['device'])
        if param['GPU'] > 0:
            model = nn.DataParallel(model)

        loss_fn = nn.CrossEntropyLoss().to(param['device'])
        eval_fn = accuracy_one_character

        # Best-so-far trackers for checkpointing.
        max_char_acc = -1.
        max_3char_acc = -1.
        min_loss = 10**5


        writer = tbx.SummaryWriter("../log/exp{}/{}/fold{}".format(EXP_NO, now_date, fold))

        for key, val in param.items():
            writer.add_text('data/hyperparam/{}'.format(key), str(val), 0)


        mb = master_bar(range(param['epoch']))
        for epoch in mb:
            avg_train_loss, avg_train_accuracy, avg_three_train_acc = train_alcon_rnn(model, optimizer, train_dataloader, param['device'],
                                           loss_fn, eval_fn, epoch, scheduler=None, writer=writer, parent=mb) #ok

            avg_valid_loss, avg_valid_accuracy, avg_three_valid_acc = valid_alcon_rnn(model, valid_dataloader, param['device'],
                                                                                  loss_fn, eval_fn)

            writer.add_scalars("data/metric/valid", {
                'loss': avg_valid_loss,
                'accuracy': avg_valid_accuracy,
                '3accuracy': avg_three_valid_acc
            }, epoch)

            logger.debug('======================== epoch {} ========================'.format(epoch+1))
            logger.debug('lr              : {:.5f}'.format(scheduler.get_lr()[0]))
            logger.debug('loss            : train={:.5f}  , test={:.5f}'.format(avg_train_loss, avg_valid_loss))
            logger.debug('acc(per 1 char) : train={:.3%}  , test={:.3%}'.format(avg_train_accuracy, avg_valid_accuracy))
            logger.debug('acc(per 3 char) : train={:.3%}  , test={:.3%}'.format(avg_three_train_acc, avg_three_valid_acc))

            # Checkpoint whenever a validation metric improves.
            if min_loss > avg_valid_loss:
                logger.debug('update best loss:  {:.5f} ---> {:.5f}'.format(min_loss, avg_valid_loss))
                min_loss = avg_valid_loss
                torch.save(model.state_dict(), os.path.join(outdir, 'best_loss.pth'))

            if max_char_acc < avg_valid_accuracy:
                logger.debug('update best acc per 1 char:  {:.3%} ---> {:.3%}'.format(max_char_acc, avg_valid_accuracy))
                max_char_acc = avg_valid_accuracy
                torch.save(model.state_dict(), os.path.join(outdir, 'best_acc.pth'))

            if max_3char_acc < avg_three_valid_acc:
                logger.debug('update best acc per 3 char:  {:.3%} ---> {:.3%}'.format(max_3char_acc , avg_three_valid_acc))
                max_3char_acc = avg_three_valid_acc
                torch.save(model.state_dict(), os.path.join(outdir, 'best_3acc.pth'))

            if 1:
                if scheduler is not None:
                    if writer is not None:
                        writer.add_scalar("data/learning rate", scheduler.get_lr()[0], epoch)
                    scheduler.step()

        writer.add_scalars("data/metric/valid", {
            'best loss': min_loss,
            'best accuracy': max_char_acc,
            'best 3accuracy': max_3char_acc
        })

        logger.debug('================  FINISH  TRAIN  ================')
        logger.debug('Result')
        logger.debug('Best loss : {}'.format(min_loss))
        logger.debug('Best 1 acc : {}'.format(max_char_acc))
        logger.debug('Best 3 acc : {}'.format(max_3char_acc))
        writer.export_scalars_to_json(os.path.join(outdir, 'history.json'))
        writer.close()


        local_cv['fold{}'.format(fold)] = {'accuracy' : max_3char_acc, 'valid_size' : len(valid_dataset)}


        del train_dataset, valid_dataset
        del train_dataloader, valid_dataloader
        del scheduler, optimizer
        gc.collect()


        logger.debug('=========== Prediction phrase ===========')
        logger.debug('load weight  :  {}'.format(os.path.join(outdir, 'best_3acc.pth')))
        model.load_state_dict(torch.load(os.path.join(outdir, 'best_3acc.pth')))

        if param['debug']:
            test_dataset = AlconDataset(df=get_test_df(param['tabledir']).iloc[:param['batch size']],
                                        augmentation=get_test_augmentation(*get_resolution(param['resolution'])),
                                        datadir=os.path.join(param['dataroot'], 'test', 'imgs'), mode='test')
        else:
            test_dataset = AlconDataset(df=get_test_df(param['tabledir']),
                                        augmentation=get_test_augmentation(*get_resolution(param['resolution'])),
                                        datadir=os.path.join(param['dataroot'], 'test', 'imgs'), mode='test')


        test_dataloader = DataLoader(test_dataset, batch_size=param['batch size'], num_workers=param['thread'],
                                     pin_memory=False, drop_last=False)
        logger.debug('test dataset size: {}'.format(len(test_dataset)))
        logger.debug('test loader size: {}'.format(len(test_dataloader)))

        output_list = pred_alcon_rnn(model, test_dataloader, param['device'])
        torch.save(output_list, os.path.join(outdir, 'prediction.pth'))
        pd.DataFrame(output_list).drop('logit', axis=1).sort_values('ID').set_index('ID').to_csv(os.path.join(outdir, 'test_prediction.csv'))
        logger.debug('success!')
        logger.removeHandler(file_handler)

        del test_dataset, test_dataloader
        gc.collect()


    # Ensemble
    print('======== Ensemble phase =========')
    prediction_dict = dict()
    mb = master_bar(param['fold'])

    print('======== Load Vector =========')
    for i, fold in enumerate(mb):
        outdir = os.path.join(param['save path'], EXP_NAME, now_date,'fold{}'.format(fold))
        prediction = torch.load(os.path.join(outdir, 'prediction.pth'))
        # prediction is list
        # prediction[0] = {'ID' : 0, 'logit' torch.tensor, ...}
        # Average the per-fold logits: the first fold initialises the entry,
        # later folds accumulate into it.
        if i == 0:
            for preds in progress_bar(prediction, parent=mb):
                prediction_dict[preds['ID']] = preds['logit'] / len(param['fold'])
        else:
            for preds in progress_bar(prediction, parent=mb):
                prediction_dict[preds['ID']] += preds['logit'] / len(param['fold'])

    outdir = os.path.join(param['save path'], EXP_NAME, now_date)

    file_handler = logging.FileHandler(os.path.join(outdir, 'result.log'))
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(handler_format)
    logger.addHandler(file_handler)
    logger.info(' ==========  RESULT  ========== \n')

    # Local CV is the validation-size-weighted mean of per-fold accuracy.
    cv = 0.0
    train_data_size = 0
    for fold in param['fold']:
        acc = local_cv['fold{}'.format(fold)]['accuracy']
        valid_size = local_cv['fold{}'.format(fold)]['valid_size']
        train_data_size += valid_size
        logger.info(' fold {} :  {:.3%} \n'.format(fold, acc))
        cv += acc * valid_size
    logger.info(' Local CV : {:.3%} \n'.format(cv / train_data_size))
    logger.info(' ============================== \n')

    logger.removeHandler(file_handler)


    torch.save(prediction_dict, os.path.join(outdir, 'prediction.pth'))

    print('======== make submittion file =========')
    vocab = get_vocab(param['vocabdir'])
    submit_list = list()
    for ID, logits in progress_bar(prediction_dict.items()):
        submit_dict = dict()
        submit_dict["ID"] = ID
        # argmax over the class axis gives one prediction per character slot.
        preds = logits.softmax(dim=1).argmax(dim=1)
        submit_dict["Unicode1"] = vocab['index2uni'][preds[0]]
        submit_dict["Unicode2"] = vocab['index2uni'][preds[1]]
        submit_dict["Unicode3"] = vocab['index2uni'][preds[2]]
        submit_list.append(submit_dict)
    print()

    pd.DataFrame(submit_list).sort_values('ID').set_index('ID').to_csv(os.path.join(outdir, 'test_prediction.csv'))

    import zipfile
    with zipfile.ZipFile(os.path.join(outdir,'submit_{}_{}.zip'.format(EXP_NAME, now_date)), 'w') as zf:
        zf.write(os.path.join(outdir, 'test_prediction.csv'))

    print('success!')
Beispiel #3
0
    def _custom_train(
        self,
        train_dataset,
        tokenizer,
        model,
        num_train_examples,
        train_batch_size,
    ):
        """Custom distributed training loop for a token-classification model.

        Runs ``epochs`` passes over ``train_dataset`` under the configured
        ``tf.distribute`` strategy, with gradient accumulation, optional
        fp16 loss scaling, progress reporting via fastprogress, and
        periodic checkpointing every ``save_steps`` optimizer steps.

        NOTE(review): ``tokenizer`` is accepted but never used in this body
        — confirm whether it is required by the interface or dead weight.
        """
        # Work on a plain dict copy of the config; strategy / n_device are
        # re-attached explicitly (presumably not preserved by _asdict()
        # — TODO confirm).
        config = self.parent.config._asdict()
        config["strategy"] = self.parent.config.strategy
        config["n_device"] = self.parent.config.n_device
        labels = config["ner_tags"]
        # max_steps > 0 caps training at a fixed step budget (single pass);
        # otherwise derive the total steps from dataset size and epochs.
        if config["max_steps"] > 0:
            num_train_steps = (
                config["max_steps"] * config["gradient_accumulation_steps"]
            )
            config["epochs"] = 1
        else:
            num_train_steps = (
                math.ceil(num_train_examples / train_batch_size)
                // config["gradient_accumulation_steps"]
                * config["epochs"]
            )

        # Loss, optimizer, metric, and accumulator must be created inside
        # the strategy scope so their variables are mirrored per replica.
        with config["strategy"].scope():
            # Per-example losses (Reduction.NONE); reduced manually below so
            # masking and batch-size scaling stay under our control.
            loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(
                reduction=tf.keras.losses.Reduction.NONE
            )
            optimizer = create_optimizer(
                config["learning_rate"],
                num_train_steps,
                config["warmup_steps"],
            )

            if config["use_fp16"]:
                # Dynamic loss scaling guards fp16 gradients from underflow.
                optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                    optimizer, "dynamic"
                )

            loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
            gradient_accumulator = GradientAccumulator()

        self.logger.info("***** Running training *****")
        self.logger.info("  Num examples = %d", num_train_examples)
        self.logger.info("  Num Epochs = %d", config["epochs"])
        self.logger.info(
            "  Instantaneous batch size per device = %d",
            config["per_device_train_batch_size"],
        )
        self.logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            train_batch_size * config["gradient_accumulation_steps"],
        )
        self.logger.info(
            "  Gradient Accumulation steps = %d",
            config["gradient_accumulation_steps"],
        )
        self.logger.info("  Total training steps = %d", num_train_steps)

        self.logger.debug(model.summary())

        @tf.function
        def apply_gradients():
            # Scale each accumulated gradient by (devices * accumulation
            # steps) before applying, then clear the accumulator.
            grads_and_vars = []

            for gradient, variable in zip(
                gradient_accumulator.gradients, model.trainable_variables
            ):
                if gradient is not None:
                    scaled_gradient = gradient / (
                        config["n_device"]
                        * config["gradient_accumulation_steps"]
                    )
                    grads_and_vars.append((scaled_gradient, variable))
                else:
                    # Keep None gradients paired with their variables.
                    grads_and_vars.append((gradient, variable))

            optimizer.apply_gradients(grads_and_vars, config["max_grad_norm"])
            gradient_accumulator.reset()

        @tf.function
        def train_step(train_features, train_labels):
            def step_fn(train_features, train_labels):
                inputs = {
                    "attention_mask": train_features["input_mask"],
                    "training": True,
                }

                # distilbert takes no token_type_ids; bert/xlnet use the
                # segment ids, other architectures get None explicitly.
                if config["model_architecture_type"] != "distilbert":
                    inputs["token_type_ids"] = (
                        train_features["segment_ids"]
                        if config["model_architecture_type"]
                        in ["bert", "xlnet"]
                        else None
                    )

                with tf.GradientTape() as tape:
                    logits = model(train_features["input_ids"], **inputs)[0]
                    logits = tf.reshape(logits, (-1, len(labels) + 1))
                    # Use input_mask to drop padding positions from the loss.
                    active_loss = tf.reshape(
                        train_features["input_mask"], (-1,)
                    )
                    active_logits = tf.boolean_mask(logits, active_loss)
                    train_labels = tf.reshape(train_labels, (-1,))
                    active_labels = tf.boolean_mask(train_labels, active_loss)
                    cross_entropy = loss_fct(active_labels, active_logits)
                    loss = tf.reduce_sum(cross_entropy) * (
                        1.0 / train_batch_size
                    )
                    grads = tape.gradient(loss, model.trainable_variables)

                    # Accumulate; gradients are applied only every
                    # gradient_accumulation_steps (see the epoch loop).
                    gradient_accumulator(grads)

                return cross_entropy

            # NOTE(review): experimental_run_v2 is a deprecated TF API
            # (replaced by Strategy.run) — confirm the pinned TF version.
            per_example_losses = config["strategy"].experimental_run_v2(
                step_fn, args=(train_features, train_labels)
            )
            mean_loss = config["strategy"].reduce(
                tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0
            )

            return mean_loss

        current_time = datetime.datetime.now()
        train_iterator = master_bar(range(config["epochs"]))
        global_step = 0
        # NOTE(review): logger_loss is set but never read in this method —
        # possibly consumed elsewhere; confirm before removing.
        self.logger_loss = 0.0

        for epoch in train_iterator:
            epoch_iterator = progress_bar(
                train_dataset,
                total=num_train_steps,
                parent=train_iterator,
                display=config["n_device"] > 1,
            )
            step = 1

            with config["strategy"].scope():
                for train_features, train_labels in epoch_iterator:
                    loss = train_step(train_features, train_labels)

                    # Apply accumulated gradients once per accumulation
                    # window; only then does the global step advance.
                    if step % config["gradient_accumulation_steps"] == 0:
                        config["strategy"].experimental_run_v2(apply_gradients)
                        loss_metric(loss)
                        global_step += 1
                        if (
                            config["save_steps"] > 0
                            and global_step % config["save_steps"] == 0
                        ):
                            # Save model checkpoint
                            output_dir = os.path.join(
                                config["output_dir"],
                                "checkpoint-{}".format(global_step),
                            )

                            if not os.path.exists(output_dir):
                                os.makedirs(output_dir)

                            model.save_pretrained(output_dir)
                            self.logger.info(
                                "Saving model checkpoint to %s", output_dir
                            )

                    # Show the running mean loss on the child progress bar.
                    train_iterator.child.comment = (
                        f"loss : {loss_metric.result()}"
                    )
                    step += 1

            train_iterator.write(
                f"loss epoch {epoch + 1}: {loss_metric.result()}"
            )
            # Reset the mean-loss metric so each epoch reports fresh values.
            loss_metric.reset_states()
        self.logger.debug(
            "  Training took time = {}".format(
                datetime.datetime.now() - current_time
            )
        )
 def begin_fit(self):
     """Set up the fastprogress master bar before the fit loop starts."""
     # One master-bar tick per epoch.
     self.mbar = master_bar(range(self.epochs))
     self.mbar.on_iter_begin()
     # Route the runner's logging through the master bar as table rows.
     self.run.logger = partial(self.mbar.write, table=True)
Beispiel #5
0
def train(model,
          epochs,
          learning_rates,
          optimizer,
          criterion,
          dataset,
          batch_size=512,
          num_workers=0,
          drop_last=False,
          timer=None,
          balance=False):
    """Train ``model`` on ``dataset['train']`` and evaluate on ``dataset['val']``.

    ``epochs`` and ``learning_rates`` define a piecewise-linear LR schedule
    via ``LinearInterpolation``; ``epochs[-1]`` is the total epoch count.
    ``dataset`` maps ``'train'``/``'val'`` to dataset objects. When
    ``balance`` is True, the train loader uses the dataset's balanced
    sampler instead of shuffling (new keyword, default keeps old behaviour).

    Returns the ``metric`` history dict (one list per tracked quantity).

    Bug fixes vs. the original: a dead ``get_dataloader["train"]`` lookup
    (NameError) was removed, the previously-undefined ``balance`` flag is
    now a parameter, and ``metric`` is initialised before use.
    """
    t = timer or Timer()

    if balance:
        train_batches = torch.utils.data.DataLoader(
            dataset["train"],
            batch_size,
            shuffle=False,
            pin_memory=True,
            num_workers=num_workers,
            drop_last=drop_last,
            sampler=dataset["train"].get_balanced_sampler())
    else:
        train_batches = torch.utils.data.DataLoader(dataset["train"],
                                                    batch_size,
                                                    shuffle=True,
                                                    pin_memory=True,
                                                    num_workers=num_workers,
                                                    drop_last=drop_last)
    test_batches = torch.utils.data.DataLoader(dataset["val"],
                                               batch_size,
                                               shuffle=False,
                                               pin_memory=True,
                                               num_workers=num_workers)
    train_size, val_size = len(dataset["train"]), len(dataset["val"])
    # With drop_last, the trailing partial batch never contributes samples.
    if drop_last: train_size -= (train_size % batch_size)

    num_epochs = epochs[-1]
    lr_schedule = LinearInterpolation(epochs, learning_rates)
    #mo_schedule   = LinearInterpolation(epochs, momentum)

    # Per-epoch history returned to the caller.
    metric = {
        "epoch": [],
        "learning rate": [],
        "total time": [],
        "train loss": [],
        "train acc": [],
        "val loss": [],
        "val acc": [],
    }

    mb = master_bar(range(num_epochs))
    mb.write("Epoch\tTime\tLearRate\tT_loss\tT_accu\t\tV_loss\tV_accu")
    mb.write("-" * 70)
    for epoch in mb:

        #train_batches.dataset.set_random_choices()
        # One LR per batch, linearly interpolated inside the epoch and
        # divided by batch_size (per-sample scaling).
        lrs = (lr_schedule(x) / batch_size
               for x in np.arange(epoch, epoch + 1, 1 / len(train_batches)))
        train_stats, train_time = train_epoch(mb, model, train_batches,
                                              optimizer, criterion, lrs, {
                                                  'loss': [],
                                                  'correct': []
                                              }), t()
        test_stats, test_time = test_epoch(mb, model, test_batches, criterion,
                                           {
                                               'loss': [],
                                               'correct': []
                                           }), t()

        metric["epoch"].append(epoch + 1)
        metric["learning rate"].append(lr_schedule(epoch + 1))
        metric["total time"].append(t.total_time)
        metric["train loss"].append(sum(train_stats['loss']) / train_size)
        metric["train acc"].append(sum(train_stats['correct']) / train_size)
        metric["val loss"].append(sum(test_stats['loss']) / val_size)
        metric["val acc"].append(sum(test_stats['correct']) / val_size)

        mb.write(
            "{}/{}\t{:.0f}:{:.0f}\t{:.4f}\t\t{:.4f}\t{:.4f}\t\t{:.4f}\t{:.4f}".
            format(metric["epoch"][-1], num_epochs,
                   metric["total time"][-1] // 60,
                   metric["total time"][-1] % 60, metric["learning rate"][-1],
                   metric["train loss"][-1], metric["train acc"][-1],
                   metric["val loss"][-1], metric["val acc"][-1]))
        graphs = [[metric["epoch"], metric["train acc"]],
                  [metric["epoch"], metric["val acc"]]]
        mb.update_graph(graphs)

    return metric
Beispiel #6
0
  def fit(self, X, y, epochs=100, validation_data=None, batch_size=32, verbose=True, early_stopping=False, trans=None, validation_trans=None):
    """Train ``self.net`` on (X, y) and return the history dict.

    Builds loaders via ``self.dataset``, tracks loss and each metric in
    ``self.metrics`` per epoch, optionally evaluates on
    ``validation_data=(X_val, y_val)``, steps ``self.scheduler`` if set,
    and supports patience-based early stopping (``early_stopping`` is the
    number of epochs without val_loss improvement before restoring the
    best weights from 'best_dict.pth').

    Bug fix: the learning-rate history previously read the undefined name
    ``optimizer`` (NameError whenever a scheduler was configured); it now
    reads ``self.optimizer``.
    """
    dataset = self.dataset(X, y, trans=trans)
    dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)
    self.history = {"loss": []}
    for metric in self.metrics:
      self.history[f'{metric.name}'] = []

    if validation_data:
      dataset = self.dataset(validation_data[0], validation_data[1], trans=validation_trans)
      dataloader_val = DataLoader(dataset, shuffle=False, batch_size=batch_size)
      self.history["val_loss"] = []
      for metric in self.metrics:
        self.history[f'val_{metric.name}'] = []

    if self.scheduler:
      self.history["lr"] = []

    self.net.to(self.device)
    mb = master_bar(range(1, epochs+1))
    best_loss, step, best_e = 1e10, 0, 0
    for epoch in mb:
      # train
      self.net.train()
      train_loss, train_metrics = [], [[] for m in self.metrics]
      for X, y in progress_bar(dataloader, parent=mb):
        X, y = X.to(self.device), y.to(self.device)
        self.optimizer.zero_grad()
        output = self.net(X)
        loss = self.loss(output, y)
        loss.backward()
        self.optimizer.step()
        train_loss.append(loss.item())
        comment = f'train_loss {np.mean(train_loss):.5f}'
        for i, metric in enumerate(self.metrics):
          train_metrics[i].append(metric.call(output, y))
          comment += f' train_{metric.name} {np.mean(train_metrics[i]):.5f}'
        mb.child.comment = comment
      self.history["loss"].append(np.mean(train_loss))
      for i, metric in enumerate(self.metrics):
        self.history[f'{metric.name}'].append(np.mean(train_metrics[i]))
      bar_text = f'Epoch {epoch}/{epochs} loss {np.mean(train_loss):.5f}'
      for i, metric in enumerate(self.metrics):
        bar_text += f' {metric.name} {np.mean(train_metrics[i]):.5f}'
      if self.scheduler:
        # Record the LR in effect this epoch, then advance the schedule.
        # (Fixed: was the undefined bare name `optimizer`.)
        self.history["lr"].append(self.optimizer.param_groups[0]['lr'])
        self.scheduler.step()
      # eval
      if validation_data:
        self.net.eval()
        val_loss, val_metrics = [], [[] for m in self.metrics]
        with torch.no_grad():
          for X, y in progress_bar(dataloader_val, parent=mb):
            X, y = X.to(self.device), y.to(self.device)
            output = self.net(X)
            loss = self.loss(output, y)
            val_loss.append(loss.item())
            comment = f'val_loss {np.mean(val_loss):.5f}'
            for i, metric in enumerate(self.metrics):
              val_metrics[i].append(metric.call(output, y))
              comment += f' val_{metric.name} {np.mean(val_metrics[i]):.5f}'
            mb.child.comment = comment
        self.history["val_loss"].append(np.mean(val_loss))
        for i, metric in enumerate(self.metrics):
          self.history[f'val_{metric.name}'].append(np.mean(val_metrics[i]))
        bar_text += f' val_loss {np.mean(val_loss):.5f}'
        for i, metric in enumerate(self.metrics):
          bar_text += f' val_{metric.name} {np.mean(val_metrics[i]):.5f}'
        if early_stopping:
          step += 1
          if np.mean(val_loss) < best_loss:
            best_loss = np.mean(val_loss)
            torch.save(self.net.state_dict(),'best_dict.pth')
            best_e = epoch
            step = 0
          if step >= early_stopping:
            self.net.load_state_dict(torch.load('best_dict.pth'))
            print(f"training stopped at epoch {epoch}")
            print(f"best model found at epoch {best_e} with val_loss {best_loss:.5f}")
            break
      if verbose:
        mb.write(bar_text)

    return self.history
Beispiel #7
0
def main(out: Param("dataset folder", Path, required=True),
         info: Param('info file', Path, required=True),
         tile: Param('generated tile size', int, nargs='+', required=True),
         n_train: Param('number of train tiles', int, required=True),
         n_valid: Param('number of validation tiles', int, required=True),
         crap_func: Param('crappifier name', str) = 'no_crap',
         n_frames: Param('number of frames', int) = 1,
         lr_type: Param('training input, (s)ingle, (t) multi or (z) multi',
                        str) = 's',
         scale: Param('amount to scale', int) = 4,
         ftypes: Param('ftypes allowed e.g. - czi, tif', str,
                       nargs='+') = None,
         upsample: Param('use upsample', action='store_true') = False,
         only: Param('limit to these categories', nargs='+') = None,
         skip: Param("categories to skip", str,
                     nargs='+') = ['random', 'ArgoSIMDL'],
         clean: Param("wipe existing data first",
                      action='store_true') = False):
    "generate tiles from source tiffs"
    up = 'up' if upsample else ''

    if lr_type not in ['s', 't', 'z']:
        print('lr_type should be s, t or z')
        return 1

    # Map lr_type to the number of z- and t-frames pulled per tile:
    # (s)ingle frame, (t) time-multi, or (z) depth-multi.
    if lr_type == 's':
        z_frames, t_frames = 1, 1
    elif lr_type == 't':
        z_frames, t_frames = 1, n_frames
    elif lr_type == 'z':
        z_frames, t_frames = n_frames, 1

    out = ensure_folder(out / f'{lr_type}_{n_frames}_{info.stem}_{crap_func}')
    if clean:
        shutil.rmtree(out)

    # HACK: crap_func arrives as a string and is resolved via eval();
    # only trusted CLI input should ever reach this point.
    crap_func = eval(crap_func)
    if crap_func is not None:
        if not callable(crap_func):
            print('crap_func is not callable')
            crap_func = None
        else:
            crap_func = partial(crap_func, scale=scale, upsample=upsample)

    info = pd.read_csv(info)

    # Filter the tile catalogue down to what was requested.
    if ftypes: info = info.loc[info.ftype.isin(ftypes)]
    if only: info = info.loc[info.category.isin(only)]
    elif skip: info = info.loc[~info.category.isin(skip)]

    # Keep only source files with enough frames for the chosen mode.
    info = info.loc[info.nz >= z_frames]
    info = info.loc[info.nt >= t_frames]

    tile_infos = []
    for mode, n_samples in [('train', n_train), ('valid', n_valid)]:
        mode_info = info.loc[info.dsplit == mode]
        categories = list(mode_info.groupby('category'))
        # Use a distinct name (cat_df) so the comprehension does not
        # shadow the outer `info` DataFrame.
        files_by_category = {
            c: list(cat_df.groupby('fn'))
            for c, cat_df in categories
        }

        # Sample uniformly over categories first, then files within the
        # category, then legal items within the file.
        for i in range(n_samples):
            category, cat_df = random.choice(categories)
            fn, item_df = random.choice(files_by_category[category])
            legal_choices = [
                item_info for ix, item_info in item_df.iterrows()
                if check_info(item_info, t_frames, z_frames)
            ]

            assert (legal_choices)
            item_info = random.choice(legal_choices)
            for tile_sz in tile:
                item_d = dict(item_info)
                item_d['tile_sz'] = tile_sz
                tile_infos.append(item_d)

    tile_info_df = pd.DataFrame(tile_infos).reset_index()
    print('num tile pulls:', len(tile_infos))
    print(tile_info_df.groupby('category').fn.count())

    last_stat = None
    tile_pull_info = []
    tile_puller = None

    multi_str = f'_{lr_type}_{n_frames}' if lr_type != 's' else ''
    mbar = master_bar(tile_info_df.groupby('fn'))
    for fn, tile_stats in mbar:
        if Path(
                fn
        ).stem == 'high res microtubules for testing before stitching - great quality':
            continue
        for i, tile_stat in progress_bar(list(tile_stats.iterrows()),
                                         parent=mbar):
            try:
                mode = tile_stat['dsplit']
                category = tile_stat['category']
                tile_sz = tile_stat['tile_sz']
                tile_folder = ensure_folder(
                    out / f'hr_t_{tile_sz}{multi_str}' / mode / category)
                if crap_func:
                    crap_folder = ensure_folder(
                        out / f'lr{up}_t_{tile_sz}{multi_str}' / mode /
                        category)
                else:
                    crap_folder = None

                # Tile pullers cache the open source image; flush the
                # cache whenever the source file / tile stats change.
                if need_cache_flush(tile_stat, last_stat):
                    if tile_puller:
                        tile_puller(None, None, None, close_me=True)
                    last_stat = tile_stat.copy()
                    tile_sz = tile_stat['tile_sz']
                    tile_puller = get_tile_puller(tile_stat, crap_func,
                                                  t_frames, z_frames)
                tile_pull_info.append(
                    tile_puller(tile_stat, tile_folder, crap_folder))
            except MemoryError:
                # some files are too big to read
                fn = Path(tile_stat['fn'])
                print(f'too big: {fn.stem}')

    pd.DataFrame(tile_pull_info).to_csv(out / f'tiles{multi_str}.csv',
                                        index=False)
Beispiel #8
0
 def begin_fit(self):
     """Create the epoch-level progress bar and route the learner's
     logging through it (rendered as a table)."""
     bar = master_bar(range(self.epochs))
     bar.on_iter_begin()
     self.master_bar = bar
     # Callback class stores the Learner() object under self.run
     self.run.logger = partial(bar.write, table=True)
Beispiel #9
0
    def train(self, num_epochs, max_lr=0.1):
        """Run the full training loop for `num_epochs` epochs.

        Uses a piecewise-linear learning-rate schedule ramping from 0 up
        to `max_lr` at num_epochs/4 and back down to 0 at the end, with
        early stopping on validation loss (patience of 10 epochs).
        Per-epoch statistics are appended to `self.log` and printed via
        a fastprogress master bar.
        """

        t = Timer()

        valid_loss_min = np.Inf
        patience = 10
        p = 0  # current number of epochs, where validation loss didn't increase
        #train_size, val_size = len(self.train_ds), len(self.valid_ds)
        #if drop_last: train_size -= (train_size % self.batch_size)

        # LR schedule knots: 0 -> max_lr at num_epochs/4 -> 0 at the end.
        self.epochs = [0, num_epochs / 4, num_epochs]  #[0, 15, 30, 35]
        self.learning_rates = [0, max_lr, 0]  #[0, 0.1, 0.005, 0]
        lr_schedule = LinearInterpolation(self.epochs, self.learning_rates)
        #mo_schedule   = LinearInterpolation(epochs, momentum)

        # Per-iteration statistics filled in by train_epoch/valid_epoch.
        stats = {
            "train_it": [],
            'train_loss': [],
            'train_metric': [],
            "valid_it": [],
            'valid_loss': [],
            'valid_metric': []
        }

        mb = master_bar(range(num_epochs))
        mb.names = ["train loss", "train acc", "val loss", "val acc"]
        mb.write("Epoch\tTime\tLearRate\tT_loss\tT_accu\t\tV_loss\tV_accu")
        mb.write("-" * 70)
        for epoch in mb:

            mb.write("epoch")

            #self.train_batches.dataset.set_random_choices()
            # Generator of per-batch learning rates for this epoch,
            # divided by batch size — presumably because the loss is
            # summed (not averaged) per batch; verify in train_epoch.
            lrs = (lr_schedule(x) / self.batch_size
                   for x in np.arange(epoch, epoch + 1, 1 /
                                      len(self.train_batches)))
            stats, train_time = self.train_epoch(stats, epoch, mb, lrs), t()
            stats, valid_time = self.valid_epoch(
                stats,
                epoch,
                mb,
            ), t()

            self.log["epoch"].append(epoch + 1)
            self.log["learning rate"].append(lr_schedule(epoch + 1))
            self.log["total time"].append(t.total_time)
            self.log["train loss"].append(np.mean(
                stats['train_loss']))  # or np.mean
            self.log["train acc"].append(np.mean(stats['train_metric']))
            self.log["val loss"].append(np.mean(stats['valid_loss']))
            self.log["val acc"].append(np.mean(stats['valid_metric']))

            # Checkpoint when validation loss matches or improves on the
            # best seen so far; otherwise count toward early stopping.
            if self.log["val loss"][-1] <= valid_loss_min:  # Val loss improve
                mb.write('Saving model!')
                self.save_model()
                valid_loss_min = self.log["val loss"][-1]
                p = 0
            else:  # Val loss didn't improve
                p += 1
                if p > patience:
                    mb.write('Stopping training')
                    break

            # Tab-separated epoch summary line (time shown as min:sec).
            mb.write(
                "{}/{}\t{:.0f}:{:.0f}\t{:.4f}\t\t{:.4f}\t{:.4f}\t\t{:.4f}\t{:.4f}"
                .format(self.log["epoch"][-1], num_epochs,
                        self.log["total time"][-1] // 60,
                        self.log["total time"][-1] % 60,
                        self.log["learning rate"][-1],
                        self.log["train loss"][-1], self.log["train acc"][-1],
                        self.log["val loss"][-1], self.log["val acc"][-1]))

            #graphs = [[self.log["epoch"], self.log["train acc"]],
            #          [self.log["epoch"], self.log["val acc"]]]
            #mb.update_graph(graphs)

            torch.cuda.empty_cache()  # free cache mem after train
    def training(self):
        """Train the model with early stopping on validation loss.

        Runs up to `condition["epoch"]` epochs, each with a "train" and
        a "valid" phase over the corresponding data loader.  The best
        model state (lowest validation loss) is forwarded through
        `super().update_best_model`, and training stops early after
        `condition["early_stopping_rounds"]` epochs without improvement.

        Returns:
            dict: best score record (epoch, losses, AUC metrics).
        """
        condition = self.config["train"]["condition"]
        best_score = {"epoch": -1, "train_loss": np.inf, "valid_loss": np.inf, "train_qwk": 0.0, "valid_qwk": 0.0}

        non_improvement_round = 0
        mb = master_bar(range(condition["epoch"]))
        for epoch in mb:

            temp_score = {"epoch": epoch, "train_loss": 0.0, "valid_loss": 0.0, "train_qwk": 0.0, "valid_qwk": 0.0}
            for phase in ["train", "valid"]:
                if phase == "train":
                    data_loader = self.train_loader
                    # NOTE(review): the scheduler is stepped at epoch start,
                    # before any optimizer.step(); kept as-is to preserve
                    # the original LR schedule, though PyTorch recommends
                    # stepping the scheduler after the optimizer.
                    self.scheduler.step()
                    self.model.train()
                elif phase == "valid":
                    data_loader = self.valid_loader
                    self.model.eval()

                running_loss = 0.0
                y_true, y_pred = np.array([]).reshape((0, 1)), np.array([]).reshape((0, 1))
                for data in progress_bar(data_loader, parent=mb):
                    mb.child.comment = ">> {} phase".format(phase)
                    inputs = data["image"].to(self.device, dtype=torch.float)
                    labels = data["label"].view(-1, 1).to(self.device, dtype=torch.float)

                    # Run the forward pass inside set_grad_enabled so no
                    # autograd graph is built during validation; gradients
                    # (and zeroing them) are only needed in the train phase.
                    with torch.set_grad_enabled(phase == "train"):
                        if phase == "train":
                            self.optimizer.zero_grad()
                        outputs = self.model(inputs)
                        loss = self.criterion(outputs, labels)
                        if phase == "train":
                            loss.backward()
                            self.optimizer.step()
                    running_loss += loss.item()

                    if torch.cuda.is_available():
                        labels = labels.cpu()
                        outputs = outputs.cpu()
                    y_true = np.vstack((y_true, labels.detach().numpy()))
                    y_pred = np.vstack((y_pred, outputs.detach().numpy()))
                temp_score["{}_loss".format(phase)] = running_loss / len(data_loader)
                temp_score["{}_qwk".format(phase)] = self.__auc_scoring(y_true, y_pred)

            super().update_training_log(temp_score)

            # Track the best epoch by validation loss.
            if best_score["valid_loss"] > temp_score["valid_loss"]:
                best_score = temp_score
                super().update_best_model(self.model.state_dict())
                non_improvement_round = 0
            else:
                non_improvement_round += 1

            # Periodic progress report every 10 epochs.
            if epoch % 10 == 0:
                text = "[epoch {}] best epoch:{}  train loss:{}  valid loss:{}  train auc:{}  valid auc:{}".format(
                    epoch,
                    best_score["epoch"],
                    np.round(best_score["train_loss"], 5),
                    np.round(best_score["valid_loss"], 5),
                    np.round(best_score["train_qwk"], 5),
                    np.round(best_score["valid_qwk"], 5)
                )
                mb.write(text)
                super().update_learning_curve()

            # Early Stopping
            if non_improvement_round >= condition["early_stopping_rounds"]:
                print("\t Early stopping: {}[epoch]".format(epoch))
                break

        super().update_learning_curve()
        return best_score
def train(
    args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id
):
    """Distributed TF2 training loop for a token-classification model.

    Supports gradient accumulation, optional evaluation during training
    (single-device only), TensorBoard logging to /tmp/mylogs, and
    periodic checkpointing under args["output_dir"].
    """
    # Derive the total number of optimizer steps either from max_steps
    # or from the dataset size, batch size and epoch count.
    if args["max_steps"] > 0:
        num_train_steps = args["max_steps"] * args["gradient_accumulation_steps"]
        args["num_train_epochs"] = 1
    else:
        num_train_steps = (
            math.ceil(num_train_examples / train_batch_size)
            // args["gradient_accumulation_steps"]
            * args["num_train_epochs"]
        )

    writer = tf.summary.create_file_writer("/tmp/mylogs")

    # Create loss, optimizer, metric and accumulator inside the strategy
    # scope so their variables are placed/mirrored correctly.
    with strategy.scope():
        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
        optimizer = create_optimizer(args["learning_rate"], num_train_steps, args["warmup_steps"])

        if args["fp16"]:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")

        loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
        gradient_accumulator = GradientAccumulator()

    logging.info("***** Running training *****")
    logging.info("  Num examples = %d", num_train_examples)
    logging.info("  Num Epochs = %d", args["num_train_epochs"])
    logging.info("  Instantaneous batch size per device = %d", args["per_device_train_batch_size"])
    logging.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        train_batch_size * args["gradient_accumulation_steps"],
    )
    logging.info("  Gradient Accumulation steps = %d", args["gradient_accumulation_steps"])
    logging.info("  Total training steps = %d", num_train_steps)

    model.summary()

    @tf.function
    def apply_gradients():
        """Scale the accumulated gradients, apply them, then reset the
        accumulator for the next accumulation window."""
        grads_and_vars = []

        for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables):
            if gradient is not None:
                # Average over devices and accumulation steps.
                scaled_gradient = gradient / (args["n_device"] * args["gradient_accumulation_steps"])
                grads_and_vars.append((scaled_gradient, variable))
            else:
                grads_and_vars.append((gradient, variable))

        optimizer.apply_gradients(grads_and_vars, args["max_grad_norm"])
        gradient_accumulator.reset()

    @tf.function
    def train_step(train_features, train_labels):
        """Run one distributed forward/backward step; gradients are only
        accumulated here (applied separately by apply_gradients)."""
        def step_fn(train_features, train_labels):
            inputs = {"attention_mask": train_features["input_mask"], "training": True}

            if args["model_type"] != "distilbert":
                inputs["token_type_ids"] = (
                    train_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None
                )

            with tf.GradientTape() as tape:
                logits = model(train_features["input_ids"], **inputs)[0]
                logits = tf.reshape(logits, (-1, len(labels) + 1))
                # Mask out padding positions before computing the loss.
                active_loss = tf.reshape(train_features["input_mask"], (-1,))
                active_logits = tf.boolean_mask(logits, active_loss)
                train_labels = tf.reshape(train_labels, (-1,))
                active_labels = tf.boolean_mask(train_labels, active_loss)
                cross_entropy = loss_fct(active_labels, active_logits)
                loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size)
                grads = tape.gradient(loss, model.trainable_variables)

                gradient_accumulator(grads)

            return cross_entropy

        per_example_losses = strategy.experimental_run_v2(step_fn, args=(train_features, train_labels))
        mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0)

        return mean_loss

    current_time = datetime.datetime.now()
    train_iterator = master_bar(range(args["num_train_epochs"]))
    global_step = 0
    logging_loss = 0.0

    for epoch in train_iterator:
        epoch_iterator = progress_bar(
            train_dataset, total=num_train_steps, parent=train_iterator, display=args["n_device"] > 1
        )
        step = 1

        with strategy.scope():
            for train_features, train_labels in epoch_iterator:
                loss = train_step(train_features, train_labels)

                # Only apply gradients (and log/save) once per full
                # accumulation window.
                if step % args["gradient_accumulation_steps"] == 0:
                    strategy.experimental_run_v2(apply_gradients)

                    loss_metric(loss)

                    global_step += 1

                    if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
                        # Log metrics
                        if (
                            args["n_device"] == 1 and args["evaluate_during_training"]
                        ):  # Only evaluate when single GPU otherwise metrics may not average well
                            y_true, y_pred, eval_loss = evaluate(
                                args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev"
                            )
                            report = metrics.classification_report(y_true, y_pred, digits=4)

                            logging.info("Eval at step " + str(global_step) + "\n" + report)
                            logging.info("eval_loss: " + str(eval_loss))

                            precision = metrics.precision_score(y_true, y_pred)
                            recall = metrics.recall_score(y_true, y_pred)
                            f1 = metrics.f1_score(y_true, y_pred)

                            with writer.as_default():
                                tf.summary.scalar("eval_loss", eval_loss, global_step)
                                tf.summary.scalar("precision", precision, global_step)
                                tf.summary.scalar("recall", recall, global_step)
                                tf.summary.scalar("f1", f1, global_step)

                        lr = optimizer.learning_rate
                        learning_rate = lr(step)

                        with writer.as_default():
                            tf.summary.scalar("lr", learning_rate, global_step)
                            # Average loss since the last logging point.
                            tf.summary.scalar(
                                "loss", (loss_metric.result() - logging_loss) / args["logging_steps"], global_step
                            )

                        logging_loss = loss_metric.result()

                    with writer.as_default():
                        tf.summary.scalar("loss", loss_metric.result(), step=step)

                    if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
                        # Save model checkpoint
                        output_dir = os.path.join(args["output_dir"], "checkpoint-{}".format(global_step))

                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)

                        model.save_pretrained(output_dir)
                        logging.info("Saving model checkpoint to %s", output_dir)

                train_iterator.child.comment = f"loss : {loss_metric.result()}"
                step += 1

        train_iterator.write(f"loss epoch {epoch + 1}: {loss_metric.result()}")

        loss_metric.reset_states()

    logging.info("  Training took time = {}".format(datetime.datetime.now() - current_time))
Beispiel #12
0
def main(config,
         pretrained=False,
         patience=1,
         lr_scale=1.,
         pretrained_path=None,
         var_neighbor=5,
         random_neighbor=True,
         netname='att-gcn'):
    """Train a Residual Gated GCN model for TSP.

    Builds the (DataParallel-wrapped) network, optionally restores a
    pretrained checkpoint, logs to TensorBoard, trains with
    validation-driven learning-rate decay and early stopping
    (`patience` validation rounds without improvement), and saves
    best/last/periodic checkpoints under ./logs.

    Returns:
        The trained network.
    """
    # Instantiate the network
    net = nn.DataParallel(ResidualGatedGCNModel(config, dtypeFloat, dtypeLong))
    if torch.cuda.is_available():
        net.cuda()
    if pretrained:
        # map_location='cpu' makes the load work on CPU-only machines
        # too (previously `checkpoint` was only defined when CUDA was
        # available, raising NameError on the load_state_dict line).
        if pretrained_path is not None:
            log_dir = pretrained_path
            checkpoint = torch.load(log_dir, map_location='cpu')
        else:
            log_dir = f"./tsp-models/{config.expt_name}/"
            checkpoint = torch.load(log_dir + "best_val_checkpoint.tar",
                                    map_location='cpu')
        net.load_state_dict(checkpoint['model_state_dict'])
    print(net)

    # Compute number of network parameters
    nb_param = 0
    for param in net.parameters():
        nb_param += np.prod(list(param.data.size()))
    print('Number of parameters:', nb_param)

    # Create log directory, named by net name + timestamp
    tmp_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    tmp_time = '{}-{}'.format(netname, tmp_time)
    log_dir = f"./logs/{config.expt_name}/{tmp_time}/"
    os.makedirs(log_dir, exist_ok=True)
    json.dump(config, open(f"{log_dir}/config.json", "w"), indent=4)
    writer = SummaryWriter(log_dir)  # Define Tensorboard writer

    # Training parameters
    max_epochs = config.max_epochs
    val_every = config.val_every
    test_every = config.test_every
    learning_rate = config.learning_rate * lr_scale
    decay_rate = config.decay_rate
    num_patience = 0
    val_loss_old = 1e6  # For decaying LR based on validation loss
    val_loss_best = 1e6
    best_pred_tour_len = 1e6  # For saving checkpoints

    # Define optimizer
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
    print(optimizer)

    epoch_bar = master_bar(range(max_epochs))
    for epoch in epoch_bar:
        # Optionally resample the neighborhood size every var_neighbor
        # epochs; otherwise use the configured value.
        if random_neighbor:
            if epoch % var_neighbor == 0:
                num_neighbors = np.random.choice(config.num_neighbors)
        else:
            num_neighbors = config.num_neighbors
        writer.add_scalar('learning_rate', learning_rate, epoch)

        # Train
        train_time, train_loss, train_err_edges, train_err_tour, train_err_tsp, train_pred_tour_len, train_gt_tour_len = train_one_epoch(
            net, optimizer, config, epoch_bar, num_neighbors)
        epoch_bar.write('t: ' + metrics_to_str(
            epoch, train_time, learning_rate, train_loss, train_err_edges,
            train_err_tour, train_err_tsp, train_pred_tour_len,
            train_gt_tour_len, num_neighbors))
        writer.add_scalar('loss/train_loss', train_loss, epoch)
        writer.add_scalar('pred_tour_len/train_pred_tour_len',
                          train_pred_tour_len, epoch)
        writer.add_scalar('optimality_gap/train_opt_gap',
                          train_pred_tour_len / train_gt_tour_len - 1, epoch)

        if epoch % val_every == 0 or epoch == max_epochs - 1:
            # Validate
            val_time, val_loss, val_err_edges, val_err_tour, val_err_tsp, val_pred_tour_len, val_gt_tour_len = test(
                net,
                config,
                epoch_bar,
                mode='val',
                num_neighbors=num_neighbors)
            epoch_bar.write('v: ' + metrics_to_str(
                epoch, val_time, learning_rate, val_loss, val_err_edges,
                val_err_tour, val_err_tsp, val_pred_tour_len, val_gt_tour_len,
                num_neighbors))
            writer.add_scalar('loss/val_loss', val_loss, epoch)
            writer.add_scalar('pred_tour_len/val_pred_tour_len',
                              val_pred_tour_len, epoch)
            writer.add_scalar('optimality_gap/val_opt_gap',
                              val_pred_tour_len / val_gt_tour_len - 1, epoch)

            # Save checkpoint whenever the predicted tour length improves
            if val_pred_tour_len < best_pred_tour_len:
                best_pred_tour_len = val_pred_tour_len  # Update best prediction
                torch.save(
                    {
                        'epoch': epoch,
                        'model_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'train_loss': train_loss,
                        'val_loss': val_loss,
                    }, log_dir + "best_val_checkpoint_{}.tar".format(epoch))

            # Decay the learning rate when validation loss stops
            # improving by at least 1%
            if val_loss > 0.99 * val_loss_old:
                learning_rate /= decay_rate
                optimizer = update_learning_rate(optimizer, learning_rate)

            val_loss_old = val_loss  # Update old validation loss
            # Early Stopping bookkeeping
            if val_loss_best > val_loss:
                num_patience = 0
                val_loss_best = val_loss
            else:
                num_patience += 1

        # Save training checkpoint at the end of every epoch
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': train_loss,
                'val_loss': val_loss,
            }, log_dir + "last_train_checkpoint.tar")

        # Save checkpoint after every 250 epochs
        if epoch != 0 and (epoch % 250 == 0 or epoch == max_epochs - 1):
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': net.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'train_loss': train_loss,
                    'val_loss': val_loss,
                }, log_dir + f"checkpoint_epoch{epoch}.tar")
        if num_patience >= patience:
            break

    return net
Beispiel #13
0
 def begin_fit(self):
     """Set up the master progress bar and route trainer logging to it."""
     epochs_bar = master_bar(range(self.epochs))
     self.mbar = epochs_bar
     # Log lines render as table rows in the notebook output.
     self.trainer.logger = partial(epochs_bar.write, table=True)
Beispiel #14
0
def train_vcae(n_epochs,
               model,
               train_iterator,
               val_iterator,
               optimizer,
               device,
               criterion,
               save_best=True,
               verbose=True,
               is_nf=False,
               nf=None):
    """Train a (variational) convolutional autoencoder, optionally
    wrapped in a normalizing flow.

    Losses go to a TensorBoard writer and an on-disk log file; the best
    checkpoint (lowest validation loss) is saved when `save_best` is on.

    Returns:
        Path to the best model checkpoint.
    """
    prefix = 'NormalizingFlow' if is_nf else ''
    model_name = prefix + model.__class__.__name__
    writer, experiment_name, best_model_path = setup_experiment(model_name,
                                                                log_dir="./tb")

    mb = master_bar(range(n_epochs))

    history = {'train': [], 'val': []}
    best_val_loss = float('+inf')

    for epoch in mb:
        # One training pass and one validation pass per epoch; the
        # validation pass gets no optimizer.
        phase_losses = {}
        for phase, iterator, opt in (('train', train_iterator, optimizer),
                                     ('val', val_iterator, None)):
            phase_losses[phase] = run_epoch(model,
                                            iterator,
                                            opt,
                                            criterion,
                                            mb,
                                            phase=phase,
                                            epoch=epoch,
                                            writer=writer,
                                            is_nf=is_nf,
                                            nf=nf,
                                            device=device)
        train_loss = phase_losses['train']
        val_loss = phase_losses['val']

        # Persist epoch-level losses to the experiment log file.
        log_record = {'train_loss_mean': train_loss,
                      'test_loss_mean': val_loss}
        file_to_save_path = ''.join(
            [LOG_PATH, FILE_NAME, experiment_name, FILE_EXCITON])
        save_to_file(file_to_save_path, log_record)

        # Checkpoint on validation improvement.
        if save_best and (val_loss < best_val_loss):
            best_val_loss = val_loss
            save_model(nf if is_nf else model, best_model_path)

        if verbose:
            # Keep a running history for the live notebook plot.
            history['train'].append(train_loss)
            history['val'].append(val_loss)

            mb.main_bar.comment = f'EPOCHS, best_loss:{best_val_loss}'
            mb.child.comment = f"train_loss:{round(train_loss, 3)}, val_loss:{round(val_loss, 3)}"
            plot_loss_update(epoch, n_epochs, mb, history['train'],
                             history['val'])

    return best_model_path
Beispiel #15
0
def train(region):
    """Train an encoder/decoder forecaster for one region, save the
    weights under models/, and report RMSE and Pearson r on the test
    split.

    Returns:
        A summary string "<region> RMSE <rmse> r <r>".
    """
    np.random.seed(0)
    torch.manual_seed(0)

    # Hyperparameters (fixed for every region).
    input_len = 10
    encoder_units = 32
    decoder_units = 64
    encoder_rnn_layers = 3
    encoder_dropout = 0.2
    decoder_dropout = 0.2
    input_size = 2
    output_size = 1
    predict_len = 5
    batch_size = 16
    epochs = 500
    force_teacher = 0.8  # probability of teacher forcing at each decode step

    train_dataset, test_dataset, train_max, train_min = create_dataset(
        input_len, predict_len, region)
    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    test_loader = DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

    enc = Encoder(input_size, encoder_units, input_len,
                  encoder_rnn_layers, encoder_dropout)
    dec = Decoder(encoder_units*2, decoder_units, input_len,
                  input_len, decoder_dropout, output_size)

    optimizer = AdaBound(list(enc.parameters()) +
                         list(dec.parameters()), 0.01, final_lr=0.1)
    # optimizer = optim.Adam(list(enc.parameters()) + list(dec.parameters()), 0.01)
    criterion = nn.MSELoss()

    mb = master_bar(range(epochs))
    for ep in mb:
        train_loss = 0
        enc.train()
        dec.train()
        for encoder_input, decoder_input, target in progress_bar(train_loader, parent=mb):
            optimizer.zero_grad()
            enc_vec = enc(encoder_input)
            # Last-step encoder output seeds the decoder hidden state.
            h = enc_vec[:, -1, :]
            _, c = dec.initHidden(batch_size)
            x = decoder_input[:, 0]
            pred = []
            for pi in range(predict_len):
                x, h, c = dec(x, h, c, enc_vec)
                rand = np.random.random()
                pred += [x]
                # Teacher forcing: sometimes feed the ground-truth input
                # instead of the model's own prediction.
                if rand < force_teacher:
                    x = decoder_input[:, pi]
            pred = torch.cat(pred, dim=1)
            # loss = quantile_loss(pred, target)
            loss = criterion(pred, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Evaluation pass over the test loader (no teacher forcing).
        test_loss = 0
        enc.eval()
        dec.eval()
        for encoder_input, decoder_input, target in progress_bar(test_loader, parent=mb):
            with torch.no_grad():
                enc_vec = enc(encoder_input)
                h = enc_vec[:, -1, :]
                _, c = dec.initHidden(batch_size)
                x = decoder_input[:, 0]
                pred = []
                for pi in range(predict_len):
                    x, h, c = dec(x, h, c, enc_vec)
                    pred += [x]
                pred = torch.cat(pred, dim=1)
            # loss = quantile_loss(pred, target)
            loss = criterion(pred, target)
            test_loss += loss.item()
        print(
            f"Epoch {ep} Train Loss {train_loss/len(train_loader)} Test Loss {test_loss/len(test_loader)}")

    if not os.path.exists("models"):
        os.mkdir("models")
    torch.save(enc.state_dict(), f"models/{region}_enc.pth")
    torch.save(dec.state_dict(), f"models/{region}_dec.pth")

    # Re-run the test set one item at a time for metric computation.
    test_loader = DataLoader(test_dataset, batch_size=1,
                             shuffle=False, drop_last=False)

    rmse = 0
    p = 0  # horizon index collected below (only the first predicted step)
    predicted = []
    true_target = []
    enc.eval()
    dec.eval()
    # NOTE(review): unlike the loops above, h here comes from initHidden
    # rather than the encoder output — confirm this is intentional.
    for encoder_input, decoder_input, target in progress_bar(test_loader, parent=mb):
        with torch.no_grad():
            enc_vec = enc(encoder_input)
            x = decoder_input[:, 0]
            h, c = dec.initHidden(1)
            pred = []
            for pi in range(predict_len):
                x, h, c = dec(x, h, c, enc_vec)
                pred += [x]
            pred = torch.cat(pred, dim=1)
            predicted += [pred[0, p].item()]
            true_target += [target[0, p].item()]
    predicted = np.array(predicted).reshape(1, -1)
    # Undo the min-max normalization before computing metrics.
    predicted = predicted * (train_max - train_min) + train_min
    true_target = np.array(true_target).reshape(1, -1)
    true_target = true_target * (train_max - train_min) + train_min
    rmse, peasonr = calc_metric(predicted, true_target)
    print(f"{region} RMSE {rmse}")
    print(f"{region} r {peasonr[0]}")
    return f"{region} RMSE {rmse} r {peasonr[0]}"
Beispiel #16
0
def main():
    """Run the full K-fold training / ensembling pipeline for one experiment.

    For every fold listed in the experiment YAML this builds the train/valid
    datasets and loaders, trains a ``DenseNet201GRU2`` model with SGD and
    cosine-annealing warm restarts (snapshot ensembling), tracks the best
    weights by loss / 1-char accuracy / 3-char accuracy, computes a local CV
    score from the snapshot ensemble, and predicts on the test set.  Finally
    the per-fold test logits are averaged into a single submission file.

    Side effects: creates output directories and writes model weights, logs,
    tensorboard events and CSV submissions under ``param['save path']``.
    """
    now = datetime.datetime.now()
    now_date = '{}-{:0>2d}-{:0>2d}_{:0>2d}-{:0>2d}-{:0>2d}'.format(
        now.year, now.month, now.day, now.hour, now.minute, now.second)

    # set logger
    logger = logging.getLogger("Log")
    logger.setLevel(logging.DEBUG)

    handler_format = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s')

    print('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day, now.hour,
                                     now.minute, now.second))
    with open('../params/exp{}.yaml'.format(EXP_NO), "r+") as f:
        param = yaml.load(f, Loader=yaml.FullLoader)
    param['date'] = now_date
    # seed set
    seed_setting(param['seed'])
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True

    # Scale the batch size for multi-GPU training ONCE, before the fold loop.
    # Doing this inside the loop (as the original code did) compounds the
    # multiplier on every fold when param['GPU'] > 1.
    param['batch size'] = max(param['batch size'],
                              param['batch size'] * param['GPU'])

    local_cv = dict()

    for fold in param['fold']:
        # /mnt/hdd1/alcon2019/ + exp0/ + 2019-mm-dd_hh-mm-ss/ + foldN
        outdir = os.path.join(param['save path'], EXP_NAME, now_date,
                              'fold{}'.format(fold))
        if os.path.exists(param['save path']):
            os.makedirs(outdir, exist_ok=True)
        else:
            print("Not find {}".format(param['save path']))
            # Include the offending path in the exception instead of raising
            # the bare class.
            raise FileNotFoundError(param['save path'])

        file_handler = logging.FileHandler(
            os.path.join(outdir, 'experiment.log'))
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(handler_format)
        logger.addHandler(file_handler)

        logger.debug('=============   FOLD  {}  ============='.format(fold))
        logger.debug('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day,
                                                now.hour, now.minute,
                                                now.second))

        # Dataset: in debug mode only a small slice of the table is used so a
        # full pipeline run stays fast.
        if param['debug']:
            train_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query(
                    'valid != @fold').iloc[:param['batch size'] * 12],
                augmentation=get_train_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='train')

            valid_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query(
                    'valid == @fold').iloc[:param['batch size'] * 12],
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='valid')
        else:
            train_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query('valid != @fold'),
                augmentation=get_train_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='train',
                margin_augmentation=True)

            valid_dataset = AlconDataset(
                df=get_train_df(param['tabledir']).query('valid == @fold'),
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'train', 'imgs'),
                mode='valid',
                margin_augmentation=False)

        logger.debug('train dataset size: {}'.format(len(train_dataset)))
        logger.debug('valid dataset size: {}'.format(len(valid_dataset)))

        # Dataloader

        train_dataloader = DataLoader(train_dataset,
                                      batch_size=param['batch size'],
                                      num_workers=param['thread'],
                                      pin_memory=False,
                                      drop_last=False,
                                      shuffle=True)
        valid_dataloader = DataLoader(valid_dataset,
                                      batch_size=param['batch size'],
                                      num_workers=param['thread'],
                                      pin_memory=False,
                                      drop_last=False,
                                      shuffle=False)

        logger.debug('train loader size: {}'.format(len(train_dataloader)))
        logger.debug('valid loader size: {}'.format(len(valid_dataloader)))

        # model
        model = DenseNet201GRU2(num_classes=48,
                                hidden_size=512,
                                bidirectional=True,
                                load_weight=None,
                                dropout=param['dropout'])

        param['model'] = model.__class__.__name__

        # optim (lr here is only the base; the warm-restart scheduler below
        # drives the actual learning rate up to eta_max)
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=0.001,
                                    momentum=0.9,
                                    weight_decay=1e-5,
                                    nesterov=False)

        model = model.to(param['device'])
        # Mixed precision; amp must wrap model/optimizer before DataParallel.
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        if param['GPU'] > 0:
            model = nn.DataParallel(model)

        loss_fn = nn.CrossEntropyLoss().to(param['device'])
        eval_fn = accuracy_one_character

        writer = tbx.SummaryWriter("../log/exp{}/{}/fold{}".format(
            EXP_NO, now_date, fold))

        for key, val in param.items():
            writer.add_text('data/hyperparam/{}'.format(key), str(val), 0)

        # Global best-metric trackers for this fold (the original code
        # initialized these twice; a single initialization is kept).
        max_char_acc = -1e-5
        max_3char_acc = -1e-5
        min_loss = 1e+5

        # Per-snapshot best-metric trackers (snapshot ensembling).
        snapshot = 0
        snapshot_loss_list = list()
        snapshot_eval_list = list()
        snapshot_eval3_list = list()
        snapshot_loss = 1e+5
        snapshot_eval = -1e-5
        snapshot_eval3 = -1e-5
        # Validate roughly three times per epoch.
        val_iter = math.ceil(len(train_dataloader) / 3)
        print('val_iter: {}'.format(val_iter))
        # Hyper params
        cycle_iter = 5
        snap_start = 2
        n_snap = 8

        mb = master_bar(range((n_snap + snap_start) * cycle_iter))
        scheduler = CosineAnnealingWarmUpRestarts(optimizer,
                                                  T_0=len(train_dataloader) *
                                                  cycle_iter,
                                                  T_mult=1,
                                                  T_up=500,
                                                  eta_max=0.1)

        for epoch in mb:
            # At the start of every annealing cycle (after the warm-up
            # cycles), close out the previous snapshot and open a new one.
            if epoch % cycle_iter == 0 and epoch >= snap_start * cycle_iter:
                # `> 0` (not `> 1`): otherwise the first snapshot's stats
                # never reach the bookkeeping lists.
                if snapshot > 0:
                    snapshot_loss_list.append(snapshot_loss)
                    snapshot_eval_list.append(snapshot_eval)
                    snapshot_eval3_list.append(snapshot_eval3)
                snapshot += 1
                snapshot_loss = 10**5
                snapshot_eval = 0.0
                snapshot_eval3 = 0.0
            model.train()
            # Accumulators start at zero; the original initialized the loss
            # accumulator to 10**5, inflating the reported train loss.
            avg_train_loss = 0.0
            avg_train_accuracy = 0.0
            avg_three_train_acc = 0.0
            for step, (inputs, targets, indice) in enumerate(
                    progress_bar(train_dataloader, parent=mb)):
                model.train()
                inputs = inputs.to(param['device'])
                targets = targets.to(param['device'])
                optimizer.zero_grad()
                logits = model(inputs)  # logits.size() = (batch*3, 48)
                preds = logits.view(targets.size(0), 3, -1).softmax(dim=2)
                loss = loss_fn(logits,
                               targets.view(-1, targets.size(2)).argmax(dim=1))
                # Scale the loss for mixed-precision backprop.
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                optimizer.step()
                avg_train_loss += loss.item()
                _avg_accuracy = eval_fn(preds, targets.argmax(dim=2)).item()
                avg_train_accuracy += _avg_accuracy
                _three_char_accuracy = accuracy_three_character(
                    preds, targets.argmax(dim=2), mean=True).item()
                avg_three_train_acc += _three_char_accuracy
                writer.add_scalar("data/learning rate",
                                  scheduler.get_lr()[0],
                                  step + epoch * len(train_dataloader))
                # Per-iteration schedule (warm restarts), not per-epoch.
                scheduler.step()
                writer.add_scalars(
                    "data/metric/train", {
                        'loss': loss.item(),
                        'accuracy': _avg_accuracy,
                        '3accuracy': _three_char_accuracy
                    }, step + epoch * len(train_dataloader))
                # Mid-epoch validation every `val_iter` steps.
                if step % val_iter == 0 and step != 0:
                    avg_valid_loss, avg_valid_accuracy, avg_three_valid_acc = valid_alcon_rnn(
                        model, valid_dataloader, param['device'], loss_fn,
                        eval_fn)
                    writer.add_scalars(
                        "data/metric/valid", {
                            'loss': avg_valid_loss,
                            'accuracy': avg_valid_accuracy,
                            '3accuracy': avg_three_valid_acc
                        }, epoch)

                    logger.debug(
                        '======================== epoch {} | step {} ========================'
                        .format(epoch + 1, step + 1))
                    logger.debug('lr              : {:.5f}'.format(
                        scheduler.get_lr()[0]))
                    logger.debug(
                        'loss            : test={:.5f}'.format(avg_valid_loss))
                    logger.debug('acc(per 1 char) : test={:.3%}'.format(
                        avg_valid_accuracy))
                    logger.debug('acc(per 3 char) : test={:.3%}'.format(
                        avg_three_valid_acc))

                    if min_loss > avg_valid_loss:
                        logger.debug(
                            'update best loss:  {:.5f} ---> {:.5f}'.format(
                                min_loss, avg_valid_loss))
                        min_loss = avg_valid_loss
                        torch.save(model.state_dict(),
                                   os.path.join(outdir, 'best_loss.pth'))

                    if max_char_acc < avg_valid_accuracy:
                        logger.debug(
                            'update best acc per 1 char:  {:.3%} ---> {:.3%}'.
                            format(max_char_acc, avg_valid_accuracy))
                        max_char_acc = avg_valid_accuracy
                        torch.save(model.state_dict(),
                                   os.path.join(outdir, 'best_acc.pth'))

                    if max_3char_acc < avg_three_valid_acc:
                        logger.debug(
                            'update best acc per 3 char:  {:.3%} ---> {:.3%}'.
                            format(max_3char_acc, avg_three_valid_acc))
                        max_3char_acc = avg_three_valid_acc
                        torch.save(model.state_dict(),
                                   os.path.join(outdir, 'best_3acc.pth'))
                    # Same bookkeeping per snapshot (separate weight files).
                    if snapshot > 0:
                        if snapshot_loss > avg_valid_loss:
                            logger.debug(
                                '[snap] update best loss:  {:.5f} ---> {:.5f}'.
                                format(snapshot_loss, avg_valid_loss))
                            snapshot_loss = avg_valid_loss
                            torch.save(
                                model.state_dict(),
                                os.path.join(outdir,
                                             f'best_loss_{snapshot}.pth'))

                        if snapshot_eval < avg_valid_accuracy:
                            logger.debug(
                                '[snap] update best acc per 1 char:  {:.3%} ---> {:.3%}'
                                .format(snapshot_eval, avg_valid_accuracy))
                            snapshot_eval = avg_valid_accuracy
                            torch.save(
                                model.state_dict(),
                                os.path.join(outdir,
                                             f'best_acc_{snapshot}.pth'))

                        if snapshot_eval3 < avg_three_valid_acc:
                            logger.debug(
                                '[snap] update best acc per 3 char:  {:.3%} ---> {:.3%}'
                                .format(snapshot_eval3, avg_three_valid_acc))
                            snapshot_eval3 = avg_three_valid_acc
                            torch.save(
                                model.state_dict(),
                                os.path.join(outdir,
                                             f'best_3acc_{snapshot}.pth'))

            avg_train_loss /= len(train_dataloader)
            avg_train_accuracy /= len(train_dataloader)
            avg_three_train_acc /= len(train_dataloader)

            # End-of-epoch validation.
            avg_valid_loss, avg_valid_accuracy, avg_three_valid_acc = valid_alcon_rnn(
                model, valid_dataloader, param['device'], loss_fn, eval_fn)

            writer.add_scalars(
                "data/metric/valid", {
                    'loss': avg_valid_loss,
                    'accuracy': avg_valid_accuracy,
                    '3accuracy': avg_three_valid_acc
                }, epoch)

            logger.debug(
                '======================== epoch {} ========================'.
                format(epoch + 1))
            logger.debug('lr              : {:.5f}'.format(
                scheduler.get_lr()[0]))
            logger.debug(
                'loss            : train={:.5f}  , test={:.5f}'.format(
                    avg_train_loss, avg_valid_loss))
            logger.debug(
                'acc(per 1 char) : train={:.3%}  , test={:.3%}'.format(
                    avg_train_accuracy, avg_valid_accuracy))
            logger.debug(
                'acc(per 3 char) : train={:.3%}  , test={:.3%}'.format(
                    avg_three_train_acc, avg_three_valid_acc))

            # Checkpoint the model right when snapshotting begins.
            if epoch == cycle_iter * snap_start:
                torch.save(
                    model.state_dict(),
                    os.path.join(outdir,
                                 f'model_epoch_{cycle_iter * snap_start}.pth'))

            if min_loss > avg_valid_loss:
                logger.debug('update best loss:  {:.5f} ---> {:.5f}'.format(
                    min_loss, avg_valid_loss))
                min_loss = avg_valid_loss
                torch.save(model.state_dict(),
                           os.path.join(outdir, 'best_loss.pth'))

            if max_char_acc < avg_valid_accuracy:
                logger.debug(
                    'update best acc per 1 char:  {:.3%} ---> {:.3%}'.format(
                        max_char_acc, avg_valid_accuracy))
                max_char_acc = avg_valid_accuracy
                torch.save(model.state_dict(),
                           os.path.join(outdir, 'best_acc.pth'))

            if max_3char_acc < avg_three_valid_acc:
                logger.debug(
                    'update best acc per 3 char:  {:.3%} ---> {:.3%}'.format(
                        max_3char_acc, avg_three_valid_acc))
                max_3char_acc = avg_three_valid_acc
                torch.save(model.state_dict(),
                           os.path.join(outdir, 'best_3acc.pth'))
            if snapshot > 0:
                if snapshot_loss > avg_valid_loss:
                    logger.debug(
                        '[snap] update best loss:  {:.5f} ---> {:.5f}'.format(
                            snapshot_loss, avg_valid_loss))
                    snapshot_loss = avg_valid_loss
                    torch.save(
                        model.state_dict(),
                        os.path.join(outdir, f'best_loss_{snapshot}.pth'))

                if snapshot_eval < avg_valid_accuracy:
                    logger.debug(
                        '[snap] update best acc per 1 char:  {:.3%} ---> {:.3%}'
                        .format(snapshot_eval, avg_valid_accuracy))
                    snapshot_eval = avg_valid_accuracy
                    torch.save(
                        model.state_dict(),
                        os.path.join(outdir, f'best_acc_{snapshot}.pth'))

                if snapshot_eval3 < avg_three_valid_acc:
                    logger.debug(
                        '[snap] update best acc per 3 char:  {:.3%} ---> {:.3%}'
                        .format(snapshot_eval3, avg_three_valid_acc))
                    snapshot_eval3 = avg_three_valid_acc
                    torch.save(
                        model.state_dict(),
                        os.path.join(outdir, f'best_3acc_{snapshot}.pth'))

        # Close out the final snapshot.
        snapshot_loss_list.append(snapshot_loss)
        snapshot_eval_list.append(snapshot_eval)
        snapshot_eval3_list.append(snapshot_eval3)
        # NOTE(review): no global_step is passed here; tensorboardX accepts
        # None but the points are then placed by wall time — confirm intended.
        writer.add_scalars(
            "data/metric/valid", {
                'best loss': min_loss,
                'best accuracy': max_char_acc,
                'best 3accuracy': max_3char_acc
            })

        logger.debug('================  FINISH  TRAIN  ================')
        logger.debug('Result')
        logger.debug('Best loss : {}'.format(min_loss))
        logger.debug('Best 1 acc : {}'.format(max_char_acc))
        logger.debug('Best 3 acc : {}'.format(max_3char_acc))
        writer.export_scalars_to_json(os.path.join(outdir, 'history.json'))
        writer.close()

        # Local cv: ensemble the per-snapshot best-loss weights on the
        # validation fold.

        target_list = list()
        for _, targets, _ in valid_dataloader:
            targets = targets.argmax(dim=2)
            target_list.append(targets)
        target_list = torch.cat(target_list)

        mb = master_bar(range(n_snap))
        valid_logit_dict = dict()
        init = True
        for i in mb:
            model.load_state_dict(
                torch.load(os.path.join(outdir, f'best_loss_{i+1}.pth')))
            logit_alcon_rnn(model,
                            valid_dataloader,
                            param['device'],
                            valid_logit_dict,
                            div=n_snap,
                            init=init)
            init = False

        pred_list = torch.stack(list(valid_logit_dict.values()))
        pred_list = pred_list.softmax(dim=2)
        local_accuracy = accuracy_three_character(pred_list, target_list)
        logger.debug('LOCAL CV : {:5%}'.format(local_accuracy))
        torch.save(valid_logit_dict,
                   os.path.join(outdir, f'fold{fold}_valid_logit.pth'))

        local_cv['fold{}'.format(fold)] = {
            'accuracy': local_accuracy,
            'valid_size': len(valid_dataset)
        }

        # Free fold-local objects before building the test pipeline.
        del train_dataset, valid_dataset
        del train_dataloader, valid_dataloader
        del scheduler, optimizer
        del valid_logit_dict, target_list
        gc.collect()

        logger.debug('=========== Prediction phrase ===========')

        if param['debug']:
            test_dataset = AlconDataset(
                df=get_test_df(param['tabledir']).iloc[:param['batch size'] *
                                                       12],
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'test', 'imgs'),
                mode='test')
        else:
            test_dataset = AlconDataset(
                df=get_test_df(param['tabledir']),
                augmentation=get_test_augmentation(
                    *get_resolution(param['resolution'])),
                datadir=os.path.join(param['dataroot'], 'test', 'imgs'),
                mode='test')

        test_dataloader = DataLoader(test_dataset,
                                     batch_size=param['batch size'],
                                     num_workers=param['thread'],
                                     pin_memory=False,
                                     drop_last=False,
                                     shuffle=False)
        logger.debug('test dataset size: {}'.format(len(test_dataset)))
        logger.debug('test loader size: {}'.format(len(test_dataloader)))

        # Average the test logits over the snapshot ensemble.
        test_logit_dict = dict()
        init = True
        for i in range(n_snap):
            logger.debug('load weight  :  {}'.format(
                os.path.join(outdir, f'best_loss_{i+1}.pth')))
            model.load_state_dict(
                torch.load(os.path.join(outdir, f'best_loss_{i+1}.pth')))
            logit_alcon_rnn(model,
                            test_dataloader,
                            param['device'],
                            test_logit_dict,
                            div=n_snap,
                            init=init)
            init = False

        torch.save(test_logit_dict, os.path.join(outdir, 'prediction.pth'))
        output_list = make_submission(test_logit_dict)
        pd.DataFrame(output_list).sort_values('ID').set_index('ID').to_csv(
            os.path.join(outdir, 'test_prediction.csv'))
        logger.debug('success!')
        logger.removeHandler(file_handler)

        del test_dataset, test_dataloader
        gc.collect()

    # Ensemble: average per-fold test logits into one prediction per ID.
    print('======== Ensemble phase =========')
    emsemble_prediction = dict()
    mb = master_bar(param['fold'])

    print('======== Load Vector =========')
    for i, fold in enumerate(mb):
        outdir = os.path.join(param['save path'], EXP_NAME, now_date,
                              'fold{}'.format(fold))
        prediction = torch.load(os.path.join(outdir, 'prediction.pth'))
        # prediction maps ID -> logit tensor.
        if i == 0:
            for ID, logit in progress_bar(prediction.items(), parent=mb):
                emsemble_prediction[ID] = logit / len(param['fold'])
        else:
            for ID, logit in progress_bar(prediction.items(), parent=mb):
                emsemble_prediction[ID] += logit / len(param['fold'])

    outdir = os.path.join(param['save path'], EXP_NAME, now_date)

    # Summarize the weighted local CV over folds into result.log.
    file_handler = logging.FileHandler(os.path.join(outdir, 'result.log'))
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(handler_format)
    logger.addHandler(file_handler)
    logger.info(' ==========  RESULT  ========== \n')

    cv = 0.0
    train_data_size = 0
    for fold in param['fold']:
        acc = local_cv['fold{}'.format(fold)]['accuracy']
        valid_size = local_cv['fold{}'.format(fold)]['valid_size']
        train_data_size += valid_size
        logger.info(' fold {} :  {:.3%} \n'.format(fold, acc))
        cv += acc * valid_size
    logger.info(' Local CV : {:.3%} \n'.format(cv / train_data_size))
    logger.info(' ============================== \n')

    logger.removeHandler(file_handler)

    torch.save(emsemble_prediction, os.path.join(outdir, 'prediction.pth'))

    print('======== make submittion file =========')

    submit_list = make_submission(emsemble_prediction)
    pd.DataFrame(submit_list).sort_values('ID').set_index('ID').to_csv(
        os.path.join(outdir, 'test_prediction.csv'))

    print('success!')
Beispiel #17
0
    def calculate_map(self,
                      model: nn.Module,
                      conf_threshold: float = 0.01,
                      iou_threshold: float = 0.50,
                      same_threshold: float = 0.45,
                      max_preds: int = 200,
                      plot: bool = True,
                      use_gpu: bool = True) -> float:
        """Compute the mean average precision (mAP) achieved by a model on
        this dataset.

        Args:
            model: detection network; called as ``model(imgs)``, its output is
                decoded per image by ``nms`` with ``self.matcher.default_boxes``.
            conf_threshold: minimum confidence for a prediction to be kept.
            iou_threshold: minimum IoU for a prediction to match a ground-truth
                box.
            same_threshold: IoU threshold used by NMS to suppress duplicate
                predictions.
            max_preds: maximum number of predictions kept per image.
            plot: if True, show a precision/recall curve per class.
            use_gpu: if True, move input batches to the GPU.

        Returns:
            The mAP: mean over classes of the average precision computed from
            the interpolated precision/recall curve.
        """
        model.eval()

        # NOTE(review): batch_size and num_workers are hard-coded here —
        # confirm they suit the deployment machine.
        dl = DataLoader(self, batch_size=32, drop_last=False, num_workers=32)

        # First, populate a dictionary with class ids as keys, and tuples of
        # the form (confidence, is_correct, bbox) for each bounding box
        # predicted for that class
        predictions = defaultdict(list)
        num_targs = defaultdict(
            int)  # Dict with number of ground truth boxes for each class

        mb = master_bar(dl)
        for imgs, targs in mb:
            if use_gpu: imgs = imgs.cuda()
            preds = model(imgs).cpu()
            for pred, targ in progress_bar(list(zip(preds, targs)), parent=mb):
                # Process Targets: decode the encoded target tensor back into
                # boxes/classes, dropping near-duplicate ground-truth boxes.
                targ_boxes, targ_classes, _ = tensor2boxes(
                    self.matcher.default_boxes, targ)
                targ_boxes, filtered_idxs = filter_overlapping_boxes(
                    targ_boxes, iou_threshold=0.95)
                targ_classes = targ_classes[filtered_idxs]

                # Count ground-truth boxes per class (denominator for recall).
                for targ_class in targ_classes:
                    num_targs[targ_class.item()] += 1

                # Process Predictions
                pred_boxes, pred_classes, pred_confs = nms(
                    pred, self.matcher.default_boxes, conf_threshold,
                    same_threshold, max_preds)

                # Match Prediction to Targets for each Class.  Each target may
                # be matched at most once (greedy, via `matched_targs`).
                matched_targs = set()
                for pred_box, pred_class, pred_conf in zip(
                        pred_boxes, pred_classes, pred_confs):
                    # Indices of targets in the same class as the current prediction
                    same_classes = (targ_classes == pred_class).float()
                    same_class_idxs = set(
                        same_classes.nonzero().flatten().numpy())

                    # Indices of targets that overlap sufficiently with the current prediction
                    overlaps = jaccard_overlap(pred_box, targ_boxes)
                    above_thresholds = (overlaps > iou_threshold).float()
                    above_threshold_idxs = set(
                        above_thresholds.nonzero().flatten().numpy())

                    # Indices of targets that are both in the same class and overlap sufficiently
                    # with the current prediction
                    valid_idxs = same_class_idxs.intersection(
                        above_threshold_idxs)

                    # Target indices in order of decreasing overlap with the current prediction
                    valid_idxs = list(valid_idxs)
                    valid_idxs.sort(key=lambda idx: overlaps[idx],
                                    reverse=True)
                    valid_idxs = [
                        idx for idx in valid_idxs if idx not in matched_targs
                    ]

                    # Greedily claim the best-overlapping unmatched target.
                    pred_box_matched = False
                    if len(valid_idxs):
                        targ_idx = valid_idxs[0]
                        matched_targs.add(targ_idx)
                        pred_box_matched = True

                    pred_conf = pred_conf.item()
                    pred_box = pred_box.detach().cpu().numpy().tolist()
                    predictions[pred_class].append(
                        (pred_conf, pred_box_matched, pred_box))

        # Calculate Average Precision for each Class
        all_classes = set(num_targs.keys()).union(predictions.keys())

        avg_precisions = []
        for class_idx in all_classes:
            # Start with every ground-truth box counted as a false negative.
            tps, fps, fns = 0, 0, num_targs[class_idx]
            if fns == 0:
                # Class has predictions but no ground truth; scored as AP = 1.
                avg_precisions.append(1)
                continue

            precisions, recalls = [], []

            # Sort Predictions in order of decreasing confidence
            class_preds = predictions[class_idx]
            class_preds = [(conf, is_correct)
                           for conf, is_correct, _ in class_preds]
            class_preds.sort(
                reverse=True)  # Sort in order of decreasing confidence

            # Sweep the confidence threshold: each prediction flips one
            # counter and yields a (precision, recall) point.
            for _, is_correct in class_preds:
                if is_correct:
                    tps += 1
                    fns -= 1
                else:
                    fps += 1

                precision = tps / (tps + fps) if tps + fps > 0 else 0
                recall = tps / (tps + fns) if tps + fns > 0 else 0

                # Keep only one point per distinct recall value.
                if not (recalls and recalls[-1] == recall):
                    precisions.append(precision)
                    recalls.append(recall)

            # Interpolated precision: at each recall, the max precision at any
            # recall level at or beyond it (standard PASCAL-style smoothing).
            precisions_adj = [
                max(precisions[idx:]) for idx in range(len(precisions))
            ]

            # Integrate precision over recall (rectangle rule).
            avg_precision = 0
            for idx, precision in enumerate(precisions_adj[:-1]):
                increment = recalls[idx + 1] - recalls[idx]
                avg_precision += precision * increment

            print(
                f"\nAP for {self.categories[class_idx].capitalize()}: {round(avg_precision, 4)}"
            )

            if plot:
                plt.plot(recalls, precisions_adj)
                plt.title(self.categories[class_idx].capitalize())
                plt.xlabel("Recall")
                plt.ylim(0, 1)
                plt.xlim(0, 1)
                plt.ylabel("Precision")
                plt.show()

            avg_precisions.append(avg_precision)

        mean_avg_precision = np.mean(avg_precisions)
        return mean_avg_precision
Beispiel #18
0
    def train(self, train_df, target_df):
        """Train one model per CV fold and assemble out-of-fold predictions.

        For each column of ``self.fold_df`` (one column per fold) the rows
        flagged 0 are used for training and rows flagged > 0 for validation.
        The epoch with the best validation score determines the saved weights
        and the fold's contribution to the OOF matrix and the CV score.

        Args:
            train_df: full training feature table; indexed consistently with
                ``self.fold_df``.
            target_df: targets aligned with ``train_df``.

        Returns:
            The weighted CV score (per-fold best scores weighted by the fold
            column's max value).  Also sets ``self.oof`` as a side effect and
            writes weight/plot files under ``../logs/{run_name}/``.
        """
        oof = np.zeros((len(train_df), self.cfg.model.n_classes))
        cv = 0

        for fold_, col in enumerate(self.fold_df.columns):
            print(
                f'\n========================== FOLD {fold_} ... ==========================\n'
            )
            logging.debug(
                f'\n========================== FOLD {fold_} ... ==========================\n'
            )

            # Split rows by the fold flag: 0 -> train, > 0 -> validation.
            trn_x, val_x = train_df[self.fold_df[col] == 0], train_df[
                self.fold_df[col] > 0]
            val_y = target_df[self.fold_df[col] > 0].values

            train_loader = factory.get_dataloader(trn_x, self.cfg.data.train)
            valid_loader = factory.get_dataloader(val_x, self.cfg.data.valid)

            model = factory.get_nn_model(self.cfg).to(device)

            criterion = factory.get_loss(self.cfg)
            optimizer = factory.get_optim(self.cfg, model.parameters())
            scheduler = factory.get_scheduler(self.cfg, optimizer)

            best_epoch = -1
            best_val_score = -np.inf
            mb = master_bar(range(self.cfg.model.epochs))

            train_loss_list = []
            val_loss_list = []
            val_score_list = []

            for epoch in mb:
                start_time = time.time()

                model, avg_loss = self._train_epoch(model, train_loader,
                                                    criterion, optimizer, mb)

                valid_preds, avg_val_loss = self._val_epoch(
                    model, valid_loader, criterion)

                val_score = factory.get_metrics(self.cfg.common.metrics.name)(
                    val_y, valid_preds)

                train_loss_list.append(avg_loss)
                val_loss_list.append(avg_val_loss)
                val_score_list.append(val_score)

                # ReduceLROnPlateau needs the monitored metric; every other
                # scheduler steps unconditionally once per epoch.
                if self.cfg.scheduler.name != 'ReduceLROnPlateau':
                    scheduler.step()
                elif self.cfg.scheduler.name == 'ReduceLROnPlateau':
                    scheduler.step(avg_val_loss)

                elapsed = time.time() - start_time
                mb.write(
                    f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} time: {elapsed:.0f}s'
                )
                logging.debug(
                    f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} time: {elapsed:.0f}s'
                )

                # Keep the state/predictions from the best-scoring epoch.
                if val_score > best_val_score:
                    best_epoch = epoch + 1
                    best_val_score = val_score
                    best_valid_preds = valid_preds
                    # Under DataParallel the raw module's state dict is saved
                    # so the weights load without the 'module.' prefix.
                    if self.cfg.model.multi_gpu:
                        best_model = model.module.state_dict()
                    else:
                        best_model = model.state_dict()

            oof[val_x.index, :] = best_valid_preds
            cv += best_val_score * self.fold_df[col].max()

            torch.save(best_model,
                       f'../logs/{self.run_name}/weight_best_{fold_}.pt')
            self._save_loss_png(train_loss_list, val_loss_list, val_score_list,
                                fold_)

            print(f'\nEpoch {best_epoch} - val_score: {best_val_score:.4f}')
            logging.debug(
                f'\nEpoch {best_epoch} - val_score: {best_val_score:.4f}')

        print('\n\n===================================\n')
        print(f'CV: {cv:.6f}')
        logging.debug(f'\n\nCV: {cv:.6f}')
        print('\n===================================\n\n')

        # NOTE(review): hard-coded 5 columns — assumes cfg.model.n_classes is
        # a multiple of 5; confirm against the model config.
        self.oof = oof.reshape(-1, 5)

        return cv
Beispiel #19
0
def get_opt_rf_params(x_trn:np.ndarray, y_trn:np.ndarray, x_val:np.ndarray, y_val:np.ndarray, objective:str,
                      w_trn:Optional[np.ndarray]=None, w_val:Optional[np.ndarray]=None,
                      params:Optional[OrderedDict]=None, n_estimators:int=40, verbose=True) \
        -> Tuple[Dict[str,float],Union[RandomForestRegressor,RandomForestClassifier]]:
    r'''
    Use an ordered parameter-scan to roughly optimise Random Forest hyper-parameters.

    Arguments:
        x_trn: training input data
        y_trn: training target data
        x_val: validation input data
        y_val: validation target data
        objective: string representation of objective: either 'classification' or 'regression'
        w_trn: training weights
        w_val: validation weights
        params: ordered dictionary mapping parameters to optimise to list of values to consider
        n_estimators: number of trees to use in each forest
        verbose: Print extra information and show a live plot of model performance

    Returns:
        params: dictionary mapping parameters to their optimised values
        rf: best performing Random Forest
    '''
    # Default scan order matters: each parameter is frozen at its best value
    # before the next one is scanned.
    if params is None:
        params = OrderedDict({
            'min_samples_leaf': [1, 3, 5, 10, 25, 50, 100],
            'max_features': [0.3, 0.5, 0.7, 0.9]
        })
    # Pick the estimator class from the objective string.
    rf = RandomForestClassifier if 'class' in objective.lower(
    ) else RandomForestRegressor
    best_params = {
        'n_estimators': n_estimators,
        'n_jobs': -1,
        'max_features': 'sqrt'
    }
    best_scores = []  # running best validation score (monotone non-decreasing)
    scores = []       # raw validation score of every candidate model
    mb = master_bar(params)
    mb.names = ['Best', 'Scores']
    if verbose: mb.update_graph([[[], []], [[], []]])
    for param in mb:
        pb = progress_bar(params[param], parent=mb)
        pb.comment = f'{param} = {params[param][0]}'
        for i, value in enumerate(pb):
            # Bar comment previews the *next* value to be tried.
            pb.comment = f'{param} = {params[param][min(i+1, len(params[param])-1)]}'
            m = rf(**{**best_params, param: value})
            m.fit(X=x_trn, y=y_trn, sample_weight=w_trn)
            scores.append(m.score(X=x_val, y=y_val, sample_weight=w_val))
            # First fit always becomes the incumbent, so best_m is defined.
            if len(best_scores) == 0 or scores[-1] > best_scores[-1]:
                best_scores.append(scores[-1])
                best_params[param] = value
                if verbose:
                    # Fixed typo in user-facing message: "schieved" -> "achieved".
                    print(
                        f'Better score achieved: {param} @ {value} = {best_scores[-1]:.4f}'
                    )
                best_m = m
            else:
                best_scores.append(best_scores[-1])
            if verbose:
                mb.update_graph([[range(len(best_scores)), best_scores],
                                 [range(len(scores)), scores]])

    if verbose: delattr(mb, 'fig')
    if verbose: plt.clf()
    return best_params, best_m
Beispiel #20
0
 def begin_fit(self, e: Event):
     """Initialise a fastprogress master bar before training starts.

     Creates a master_bar spanning all epochs, opens its display, and
     redirects the learner's logger so log lines are written as table
     rows into the bar.
     """
     self.mbar = master_bar(range(e.learn.epochs))
     self.mbar.on_iter_begin()
     e.learn.logger = partial(self.mbar.write, table=True)
Beispiel #21
0
def run_training(model, optimizer, loss_function, device, num_epochs,
                 train_dataloader, val_dataloader, early_stopper=None,
                 verbose=False):
    """Run model training.

    Args:
        model (nn.Module): Torch model to train
        optimizer: Torch optimizer object
        loss_function: Torch loss function for training
        device (torch.device): Torch device to use for training
        num_epochs (int): Max. number of epochs to train
        train_dataloader (DataLoader): Torch DataLoader object to load the
            training data
        val_dataloader (DataLoader): Torch DataLoader object to load the
            validation data
        early_stopper (EarlyStopper, optional): If passed, model will be trained
            with early stopping. Defaults to None.
        verbose (bool, optional): Print information about model training.
            Defaults to False.

    Returns:
        list, list, list, list, torch.Tensor: Lists of train losses,
            validation losses, train accuracies, validation accuracies per
            epoch, and the confusion matrix from the last validated epoch
            (None if num_epochs == 0).
    """
    start_time = time.time()
    master_bar = fastprogress.master_bar(range(num_epochs))
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    # Bug fix: pre-bind so the final `return` cannot raise NameError when
    # num_epochs == 0 and the loop body never executes.
    confusion_matrix = None

    for epoch in master_bar:
        # Train the model for one epoch.
        epoch_train_loss, epoch_train_acc = train(train_dataloader, optimizer,
                                                  model, loss_function, device,
                                                  master_bar)
        # Validate the model.
        epoch_val_loss, epoch_val_acc, confusion_matrix = validate(
            val_dataloader, model, loss_function, device, master_bar)

        # Save loss and acc for plotting.
        train_losses.append(epoch_train_loss)
        val_losses.append(epoch_val_loss)
        train_accs.append(epoch_train_acc)
        val_accs.append(epoch_val_acc)

        if verbose:
            master_bar.write(f'Train loss: {epoch_train_loss:.2f}, val loss: {epoch_val_loss:.2f}, train acc: {epoch_train_acc:.3f}, val acc {epoch_val_acc:.3f}')

        if early_stopper:
            # Track validation accuracy; restore the best checkpoint and stop
            # once accuracy has not improved within the stopper's patience.
            early_stopper.update(epoch_val_acc, model)
            if early_stopper.early_stop:
                early_stopper.load_checkpoint(model)
                print("Early stopping, since the validation accuracy did not increase. Epoch: {}".format(epoch))
                break

    time_elapsed = np.round(time.time() - start_time, 0).astype(int)
    print(f'Finished training after {time_elapsed} seconds.')
    return train_losses, val_losses, train_accs, val_accs, confusion_matrix
Beispiel #22
0
# NOTE(review): free-standing script fragment — depends on names defined
# earlier in the original script (hr_mito, datasources, valid_files,
# train_files, valid_pct, hr_path, lr_path, lr_up_path, czi_to_multiframe,
# random, master_bar). Verify they are in scope before running.
mito_train = [fn for fn in hr_mito]

neuron_path = datasources / 'live_neuron_mito_timelapse_for_deep_learning'
two_channel = list(neuron_path.glob('*MTGreen*.czi'))
one_channel = [x for x in neuron_path.glob('*.czi') if x not in two_channel]

airyscan_path = datasources / 'Airyscan_processed_data_from_the_server'
hr_airyscan = list(airyscan_path.glob('*.czi'))

# Route the '03-Airyscan' acquisitions into the validation split.
for fn in hr_mito:
    if '03-Airyscan' in fn.stem: valid_files.append(fn)
    else: train_files.append(fn)

for lst in [hr_airyscan, one_channel, two_channel]:
    # presumably sorted first so the shuffle is deterministic under a fixed
    # random seed — TODO confirm a seed is set by the caller
    lst.sort()
    random.shuffle(lst)
    split_idx = int(valid_pct * len(lst))
    # NOTE(review): if valid_pct * len(lst) < 1, split_idx == 0 and
    # lst[-0:] is the WHOLE list — everything goes to valid_files and
    # nothing to train_files. Verify the lists are large enough.
    print(split_idx)
    valid_files += lst[-split_idx:]
    train_files += lst[:-split_idx]
    # NOTE(review): this copy/crappify loop is nested INSIDE the split loop,
    # so the accumulated train/valid file lists are re-processed after every
    # list is split — possibly an indentation error; confirm intent.
    for subdir, file_list in [('train', train_files), ('valid', valid_files)]:
        print(f'\n\ncopy, crappify and upsample {subdir} files\n\n')
        pbar = master_bar(file_list)
        for czi_fn in pbar:
            czi_to_multiframe(czi_fn,
                              hr_path / subdir,
                              lr_path / subdir,
                              lr_up_path / subdir,
                              pbar=pbar)
Beispiel #23
0
def train_old(model,
              optimizer,
              criterion,
              dataset,
              batch_size,
              num_epochs,
              num_workers=0,
              half=True):
    """Train and validate `model` for `num_epochs` epochs with a StepLR schedule.

    Args:
        model: torch model to train (assumed already on the global `device`).
        optimizer: torch optimizer updating `model`'s parameters.
        criterion: loss function taking (outputs, labels).
        dataset: dict with 'train' and 'val' torch Datasets.
        batch_size (int): mini-batch size for both loaders.
        num_epochs (int): number of epochs to run.
        num_workers (int): DataLoader worker processes. Defaults to 0.
        half (bool): if True, cast inputs to half precision. Defaults to True.

    Side effects:
        Appends per-epoch loss/accuracy to the module-level `metrics` dict,
        updates a fastprogress graph, and prints a summary at the end.

    NOTE(review): relies on module-level globals `device` and `metrics`
    (not defined in this function) — confirm they exist at call time.
    """
    dataloader = {
        x: torch.utils.data.DataLoader(dataset[x],
                                       batch_size=batch_size,
                                       shuffle=True,
                                       num_workers=num_workers)
        for x in ['train', 'val']
    }
    dataset_sizes = {x: len(dataset[x]) for x in ['train', 'val']}

    # Decay LR by a factor of 0.1 every 7 epochs
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=7,
                                                gamma=0.1)

    since = time.time()
    best_acc = 0.0

    mb = master_bar(range(num_epochs))
    mb.names = ['train', 'val']
    mb.write("Epoch\tTrn_loss\tVal_loss\tTrn_acc\t\tVal_acc")
    # Iterate epochs
    for epoch in mb:

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data
            for inputs, labels in progress_bar(dataloader[phase], parent=mb):
                inputs = inputs.to(device)
                labels = labels.to(device)
                if half: inputs = inputs.half()

                optimizer.zero_grad()  # zero the parameter gradients
                outputs = model(inputs)  # forward
                preds = torch.argmax(outputs, dim=1)  # prediction
                loss = criterion(outputs, labels)  # loss
                if phase == 'train': loss.backward()  # backward
                if phase == 'train': optimizer.step()  # optimize

                # statistics: multiply by batch size because the default
                # nn.CrossEntropyLoss averages over the batch
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            metrics[phase]["loss"].append(epoch_loss)
            metrics[phase]["acc"].append(epoch_acc)

            # track the best validation accuracy seen so far
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc

        # Bug fix: step the LR schedule once per epoch AFTER the epoch's
        # optimizer.step() calls. PyTorch >= 1.1 requires this ordering;
        # the old call at the start of the train phase shifted the StepLR
        # schedule by one epoch and triggered a UserWarning.
        scheduler.step()

        x = list(range(len(metrics["train"]["acc"])))
        graphs = [[x, metrics["train"]["acc"]], [x, metrics["val"]["acc"]]]
        mb.update_graph(graphs)
        mb.write("{}/{}\t{:06.6f}\t{:06.6f}\t{:06.6f}\t{:06.6f}".format(
            epoch + 1, num_epochs, metrics["train"]["loss"][-1],
            metrics["val"]["loss"][-1], metrics["train"]["acc"][-1],
            metrics["val"]["acc"][-1]))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
Beispiel #24
0
def main(args):
    """Fine-tune a torchvision classifier on the OpenFire dataset.

    Builds transforms/datasets/loaders from CLI args, replaces the model's
    final fully-connected layer, trains with Adam + OneCycleLR, and saves a
    checkpoint whenever validation loss improves.
    """

    if args.deterministic:
        set_seed(42)

    # Set device
    if args.device is None:
        if torch.cuda.is_available():
            args.device = 'cuda:0'
        else:
            args.device = 'cpu'

    # Standard ImageNet channel statistics.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # Training augmentation: random crop/flip/rotation.
    train_transforms = transforms.Compose([
        transforms.RandomResizedCrop((args.resize, args.resize)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),
        transforms.ToTensor(), normalize
    ])

    # Evaluation: deterministic resize only.
    test_transforms = transforms.Compose([
        transforms.Resize((args.resize, args.resize)),
        transforms.ToTensor(), normalize
    ])

    # Train & test sets
    train_set = OpenFire(root=args.data_path,
                         train=True,
                         download=True,
                         valid_pct=0.2,
                         transform=train_transforms)
    val_set = OpenFire(root=args.data_path,
                       train=False,
                       download=True,
                       valid_pct=0.2,
                       transform=test_transforms)
    num_classes = len(train_set.classes)
    # Samplers: shuffled train, sequential eval.
    train_sampler = torch.utils.data.RandomSampler(train_set)
    test_sampler = torch.utils.data.SequentialSampler(val_set)

    # Data loader
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               sampler=train_sampler,
                                               num_workers=args.workers,
                                               pin_memory=True)
    test_loader = torch.utils.data.DataLoader(val_set,
                                              batch_size=args.batch_size,
                                              sampler=test_sampler,
                                              num_workers=args.workers,
                                              pin_memory=True)

    # Model definition: look up the architecture by name.
    model = torchvision.models.__dict__[args.model](pretrained=args.pretrained)

    # Replace the final fc layer to match the dataset's class count.
    in_features = getattr(model, 'fc').in_features
    setattr(model, 'fc', nn.Linear(in_features, num_classes))
    model.to(args.device)

    # Loss function
    criterion = nn.CrossEntropyLoss()

    # optimizer
    # NOTE(review): no lr= is passed to Adam (its default applies at
    # construction); the OneCycleLR scheduler below drives the actual
    # learning rate from args.lr — confirm this is intended.
    optimizer = optim.Adam(model.parameters(),
                           betas=(0.9, 0.99),
                           weight_decay=args.weight_decay)

    # Scheduler: one-cycle policy over the full training run.
    lr_scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=args.lr,
        epochs=args.epochs,
        steps_per_epoch=len(train_loader),
        cycle_momentum=(not isinstance(optimizer, optim.Adam)),
        div_factor=args.div_factor,
        final_div_factor=args.final_div_factor)

    best_loss = math.inf
    mb = master_bar(range(args.epochs))
    for epoch_idx in mb:
        # Training
        train_loss = train_epoch(model,
                                 train_loader,
                                 optimizer,
                                 criterion,
                                 master_bar=mb,
                                 epoch=epoch_idx,
                                 scheduler=lr_scheduler,
                                 device=args.device)

        # Evaluation
        val_loss, acc = evaluate(model,
                                 test_loader,
                                 criterion,
                                 device=args.device)

        mb.first_bar.comment = f"Epoch {epoch_idx+1}/{args.epochs}"
        mb.write(
            f'Epoch {epoch_idx+1}/{args.epochs} - Training loss: {train_loss:.4} | Validation loss: {val_loss:.4} | Error rate: {1 - acc:.4}'
        )

        # State saving: checkpoint on validation-loss improvement.
        if val_loss < best_loss:
            print(
                f"Validation loss decreased {best_loss:.4} --> {val_loss:.4}: saving state..."
            )
            best_loss = val_loss
            if args.output_dir:
                torch.save(
                    dict(model=model.state_dict(),
                         optimizer=optimizer.state_dict(),
                         lr_scheduler=lr_scheduler.state_dict(),
                         epoch=epoch_idx,
                         args=args),
                    Path(args.output_dir, f"{args.checkpoint}.pth"))
def experiment_blue_print(
        output_root_dir=None,
        cv_run_num=None,
        ds_train_name=None,
        ds_test_name=None,
        ds_normalization=None,
        num_train_samples=None,
        num_augmentations=None,
        type_augmentation=None,
        num_intra_samples=None,
        model_name=None,
        batch_size=None,
        num_epochs=None,
        cls_loss_fn=None,
        lr_init=None,
        w_top_loss=None,
        top_scale=None,
        weight_decay_cls=None,
        weight_decay_feat_ext=None,
        normalize_gradient=None,
        pers_type=None,
        compute_persistence=None,
        track_model=None,
        tag=''):
    """Run a cross-validated training experiment with an optional
    topological (persistence-based) regularizer.

    All keyword arguments default to None so that a missing setting fails
    fast (see the assertion below). Results, logged values, and model
    weights are written under output_root_dir/<experiment id>.
    """

    args = dict(locals())
    print(args)
    # Fail fast if any experiment setting was left unset.
    if not all(((v is not None) for k, v in args.items())):
        s = ', '.join((k for k, v in args.items() if v is None))
        raise AssertionError("Some kwargs are None: {}!".format(s))

    # A positive topology-loss weight is meaningless without persistence.
    if w_top_loss > 0 and not compute_persistence:
        raise AssertionError('w_top_loss > 0 and compute_persistence == False')

    exp_id = get_experiment_id(tag)
    output_dir = Path(output_root_dir) / exp_id
    output_dir.mkdir()

    logger = ExperimentLogger(output_dir, args)

    track_accuracy = True

    """
    Get the splits for the training data.
    """
    DS_TRAIN_ORIGINAL_SPLITS = ds_factory_stratified_shuffle_split(
        ds_train_name, num_train_samples)
    DS_TEST_ORIGINAL = ds_factory(ds_test_name)
    assert len(DS_TRAIN_ORIGINAL_SPLITS) >= cv_run_num
    DS_TRAIN_ORIGINAL_SPLITS = DS_TRAIN_ORIGINAL_SPLITS[:cv_run_num]

    pers_fn = persistence_fn_factory(args['pers_type'])
    cls_loss_fn = cls_loss_fn_factory(args['cls_loss_fn'])

    """
    Run over the dataset splits; the splits are fixed for each number of
    training samples (500,1000,4000, etc.)
    """
    for run_i, DS_TRAIN_ORIGINAL in enumerate(DS_TRAIN_ORIGINAL_SPLITS):

        assert len(DS_TRAIN_ORIGINAL) == num_train_samples

        logger.new_run()

        dl_train, DS_TRAIN, DS_TEST, num_classes = setup_data_for_training(
            ds_train_original=DS_TRAIN_ORIGINAL,
            ds_test_original=DS_TEST_ORIGINAL,
            ds_normalization=ds_normalization,
            type_augmentation=type_augmentation,
            num_augmentations=num_augmentations,
            num_intra_samples=num_intra_samples,
            batch_size=batch_size
        )

        model = model_factory(model_name, num_classes)
        model = model.to(DEVICE)
        print(model)

        # Separate weight decay for feature extractor vs. classifier head.
        opt = torch.optim.SGD(
            [
                {'params': model.feat_ext.parameters(
                ), 'weight_decay': weight_decay_feat_ext},
                {'params': model.cls.parameters(), 'weight_decay': weight_decay_cls}
            ],
            lr=lr_init,
            momentum=0.9,
            nesterov=True)

        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            opt,
            T_max=num_epochs,
            eta_min=0,
            last_epoch=-1)

        mb = master_bar(range(num_epochs))
        mb_comment = ''

        for epoch_i in mb:

            model.train()
            epoch_loss = 0

            # zip with the progress bar range caps the epoch at L batches.
            L = len(dl_train)-1
            for b_i, ((batch_x, batch_y), _) in enumerate(zip(dl_train, progress_bar(range(L), parent=mb))):

                # Each element of batch_x is expected to hold
                # num_intra_samples * num_augmentations samples.
                n = batch_x[0].size(0)
                assert n == num_intra_samples*num_augmentations
                assert all(((x.size(0) == n) for x in batch_x))

                x, y = torch.cat(batch_x, dim=0), torch.cat(batch_y, dim=0)
                x, y = x.to(DEVICE), y.to(DEVICE)

                y_hat, z = model(x)
                l_cls = cls_loss_fn(y_hat, y)

                l_top = torch.tensor(0.0).to(DEVICE)

                if compute_persistence:
                    # Topological regularizer: penalize the absolute
                    # deviation of per-sample persistence values from
                    # top_scale, averaged over the batch.
                    for i in range(batch_size):
                        z_sample = z[i*n: (i+1)*n, :].contiguous()
                        lt = pers_fn(z_sample)[0][0][:, 1]

                        logger.log_value('batch_lt', lt)
                        l_top = l_top + (lt-top_scale).abs().sum()
                    l_top = l_top / float(batch_size)

                l = l_cls + w_top_loss * l_top

                opt.zero_grad()
                l.backward()

                # gradient norm and normalization aa
                grad_vec_abs = torch.cat(
                    [p.grad.data.view(-1) for p in model.parameters()], dim=0).abs()

                grad_norm = grad_vec_abs.pow(2).sum().sqrt().item()

                # Optionally rescale all gradients to unit L2 norm.
                if grad_norm > 0 and normalize_gradient:
                    for p in model.parameters():
                        p.grad.data /= grad_norm

                opt.step()

                epoch_loss += l.item()
                logger.log_value('batch_cls_loss', l_cls)
                logger.log_value('batch_top_loss', l_top)

                logger.log_value('batch_grad_norm', grad_norm)
                logger.log_value('batch_grad_abs_max', grad_vec_abs.max())
                logger.log_value('batch_grad_abs_min', grad_vec_abs.min())
                logger.log_value('batch_grad_abs_mean', grad_vec_abs.mean())
                logger.log_value('batch_grad_abs_std', grad_vec_abs.std())

                logger.log_value('lr', scheduler.get_last_lr()[0])
                logger.log_value(
                    'cls_norm', model.cls[0].weight.data.view(-1).norm())

            scheduler.step()

            mb_comment = "Last loss: {:.2f} {:.4f} ".format(
                epoch_loss,
                w_top_loss)

            track_accuracy = True
            if track_accuracy:

                # Evaluate accuracy on both splits after every epoch.
                X, Y = apply_model(model, DS_TRAIN, device=DEVICE)
                acc_train = argmax_and_accuracy(X, Y)
                logger.log_value('acc_train', acc_train)
                mb_comment += " | acc. train {:.2f} ".format(acc_train)

                X, Y = apply_model(model, DS_TEST, device=DEVICE)
                acc_test = argmax_and_accuracy(X, Y)
                logger.log_value('acc_test', acc_test)
                mb_comment += " | acc. test {:.2f} ".format(acc_test)

                logger.log_value('epoch_i', epoch_i)

                mb.main_bar.comment = mb_comment

            logger.write_logged_values_to_disk()

            # Optionally snapshot weights every epoch.
            if track_model:
                logger.write_model_to_disk('model_epoch_{}'.format(epoch_i),
                                           model)

        logger.write_model_to_disk('model', model)
Beispiel #26
0
                                 3,
                                 skip=0,
                                 is_transform=True,
                                 crop=CROP_SIZE)
    train = torch.utils.data.DataLoader(train_folder,
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                        num_workers=12)
    valid = torch.utils.data.DataLoader(valid_folder,
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                        num_workers=12)
    lr = INIT_LR
    #loss = torch.nn.MSELoss(reduction='mean')
    loss = torch.nn.MSELoss(reduction='mean')
    mb = master_bar(range(EPOCHS))
    best_loss = float('inf')
    valid_loss = float('inf')

    count = 0
    sum_loss = 0
    model = model.eval()
    for X, Y in progress_bar(valid, parent=mb, txt_len=100):
        X = X.cuda()  #*2/255-1
        #E = E.cuda()
        Y = Y.cuda()  #*2/255-1
        with torch.set_grad_enabled(False):
            out = model(X, None)
            l = loss(out, Y) * X.size()[0]
            count += X.size()[0]
            sum_loss += l
def train_model(x_trn, x_val, config, num_classes, weights, device):
    """Train a Bengali grapheme classifier (3 heads: grapheme root, vowel
    diacritic, consonant diacritic) on one train/validation split.

    Args:
        x_trn / x_val: train and validation data; x_val must expose the
            'grapheme_root', 'vowel_diacritic', 'consonant_diacritic' columns.
        config: experiment config (model_params, batch_size, loss, optimizer,
            scheduler, augmentation, epochs, early_stopping, ...).
        num_classes: per-head class counts [root, vowel, consonant].
        weights: per-head class-weight arrays used by some loss options.
        device: torch device for model and batches.

    Returns:
        best_model state_dict, [best per-head validation predictions],
        best weighted-recall score, train-loss list, val-loss list,
        val-score list.
    """

    y_gr_val = x_val['grapheme_root']
    y_vo_val = x_val['vowel_diacritic']
    y_co_val = x_val['consonant_diacritic']

    model_params = config.model_params

    train_dataset = BengaliDataset(x_trn,
                                   n_channels=model_params.n_channels,
                                   img_size=config.img_size,
                                   transforms=config.augmentation)
    valid_dataset = BengaliDataset(x_val,
                                   n_channels=model_params.n_channels,
                                   img_size=config.img_size,
                                   transforms=None)

    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch_size,
                              shuffle=True,
                              num_workers=3)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=config.batch_size,
                              shuffle=False,
                              num_workers=3)

    del train_dataset, valid_dataset
    gc.collect()

    # Select the backbone by substring match on the configured model name.
    if 'se_resnext' in model_params.model_name:
        model = SeNet(model_name=model_params.model_name,
                      n_channels=model_params.n_channels,
                      n_classes=model_params.n_classes,
                      pretrained=model_params.pretrained).to(device)

    elif 'resnetd' in model_params.model_name:
        model = ResNetD(model_name=model_params.model_name,
                        n_channels=model_params.n_channels,
                        n_classes=model_params.n_classes).to(device)

    elif 'resne' in model_params.model_name:
        model = ResNet(model_name=model_params.model_name,
                       n_channels=model_params.n_channels,
                       n_classes=model_params.n_classes,
                       pretrained=model_params.pretrained).to(device)

    elif 'densenet' in model_params.model_name:
        model = DenseNet(model_name=model_params.model_name,
                         n_channels=model_params.n_channels,
                         n_classes=model_params.n_classes,
                         pretrained=model_params.pretrained).to(device)

    elif 'efficient' in model_params.model_name:
        model = ENet(model_name=model_params.model_name,
                     n_channels=model_params.n_channels,
                     n_classes=model_params.n_classes,
                     pretrained=model_params.pretrained).to(device)

    # Optionally warm-start from a previous run's best weights.
    if config.model_state_fname is not None:
        model.load_state_dict(
            torch.load(f'../logs/{config.model_state_fname}/weight_best.pt'))

    # relu_replace(model)
    # bn_replace(model)

    # One criterion per head; choice driven by config.loss.
    weights_gr = torch.from_numpy(weights['grapheme_root']).cuda()
    weights_vo = torch.from_numpy(weights['vowel_diacritic']).cuda()
    weights_co = torch.from_numpy(weights['consonant_diacritic']).cuda()
    if config.loss == 'CrossEntropyLoss':
        # criterion_gr = nn.CrossEntropyLoss(weight=weights_gr)
        # criterion_vo = nn.CrossEntropyLoss(weight=weights_vo)
        # criterion_co = nn.CrossEntropyLoss(weight=weights_co)
        criterion_gr = nn.CrossEntropyLoss()
        criterion_vo = nn.CrossEntropyLoss()
        criterion_co = nn.CrossEntropyLoss()
    elif config.loss == 'SmoothCrossEntropyLoss':
        criterion_gr = SmoothCrossEntropyLoss()
        criterion_vo = SmoothCrossEntropyLoss()
        criterion_co = SmoothCrossEntropyLoss()
    elif config.loss == 'FocalLoss':
        criterion_gr = FocalLoss()
        criterion_vo = FocalLoss()
        criterion_co = FocalLoss()
    elif config.loss == 'ClassBalancedLoss':
        criterion_gr = ClassBalancedLoss(samples_per_cls=weights_gr,
                                         no_of_classes=num_classes[0],
                                         loss_type='focal',
                                         beta=0.999,
                                         gamma=2.0)
        criterion_vo = ClassBalancedLoss(samples_per_cls=weights_vo,
                                         no_of_classes=num_classes[1],
                                         loss_type='focal',
                                         beta=0.999,
                                         gamma=2.0)
        criterion_co = ClassBalancedLoss(samples_per_cls=weights_co,
                                         no_of_classes=num_classes[2],
                                         loss_type='focal',
                                         beta=0.999,
                                         gamma=2.0)
    elif config.loss == 'OhemLoss':
        criterion_gr = OhemLoss(rate=1.0)
        criterion_vo = OhemLoss(rate=1.0)
        criterion_co = OhemLoss(rate=1.0)

    if config.optimizer.type == 'Adam':
        optimizer = Adam(params=model.parameters(),
                         lr=config.optimizer.lr,
                         amsgrad=False,
                         weight_decay=1e-4)
    elif config.optimizer.type == 'SGD':
        optimizer = SGD(params=model.parameters(),
                        lr=config.optimizer.lr,
                        momentum=0.9,
                        weight_decay=1e-4,
                        nesterov=True)

    # scheduler_flg stays False for unknown scheduler types (no stepping).
    scheduler_flg = False
    if config.scheduler.type == 'cosine':
        scheduler_flg = True
        scheduler = CosineAnnealingLR(optimizer,
                                      T_max=config.scheduler.t_max,
                                      eta_min=config.scheduler.eta_min)
    elif config.scheduler.type == 'cosine-warmup':
        scheduler_flg = True
        scheduler = CosineAnnealingWarmUpRestarts(
            optimizer,
            T_0=config.scheduler.t_0,
            T_mult=config.scheduler.t_mult,
            eta_max=config.scheduler.eta_max,
            T_up=config.scheduler.t_up,
            gamma=config.scheduler.gamma)
    elif config.scheduler.type == 'step':
        scheduler_flg = True
        scheduler = StepLR(optimizer,
                           step_size=config.scheduler.step_size,
                           gamma=config.scheduler.gamma)
    elif config.scheduler.type == 'reduce':
        scheduler_flg = True
        scheduler = ReduceLROnPlateau(optimizer,
                                      factor=config.scheduler.factor,
                                      patience=config.scheduler.patience,
                                      min_lr=config.scheduler.min_lr)

    best_epoch = -1
    best_val_score = -np.inf
    mb = master_bar(range(config.epochs))

    train_loss_list = []
    val_loss_list = []
    val_score_list = []

    # counter: epochs since the last validation-score improvement (see
    # NOTE(review) at the bottom of the loop about its off-by-one reset).
    counter = 0

    for epoch in mb:
        start_time = time.time()
        model.train()
        avg_loss = 0.

        for images, labels_gr, labels_vo, labels_co in progress_bar(
                train_loader, parent=mb):
            images = Variable(images).to(device)
            labels_gr = Variable(labels_gr).to(device)
            labels_vo = Variable(labels_vo).to(device)
            labels_co = Variable(labels_co).to(device)

            # OhemLoss: progressively keep fewer (hardest) samples as
            # training advances.
            if config.loss == 'OhemLoss':
                if epoch < config.epochs * 0.2:
                    new_rate = 1.0
                elif epoch < config.epochs * 0.4:
                    new_rate = 0.8
                elif epoch < config.epochs * 0.6:
                    new_rate = 0.75
                elif epoch < config.epochs * 0.8:
                    new_rate = 0.7
                else:
                    new_rate = 0.6

                criterion_gr.update_rate(new_rate)
                criterion_vo.update_rate(new_rate)
                criterion_co.update_rate(new_rate)

            # Randomly apply mixup / cutmix per batch according to the
            # configured probabilities; otherwise train on raw images.
            r = np.random.rand()
            mix_params = config.augmentation.mix_params
            if r < mix_params.mixup:
                images, targets = mixup(images, labels_gr, labels_vo,
                                        labels_co, 1.0)
                preds_gr, preds_vo, preds_co = model(images)
                loss = mixup_criterion(preds_gr, preds_vo, preds_co, targets,
                                       criterion_gr, criterion_vo,
                                       criterion_co)
            elif r < (mix_params.mixup + mix_params.cutmix):
                images, targets = cutmix(images, labels_gr, labels_vo,
                                         labels_co, 1.0)
                preds_gr, preds_vo, preds_co = model(images)
                loss = cutmix_criterion(preds_gr, preds_vo, preds_co, targets,
                                        criterion_gr, criterion_vo,
                                        criterion_co)
            else:
                preds_gr, preds_vo, preds_co = model(images.float())
                loss = criterion_gr(preds_gr, labels_gr) \
                       + criterion_vo(preds_vo, labels_vo) \
                       + criterion_co(preds_co, labels_co)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
        train_loss_list.append(avg_loss)

        model.eval()
        valid_gr_preds = np.zeros((len(valid_loader.dataset), num_classes[0]))
        valid_vo_preds = np.zeros((len(valid_loader.dataset), num_classes[1]))
        valid_co_preds = np.zeros((len(valid_loader.dataset), num_classes[2]))
        avg_val_loss = 0.

        # NOTE(review): validation runs without torch.no_grad() — gradients
        # are tracked unnecessarily; confirm whether that is intentional.
        for i, (images, labels_gr, labels_vo,
                labels_co) in enumerate(valid_loader):
            images = Variable(images).to(device)
            labels_gr = Variable(labels_gr).to(device)
            labels_vo = Variable(labels_vo).to(device)
            labels_co = Variable(labels_co).to(device)

            preds_gr, preds_vo, preds_co = model(images.float())

            loss_gr = criterion_gr(preds_gr, labels_gr)
            loss_vo = criterion_vo(preds_vo, labels_vo)
            loss_co = criterion_co(preds_co, labels_co)

            valid_gr_preds[i * config.batch_size:(
                i + 1) * config.batch_size] = preds_gr.cpu().detach().numpy()
            valid_vo_preds[i * config.batch_size:(
                i + 1) * config.batch_size] = preds_vo.cpu().detach().numpy()
            valid_co_preds[i * config.batch_size:(
                i + 1) * config.batch_size] = preds_co.cpu().detach().numpy()
            avg_val_loss += (loss_gr.item() + loss_vo.item() +
                             loss_co.item()) / len(valid_loader)

        # Competition metric: macro recall per head, root weighted 2x.
        recall_gr = recall_score(y_gr_val,
                                 np.argmax(valid_gr_preds, axis=1),
                                 average='macro')
        recall_vo = recall_score(y_vo_val,
                                 np.argmax(valid_vo_preds, axis=1),
                                 average='macro')
        recall_co = recall_score(y_co_val,
                                 np.argmax(valid_co_preds, axis=1),
                                 average='macro')

        val_score = np.average([recall_gr, recall_vo, recall_co],
                               weights=[2, 1, 1])

        val_loss_list.append(avg_val_loss)
        val_score_list.append(val_score)

        # ReduceLROnPlateau needs the monitored metric; others step blindly.
        if scheduler_flg and config.scheduler.type != 'reduce':
            scheduler.step()
        elif scheduler_flg and config.scheduler.type == 'reduce':
            scheduler.step(avg_val_loss)

        elapsed = time.time() - start_time
        mb.write(
            f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} val_gr_score: {recall_gr:.4f} val_vo_score: {recall_vo:.4f} val_co_score: {recall_co:.4f} time: {elapsed:.0f}s'
        )
        logging.debug(
            f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} val_gr_score: {recall_gr:.4f} val_vo_score: {recall_vo:.4f} val_co_score: {recall_co:.4f} time: {elapsed:.0f}s'
        )

        # Keep the best checkpoint by weighted recall.
        if best_val_score < val_score:
            best_epoch = epoch + 1
            best_val_score = val_score
            best_recall_gr = recall_gr
            best_recall_vo = recall_vo
            best_recall_co = recall_co
            best_valid_gr_preds = valid_gr_preds
            best_valid_vo_preds = valid_vo_preds
            best_valid_co_preds = valid_co_preds
            best_model = model.state_dict()
            counter = 0

        # NOTE(review): counter is incremented even right after the reset
        # above, so an improving epoch ends with counter == 1 and training
        # stops after early_stopping - 1 consecutive stale epochs — confirm
        # this off-by-one is intended.
        counter += 1
        if counter == config.early_stopping:
            break

    print('\n\n===================================\n')
    print(f'CV: {best_val_score}\n')
    print(f'BEST EPOCH: {best_epoch}')
    print(f'BEST RECALL GR: {best_recall_gr}')
    print(f'BEST RECALL VO: {best_recall_vo}')
    print(f'BEST RECALL CO: {best_recall_co}')
    logging.debug(f'\n\nCV: {best_val_score}\n')
    logging.debug(f'BEST EPOCH: {best_epoch}')
    logging.debug(f'BEST RECALL GR: {best_recall_gr}')
    logging.debug(f'BEST RECALL VO: {best_recall_vo}')
    logging.debug(f'BEST RECALL CO: {best_recall_co}\n\n')
    print('\n===================================\n\n')

    return best_model, [
        best_valid_gr_preds, best_valid_vo_preds, best_valid_co_preds
    ], best_val_score, train_loss_list, val_loss_list, val_score_list
Beispiel #28
0
def train_model(x_train, y_train, train_transforms):
    """Fit a Classifier on an 80/20 split of (x_train, y_train).

    Runs 80 epochs of BCE-with-logits training with Adam and a
    cosine-annealing LR schedule, scores lwlrap on the held-out 20%
    after every epoch, and checkpoints the best-scoring weights to
    'weight_best.pt'.

    NOTE(review): the validation DataLoader reuses `train_transforms`;
    if those include random augmentation the reported lwlrap is noisy —
    confirm against the transform definition.

    Args:
        x_train: training inputs in whatever form FATTrainDataset accepts.
        y_train: multi-hot label array; shape[1] defines the class count.
        train_transforms: transforms passed through to FATTrainDataset.

    Returns:
        dict with 'best_epoch' (1-based; -1 if the score never improved)
        and 'best_lwlrap' (float).
    """
    num_epochs, batch_size, test_batch_size = 80, 64, 256
    lr, eta_min, t_max = 3e-3, 1e-5, 10

    num_classes = y_train.shape[1]

    # Hold out 20% for validation; SEED keeps the split reproducible.
    x_trn, x_val, y_trn, y_val = train_test_split(
        x_train, y_train, test_size=0.2, random_state=SEED)

    train_loader = DataLoader(
        FATTrainDataset(x_trn, y_trn, train_transforms),
        batch_size=batch_size,
        shuffle=True)
    valid_loader = DataLoader(
        FATTrainDataset(x_val, y_val, train_transforms),
        batch_size=test_batch_size,
        shuffle=False)

    model = Classifier(num_classes=num_classes).cuda()
    criterion = nn.BCEWithLogitsLoss().cuda()
    optimizer = Adam(params=model.parameters(), lr=lr, amsgrad=False)
    scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=eta_min)

    best_epoch, best_lwlrap = -1, 0.
    mb = master_bar(range(num_epochs))

    for epoch in mb:
        t0 = time.time()

        # ---- training pass ----
        model.train()
        avg_loss = 0.
        for xb, yb in progress_bar(train_loader, parent=mb):
            out = model(xb.cuda())
            loss = criterion(out, yb.cuda())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Running mean: each batch contributes loss / n_batches.
            avg_loss += loss.item() / len(train_loader)

        # ---- validation pass ----
        model.eval()
        valid_preds = np.zeros((len(x_val), num_classes))
        avg_val_loss = 0.
        for batch_idx, (xb, yb) in enumerate(valid_loader):
            out = model(xb.cuda()).detach()
            loss = criterion(out, yb.cuda())

            probs = torch.sigmoid(out)
            lo = batch_idx * test_batch_size
            valid_preds[lo:lo + test_batch_size] = probs.cpu().numpy()

            avg_val_loss += loss.item() / len(valid_loader)

        score, weight = calculate_per_class_lwlrap(y_val, valid_preds)
        lwlrap = (score * weight).sum()

        scheduler.step()

        # Progress line on every 5th epoch only.
        if (epoch + 1) % 5 == 0:
            elapsed = time.time() - t0
            mb.write(
                f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  val_lwlrap: {lwlrap:.6f}  time: {elapsed:.0f}s'
            )

        # Checkpoint whenever the validation metric improves.
        if lwlrap > best_lwlrap:
            best_epoch, best_lwlrap = epoch + 1, lwlrap
            torch.save(model.state_dict(), 'weight_best.pt')

    return {
        'best_epoch': best_epoch,
        'best_lwlrap': best_lwlrap,
    }
Beispiel #29
0
                              args,
                              abs_,
                              optimize_both_exp=False,
                              batchnorm=True,
                              prior_var=80,
                              device=device)
model = model.to(device)
model.apply(EXPVAEWAVE.weight_init)
optimizer = optim.Adam(list(model.parameters()) + [model.exps],
                       lr=args.learning_rate,
                       weight_decay=0,
                       betas=(args.beta_one, args.beta_two))

train_losses = []
n_epochs = args.epochs
mb = master_bar(range(20))
y_ax_index = 0
for i in mb:
    epoch = i
    for j in progress_bar(range(int(n_epochs / 20)), parent=mb):
        model = EXPVAEWAVE.train(model, device, args, optimizer, train_loader,
                                 epoch, train_losses)
    # x = range(len(train_losses))
    # y = train_losses
    # graphs = [[x,y]]
    # y_bounds = [0,train_losses[0]]
    # mb.update_graph(graphs, y_bounds=y_bounds)
    mb.write(f'Avg. Training Loss:  {train_losses[-1]}.')

# To get the location estimates for each spike in the recording, we run them through the inference network and then average the location estimates belonging to the same event (this is described in the manuscript in the amplitude jitter portion of the paper).
Beispiel #30
0
def main():
    n_epoch = 10

    now = datetime.datetime.now()
    now_date = '{}-{:0>2d}-{:0>2d}_{:0>2d}-{:0>2d}-{:0>2d}'.format(
        now.year, now.month, now.day, now.hour, now.minute, now.second)

    # set logger
    logger = logging.getLogger("Log")
    logger.setLevel(logging.DEBUG)

    handler_format = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s')

    print('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day, now.hour,
                                     now.minute, now.second))
    with open('../params/stacking.yaml', "r+") as f:
        param = yaml.load(f, Loader=yaml.FullLoader)

    seed_setting(param['seed'])
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
    local_cv = dict()

    for fold in range(5):
        outdir = os.path.join(param['save path'], EXP_NAME, now_date,
                              'fold{}'.format(fold))
        if os.path.exists(param['save path']):
            os.makedirs(outdir, exist_ok=True)
        else:
            print("Not find {}".format(param['save path']))
            raise FileNotFoundError

        file_handler = logging.FileHandler(
            os.path.join(outdir, 'experiment.log'))
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(handler_format)
        logger.addHandler(file_handler)

        logger.debug('=============   FOLD  {}  ============='.format(fold))
        logger.debug('{}-{}-{} {}:{}:{}'.format(now.year, now.month, now.day,
                                                now.hour, now.minute,
                                                now.second))

        print(f'fold - {fold}')
        print('load data set')
        train_dataset = StackingDataset(
            df=get_train_df(param['tabledir']).query('valid != @fold'),
            logit_path='/mnt/hdd1/alcon2019/logits_for_oof.pth',
            mode='train')
        valid_dataset = StackingDataset(
            df=get_train_df(param['tabledir']).query('valid == @fold'),
            logit_path='/mnt/hdd1/alcon2019/logits_for_oof.pth',
            mode='valid')

        print('load data loader')
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=param['batch size'],
                                      num_workers=param['thread'],
                                      pin_memory=False,
                                      drop_last=False,
                                      shuffle=True)
        valid_dataloader = DataLoader(valid_dataset,
                                      batch_size=param['batch size'],
                                      num_workers=param['thread'],
                                      pin_memory=False,
                                      drop_last=False,
                                      shuffle=False)

        print('model set')
        model = MLP()
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

        model = model.to(param['device'])
        loss_fn = nn.CrossEntropyLoss().to(param['device'])
        eval_fn = accuracy_one_character

        max_char_acc = -1e-5
        max_3char_acc = -1e-5
        min_loss = 1e+5

        mb = master_bar(range(n_epoch))

        for epoch in mb:
            model.train()
            avg_train_loss = 10**5
            avg_train_accuracy = 0.0
            avg_three_train_acc = 0.0
            for step, (inputs, targets, indice) in enumerate(
                    progress_bar(train_dataloader, parent=mb)):
                model.train()
                inputs = inputs.to(param['device'])
                targets = targets.to(param['device'])
                optimizer.zero_grad()
                logits = model(inputs)  # logits.size() = (batch*3, 48)
                preds = logits.view(targets.size(0), 3, -1).softmax(dim=2)
                loss = loss_fn(logits,
                               targets.view(-1, targets.size(2)).argmax(dim=1))
                loss.backward()
                avg_train_loss += loss.item()
                _avg_accuracy = eval_fn(preds, targets.argmax(dim=2)).item()
                avg_train_accuracy += _avg_accuracy
                _three_char_accuracy = accuracy_three_character(
                    preds, targets.argmax(dim=2), mean=True).item()
                avg_three_train_acc += _three_char_accuracy

            avg_train_loss /= len(train_dataloader)
            avg_train_accuracy /= len(train_dataloader)
            avg_three_train_acc /= len(train_dataloader)

            avg_valid_loss, avg_valid_accuracy, avg_three_valid_acc = valid_alcon_rnn(
                model, valid_dataloader, param['device'], loss_fn, eval_fn)

            if min_loss > avg_valid_loss:
                logger.debug('update best loss:  {:.5f} ---> {:.5f}'.format(
                    min_loss, avg_valid_loss))
                min_loss = avg_valid_loss
                torch.save(model.state_dict(),
                           os.path.join(outdir, 'best_loss.pth'))

            if max_char_acc < avg_valid_accuracy:
                logger.debug(
                    'update best acc per 1 char:  {:.3%} ---> {:.3%}'.format(
                        max_char_acc, avg_valid_accuracy))
                max_char_acc = avg_valid_accuracy
                torch.save(model.state_dict(),
                           os.path.join(outdir, 'best_acc.pth'))

            if max_3char_acc < avg_three_valid_acc:
                logger.debug(
                    'update best acc per 3 char:  {:.3%} ---> {:.3%}'.format(
                        max_3char_acc, avg_three_valid_acc))
                max_3char_acc = avg_three_valid_acc
                torch.save(model.state_dict(),
                           os.path.join(outdir, 'best_3acc.pth'))

            logger.debug(
                '======================== epoch {} ========================'.
                format(epoch + 1))
            logger.debug('lr              : {:.5f}'.format(
                scheduler.get_lr()[0]))
            logger.debug(
                'loss            : train={:.5f}  , test={:.5f}'.format(
                    avg_train_loss, avg_valid_loss))
            logger.debug(
                'acc(per 1 char) : train={:.3%}  , test={:.3%}'.format(
                    avg_train_accuracy, avg_valid_accuracy))
            logger.debug(
                'acc(per 3 char) : train={:.3%}  , test={:.3%}'.format(
                    avg_three_train_acc, avg_three_valid_acc))

        logger.debug('================  FINISH  TRAIN  ================')
        logger.debug('Result')
        logger.debug('Best loss : {}'.format(min_loss))
        logger.debug('Best 1 acc : {}'.format(max_char_acc))
        logger.debug('Best 3 acc : {}'.format(max_3char_acc))