Example #1
def run_one_fold(fold_id):

    with timer('load csv data'):

        debug = config.DEBUG
        df_train = pd.read_csv(
            config.TRAIN_PATH).dropna().reset_index(drop=True)

        if debug:
            df_train = df_train.sample(
                1000, random_state=SEED).dropna().reset_index(drop=True)

        # Some text / selected_text pairs do not match.
        # https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/142011
        df_train.loc[df_train['sentiment'] == 'neutral',
                     'selected_text'] = df_train[df_train['sentiment'] ==
                                                 'neutral']['text']

        num_folds = config.NUM_FOLDS
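        # NOTE: random_state only takes effect when shuffle=True;
        # recent scikit-learn versions raise a ValueError for this combination.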
        kf = StratifiedKFold(n_splits=num_folds, random_state=SEED)
        splits = list(kf.split(X=df_train, y=df_train[['sentiment']]))
        train_idx = splits[fold_id][0]
        val_idx = splits[fold_id][1]

        print(len(train_idx), len(val_idx))

        gc.collect()

    with timer('prepare validation data'):
        train_dataset = TweetDataset(
            tweet=df_train.iloc[train_idx].text.values,
            sentiment=df_train.iloc[train_idx].sentiment.values,
            selected_text=df_train.iloc[train_idx].selected_text.values)

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            shuffle=True,
            batch_size=config.TRAIN_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        val_dataset = TweetDataset(
            tweet=df_train.iloc[val_idx].text.values,
            sentiment=df_train.iloc[val_idx].sentiment.values,
            selected_text=df_train.iloc[val_idx].selected_text.values)

        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            shuffle=False,
            batch_size=config.VALID_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        del train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        model = TweetRoBERTaModel(config.ROBERTA_PATH)
        model = model.to(device)

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.001
            },
            {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            },
        ]

        num_train_steps = int(
            len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
        optimizer = transformers.AdamW(optimizer_parameters, lr=3e-5)
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

        model = nn.DataParallel(model)

        # pretrain_path = 'models/exp11_fold0.pth'
        # model.load_state_dict(torch.load(pretrain_path))
        # LOGGER.info(f'pretrained model (exp11) loaded')

    with timer('training loop'):
        best_score = -999
        best_epoch = 0
        patience = 3
        p = 0
        for epoch in range(1, config.EPOCHS + 1):

            LOGGER.info("Starting {} epoch...".format(epoch))

            engine.train_fn(train_loader, model, optimizer, device, scheduler)
            score = engine.eval_fn(val_loader, model, device)

            LOGGER.info(f"Jaccard Score = {score}")

            if score > best_score:
                best_score = score
                best_epoch = epoch
                torch.save(
                    model.state_dict(),
                    os.path.join(config.OUT_DIR,
                                 '{}_fold{}.pth'.format(EXP_ID, fold_id)))
                LOGGER.info("save model at score={} on epoch={}".format(
                    best_score, best_epoch))
                p = 0

            if p > 0:
                LOGGER.info(
                    f'best score has not improved for {p} epoch(s)')
            p += 1
            if p > patience:
                LOGGER.info(f'Early Stopping')
                break

        LOGGER.info("best score={} on epoch={}".format(best_score, best_epoch))
Example #2
# replace relu to prelu
def convert_model_ReLU2PReLU(module):
    mod = module
    if isinstance(module, torch.nn.ReLU):
        mod = nn.PReLU()
    for name, child in module.named_children():
        mod.add_module(name, convert_model_ReLU2PReLU(child))
    return mod
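

# A hedged usage sketch for the converter above: applied to a torchvision
# ResNet (torchvision is an assumption here, not part of the original
# snippet), no nn.ReLU modules should remain after conversion.
import torch
import torchvision

base = torchvision.models.resnet18()
converted = convert_model_ReLU2PReLU(base)
assert not any(isinstance(m, torch.nn.ReLU) for m in converted.modules())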


batch_size_list = [36, 42, 64]

with timer('load csv data'):
    fold_id = 0
    epochs = 45
    batch_size = batch_size_list[0]

    train = pd.read_csv('input/train.csv')

    y = train[["grapheme_root", "vowel_diacritic", "consonant_diacritic"]]

    num_folds = 5
    kf = MultilabelStratifiedKFold(n_splits=num_folds, random_state=SEED)
    splits = list(kf.split(X=train, y=y))
    train_idx = splits[fold_id][0]
    val_idx = splits[fold_id][1]

    gc.collect()
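
# MultilabelStratifiedKFold is not a scikit-learn class; the example above
# presumably imports it from the iterative-stratification package. A minimal
# sketch of that import and of a stand-alone split over the same three-label
# target frame `y` (illustrative only):
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
for tr_idx, va_idx in mskf.split(train, y):
    print(len(tr_idx), len(va_idx))
    break  # inspect only the first fold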
Example #3

def keroppinet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResMagicNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if pretrained:
        raise NotImplementedError()
    return model
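

# A hedged usage sketch: ResMagicNet and BasicBlock come from the surrounding
# module and are not shown here; the [3, 4, 6, 3] block configuration mirrors
# the standard ResNet-34 layout.
model = keroppinet34(pretrained=False)
print(sum(p.numel() for p in model.parameters()))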


batch_size_list = [36, 42, 64]

with timer('load csv data'):
    fold_id = 0
    epochs = 45
    batch_size = batch_size_list[1]

    train = pd.read_csv('input/train.csv')

    y = train[["grapheme_root", "vowel_diacritic", "consonant_diacritic"]]

    num_folds = 5

    kf = MultilabelStratifiedKFold(n_splits=num_folds, random_state=SEED)
    splits = list(kf.split(X=train, y=y))
    train_idx = splits[fold_id][0]
    val_idx = splits[fold_id][1]
    # train_idx, val_idx = train_test_split(train.index.tolist(), test_size=0.15, random_state=SEED, stratify=train["vowel_diacritic"])
Example #4
def run_one_fold(fold_id):

    with timer('load csv data'):

        debug = config.DEBUG
        df_train = pd.read_csv(
            config.TRAIN_PATH).dropna().reset_index(drop=True)

        if debug:
            df_train = df_train.sample(
                1000, random_state=SEED).dropna().reset_index(drop=True)

        # df_train['text'] = df_train['text'].apply(lambda x: remove_initial_white_space(x))
        df_train['text'] = df_train['text'].apply(
            lambda x: ' '.join(x.split()))

        # neutral_texts = df_train[df_train['sentiment']=='neutral']['text']
        # neutral_selected_texts = df_train[df_train['sentiment']=='neutral']['selected_text']
        # df_train['is_text_start_with_white_space'] = df_train['text'].apply(lambda x: check_initial_white_space(x))
        # df_train['text_equal_selected_text'] = neutral_texts == neutral_selected_texts
        # df_pos = df_train[df_train['sentiment']=='positive']
        # df_neg = df_train[df_train['sentiment']=='negative']

        # neutral samples where the full text is not selected
        # df_neutral_use = df_train[(df_train['text_equal_selected_text']==0)&(df_train['is_text_start_with_white_space']==0)]
        # df_train = pd.concat([df_pos, df_neg, df_neutral_use]).reset_index(drop=True)

        df_train['text_token_len'] = df_train['text'].apply(
            lambda x: len(config.TOKENIZER.encode(x).ids))
        df_train['selected_text_token_len'] = df_train['selected_text'].apply(
            lambda x: len(config.TOKENIZER.encode(x).ids))
        df_train['total_token_len'] = df_train['text_token_len'] + df_train[
            'selected_text_token_len']
        df_train = df_train[df_train['total_token_len'] < 97].reset_index(
            drop=True)

        # df_train = df_train[df_train['sentiment']!='neutral'].reset_index(drop=True)

        num_folds = config.NUM_FOLDS
        kf = StratifiedKFold(n_splits=num_folds, random_state=SEED)
        splits = list(kf.split(X=df_train, y=df_train[['sentiment']]))
        train_idx = splits[fold_id][0]
        val_idx = splits[fold_id][1]

        print(len(train_idx), len(val_idx))

        gc.collect()

    with timer('prepare validation data'):
        train_dataset = TweetDataset(
            tweet=df_train.iloc[train_idx].text.values,
            sentiment=df_train.iloc[train_idx].sentiment.values,
            selected_text=df_train.iloc[train_idx].selected_text.values)

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            shuffle=True,
            batch_size=config.TRAIN_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        val_dataset = TweetDataset(
            tweet=df_train.iloc[val_idx].text.values,
            sentiment=df_train.iloc[val_idx].sentiment.values,
            selected_text=df_train.iloc[val_idx].selected_text.values)

        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            shuffle=False,
            batch_size=config.VALID_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        del train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # model_config = transformers.RobertaConfig.from_pretrained(roberta_path)
        model = transformers.RobertaForQuestionAnswering.from_pretrained(
            'roberta-base')
        # model = TweetRoBERTaModelSimple(config.ROBERTA_PATH)
        model = model.to(device)

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.001
            },
            {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            },
        ]

        num_train_steps = int(
            len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
        optimizer = transformers.AdamW(optimizer_parameters,
                                       lr=3e-5,
                                       correct_bias=False)
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

        model = nn.DataParallel(model)

        # pretrain_path = 'models/exp11_fold0.pth'
        # model.load_state_dict(torch.load(pretrain_path))
        # LOGGER.info(f'pretrained model (exp11) loaded')

    with timer('training loop'):
        min_loss = 999
        best_score = -999
        best_epoch = 0
        patience = 3
        p = 0
        for epoch in range(1, config.EPOCHS + 1):

            LOGGER.info("Starting {} epoch...".format(epoch))

            engine.train_fn(train_loader, model, optimizer, device, scheduler)
            score, val_loss = engine.eval_fn(val_loader, model, device)

            if val_loss < min_loss:
                min_loss = val_loss
                best_score = score
                best_epoch = epoch
                torch.save(
                    model.state_dict(),
                    os.path.join(config.OUT_DIR,
                                 '{}_fold{}.pth'.format(EXP_ID, fold_id)))
                LOGGER.info("save model at score={} on epoch={}".format(
                    best_score, best_epoch))
                p = 0

            if p > 0:
                LOGGER.info(
                    f'min loss has not improved for {p} epoch(s)')
            p += 1
            if p > patience:
                LOGGER.info(f'Early Stopping')
                break

        LOGGER.info("best score={} on epoch={}".format(best_score, best_epoch))
Example #5
def run_one_fold(fold_id):

    with timer('load csv data'):

        debug = config.DEBUG
        df_train = pd.read_csv(
            config.TRAIN_PATH).dropna().reset_index(drop=True)

        if debug:
            df_train = df_train.sample(
                1000, random_state=SEED).dropna().reset_index(drop=True)

        num_folds = 5
        kf = StratifiedKFold(n_splits=num_folds, random_state=SEED)
        splits = list(kf.split(X=df_train, y=df_train[['sentiment']]))
        train_idx = splits[fold_id][0]
        val_idx = splits[fold_id][1]

        print(len(train_idx), len(val_idx))

        gc.collect()

    with timer('prepare validation data'):
        train_dataset = TweetDataset(
            tweet=df_train.iloc[train_idx].text.values,
            sentiment=df_train.iloc[train_idx].sentiment.values,
            selected_text=df_train.iloc[train_idx].selected_text.values)

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            shuffle=True,
            batch_size=config.TRAIN_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        val_dataset = TweetDataset(
            tweet=df_train.iloc[val_idx].text.values,
            sentiment=df_train.iloc[val_idx].sentiment.values,
            selected_text=df_train.iloc[val_idx].selected_text.values)

        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            shuffle=False,
            batch_size=config.VALID_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        del train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        model = BERTBaseUncased()
        model = model.to(device)

        # t_max=10
        # scheduler_cosine = CosineAnnealingLR(optimizer, T_max=t_max)
        # scheduler = GradualWarmupScheduler(optimizer, multiplier=1.1, total_epoch=5,
        #                                    after_scheduler=scheduler_cosine)

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.001
            },
            {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            },
        ]

        num_train_steps = int(
            len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
        optimizer = transformers.AdamW(optimizer_parameters, lr=5e-5)
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

        model = nn.DataParallel(model)

    with timer('training loop'):
        best_score = -999
        best_epoch = 0
        for epoch in range(1, config.EPOCHS + 1):

            LOGGER.info("Starting {} epoch...".format(epoch))

            engine.train_fn(train_loader, model, optimizer, device, scheduler)
            score, val_outputs = engine.eval_fn(val_loader, model, device)

            LOGGER.info(f"Jaccard Score = {score}")

            if score > best_score:
                best_score = score
                best_epoch = epoch
                torch.save(
                    model.state_dict(),
                    os.path.join(config.OUT_DIR,
                                 '{}_fold{}.pth'.format(EXP_ID, fold_id)))
                to_pickle(
                    os.path.join(config.OUT_DIR,
                                 "{}_fold{}_oof.pkl".format(EXP_ID, fold_id)),
                    [val_idx, val_outputs])
                LOGGER.info("save model at score={} on epoch={}".format(
                    best_score, best_epoch))

        LOGGER.info("best score={} on epoch={}".format(best_score, best_epoch))
Example #6
def run_one_fold(fold_id):

    with timer('load csv data'):

        debug = config.DEBUG
        df_train = pd.read_csv(
            config.TRAIN_PATH).dropna().reset_index(drop=True)

        if debug:
            df_train = df_train.sample(
                1000, random_state=SEED).dropna().reset_index(drop=True)

        not_neutral_df = df_train[df_train['sentiment'] != 'neutral']
        df_train['is_text_start_with_white_space'] = df_train['text'].apply(
            lambda x: check_initial_white_space(x))
        neutral_texts = df_train[df_train['sentiment'] == 'neutral']['text']
        neutral_selected_texts = df_train[df_train['sentiment'] ==
                                          'neutral']['selected_text']
        df_train[
            'text_equal_selected_text'] = neutral_texts == neutral_selected_texts

        special_df = df_train[(df_train['text_equal_selected_text'] == 0) & (
            df_train['is_text_start_with_white_space'] == 0)]
        neutral_df_a = df_train[(df_train['text_equal_selected_text'] == 0) & (
            df_train['is_text_start_with_white_space'] == 1)]
        neutral_df_b = df_train[(df_train['text_equal_selected_text'] == 1) & (
            df_train['is_text_start_with_white_space'] == 0)]

        df_train = pd.concat([not_neutral_df, neutral_df_a,
                              neutral_df_b]).reset_index(drop=True)
        len_df_train = len(df_train)
        df_train = pd.concat([df_train, special_df]).reset_index(drop=True)

        num_folds = config.NUM_FOLDS
        kf = StratifiedKFold(n_splits=num_folds, random_state=SEED)
        splits = list(
            kf.split(X=df_train[:len_df_train],
                     y=df_train[:len_df_train][['sentiment']]))
        train_idx = splits[fold_id][0]
        val_idx = splits[fold_id][1]

        train_idx = np.concatenate([
            train_idx,
            np.arange(len_df_train, len_df_train + len(special_df))
        ])
        print(len(train_idx), len(val_idx))

        gc.collect()

    with timer('prepare validation data'):
        train_dataset = TweetDataset(
            tweet=df_train.iloc[train_idx].text.values,
            sentiment=df_train.iloc[train_idx].sentiment.values,
            selected_text=df_train.iloc[train_idx].selected_text.values)

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            shuffle=True,
            batch_size=config.TRAIN_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        val_dataset = TweetDataset(
            tweet=df_train.iloc[val_idx].text.values,
            sentiment=df_train.iloc[val_idx].sentiment.values,
            selected_text=df_train.iloc[val_idx].selected_text.values)

        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            shuffle=False,
            batch_size=config.VALID_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        del train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        model = TweetRoBERTaModel(config.ROBERTA_PATH)
        # model = TweetRoBERTaModelSimple(config.ROBERTA_PATH)
        # model = TweetRoBERTaModelConv1dHeadV2(config.ROBERTA_PATH)
        model = model.to(device)

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.001
            },
            {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            },
        ]

        num_train_steps = int(
            len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
        optimizer = transformers.AdamW(optimizer_parameters,
                                       lr=3e-5,
                                       correct_bias=False)
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

        # model = nn.DataParallel(model)

        # pretrain_path = 'models/exp11_fold0.pth'
        # model.load_state_dict(torch.load(pretrain_path))
        # LOGGER.info(f'pretrained model (exp11) loaded')

    with timer('training loop'):
        best_score = -999
        best_epoch = 0
        patience = 3
        p = 0
        for epoch in range(1, config.EPOCHS + 1):

            LOGGER.info("Starting {} epoch...".format(epoch))

            engine.train_fn(train_loader, model, optimizer, device, scheduler)
            score, val_loss = engine.eval_fn(val_loader, model, device)

            if score > best_score:
                best_score = score
                best_epoch = epoch
                torch.save(
                    model.state_dict(),
                    os.path.join(config.OUT_DIR,
                                 '{}_fold{}.pth'.format(EXP_ID, fold_id)))
                LOGGER.info("save model at score={} on epoch={}".format(
                    best_score, best_epoch))
                p = 0

            if p > 0:
                LOGGER.info(
                    f'best score has not improved for {p} epoch(s)')
            p += 1
            if p > patience:
                LOGGER.info(f'Early Stopping')
                break

        LOGGER.info("best score={} on epoch={}".format(best_score, best_epoch))
Example #7
def run_one_fold(fold_id):

    with timer('load csv data'):

        debug = config.DEBUG
        df_train = pd.read_csv(
            config.TRAIN_PATH).dropna().reset_index(drop=True)

        if debug:
            df_train = df_train.sample(
                1000, random_state=SEED).dropna().reset_index(drop=True)

        num_folds = 5
        kf = StratifiedKFold(n_splits=num_folds, random_state=SEED)
        splits = list(kf.split(X=df_train, y=df_train[['sentiment']]))
        train_idx = splits[fold_id][0]
        val_idx = splits[fold_id][1]

        print(len(train_idx), len(val_idx))

        gc.collect()

    with timer('prepare validation data'):
        train_dataset = TweetDataset(
            tweet=df_train.iloc[train_idx].text.values,
            sentiment=df_train.iloc[train_idx].sentiment.values,
            selected_text=df_train.iloc[train_idx].selected_text.values)

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            shuffle=True,
            batch_size=config.TRAIN_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        val_dataset = TweetDataset(
            tweet=df_train.iloc[val_idx].text.values,
            sentiment=df_train.iloc[val_idx].sentiment.values,
            selected_text=df_train.iloc[val_idx].selected_text.values)

        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            shuffle=False,
            batch_size=config.VALID_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        del train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        #model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
        model_config = transformers.BertConfig.from_pretrained(
            "bert-large-uncased-whole-word-masking", output_hidden_states=True)
        # model_config.output_hidden_states = True

        model = TweetModelLargeWWM("bert-large-uncased-whole-word-masking",
                                   model_config)
        model = model.to(device)

        # t_max=10
        # scheduler_cosine = CosineAnnealingLR(optimizer, T_max=t_max)
        # scheduler = GradualWarmupScheduler(optimizer, multiplier=1.1, total_epoch=5,
        #                                    after_scheduler=scheduler_cosine)

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.001
            },
            {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            },
        ]

        num_train_steps = int(
            len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
        optimizer = transformers.AdamW(optimizer_parameters, lr=3e-5)
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

        model = nn.DataParallel(model)

        # https://www.kaggle.com/irustandi/bertlargeuncasedwwmfinetunedsquad
        # pretrain_path = 'inputs/bert-large-uncased-wwm-finetuned-squad/pytorch_model.bin'
        # model.load_state_dict(torch.load(pretrain_path))
        # LOGGER.info(f'pretrained model (WWM uncased squad) loaded')

    with timer('training loop'):
        best_score = -999
        best_epoch = 0
        patience = 2
        p = 0
        for epoch in range(1, config.EPOCHS + 1):

            LOGGER.info("Starting {} epoch...".format(epoch))

            engine.train_fn(train_loader, model, optimizer, device, scheduler)
            score = engine.eval_fn(val_loader, model, device)

            LOGGER.info(f"Jaccard Score = {score}")

            if score > best_score:
                best_score = score
                best_epoch = epoch
                torch.save(
                    model.state_dict(),
                    os.path.join(config.OUT_DIR,
                                 '{}_fold{}.pth'.format(EXP_ID, fold_id)))
                LOGGER.info("save model at score={} on epoch={}".format(
                    best_score, best_epoch))
                p = 0

            if p > 0:
                LOGGER.info(
                    f'best score has not improved for {p} epoch(s)')
            p += 1
            if p > patience:
                LOGGER.info(f'Early Stopping')
                break

        LOGGER.info("best score={} on epoch={}".format(best_score, best_epoch))