Example #1
0
def notify_results():
    competition_name = 'SIIM\n'
    model_name = submission_file
    comment = 'version={}'.format(args.version)
    message = competition_name + '\n' + model_name + '\n' + comment + '\n'
    send_line_notification(message=message)

    print(message)
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode='tq_a',
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode='tq_a',
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader,
                                       DEVICE)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader, DEVICE, mode='valid')

            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [
                    val_loss,
                ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [
                    val_metric,
                ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [
                    val_metric_raws,
                ]

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}', logger)
            model = model.to('cpu')
            model = model.module
            save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model,
                            optimizer, scheduler, histories, val_y_preds,
                            val_y_trues, val_qa_ids, fold, epoch, val_loss,
                            val_metric)
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(
                histories["val_metric"][fold])])
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer,
                                      clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
Example #3
0
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f}'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)


if __name__ == '__main__':
    args = parse_args(None)
    log_file = f'{EXP_ID}.log'
    logger = getLogger(__name__)
    logger = logInit(logger, f'{MNT_DIR}/logs/', log_file)
    sel_log(f'args: {sorted(vars(args).items())}', logger)

    send_line_notification(f' ------------- start {EXP_ID} ------------- ')
    main(args, logger)
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1
    # aug_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/ContextualWordEmbsAug_sub_df.pkl')
    # aug_df['is_original'] = 0

    # trn_df = pd.concat([trn_df, aug_df], axis=0).reset_index(drop=True)

    gkf = GroupKFold(
        n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            continue
        fold_trn_df = trn_df.iloc[trn_idx]# .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'], axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'], axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(list(itertools.chain.from_iterable(
            fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
            fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
            fold_trn_df.answer.apply(lambda x: x.split(' '))
        ))).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        model = BertModelForBinaryMultiLabelClassifier(num_labels=30,
                                                       pretrained_model_name_or_path=MODEL_PRETRAIN,
                                                       # cat_num=5
                                                       )
        model.resize_token_embeddings(len(trn_dataset.tokenizer))
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = model.to(DEVICE)
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            val_loss, val_metric, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader)

            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [trn_loss, ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [val_loss, ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [val_metric, ]
            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f}',
                logger)
            model = model.to('cpu')
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric)
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
            trn_dataset.tokenizer)
        fold_metric = histories["val_metric"][fold]
        fold_metric_mean = np.mean(fold_metric)
        fold_metric_std = np.std(fold_metric)
        fold_stats = f'{fold_metric_mean:.4f} +- {fold_metric_std:.4f}'
        sel_log(fold_stats, logger)
        send_line_notification(fold_stats)
        del model
    sel_log('now saving best checkpoints...', logger)
Example #5
0
        test_df[i] = model.predict_proba(X_tst)[:, 1]
        if feat_df is None:
            feat_df = pd.DataFrame(index=X_trn.columns)
        feat_df[i] = model.feature_importances_

val_df = pd.DataFrame({
    'TARGET': y_train,
    'p': val_series
}).to_csv(OUTPUT / f'{NAME}_cv_pred.csv', index=False)
valid_score = np.mean(cv_results)

message = f"""cv: {valid_score: .5f}
feats: {feats}
model_params: {lgb_params}
fit_params: {fit_params}"""
send_line_notification(message)
print('=' * 60)
print(message)
print('=' * 60)

if rank_average:
    pred = test_df.apply(lambda x: rankdata(x) / len(x)).mean(axis=1).ravel()
else:
    pred = test_df.mean(axis=1).ravel()
generate_submit(pred, f'{NAME}_{valid_score:.5f}')

print('output feature importances')
feat_df.mean(axis=1).sort_values(ascending=False).to_csv(OUTPUT /
                                                         f'{NAME}_feat.csv')
imp = feat_df.mean(axis=1).sort_values(ascending=False)[:50]
imp[::-1].plot.barh(figsize=(20, 15))
def main(args, logger):
    trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')

    gkf = GroupKFold(
        n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body)

    histories = {
        'trn_loss': [],
        'val_loss': [],
        'val_metric': [],
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            continue
        trn_qa_ids = trn_df.iloc[trn_idx].qa_id
        val_qa_ids = trn_df.iloc[val_idx].qa_id
        if args.debug:
            trn_qa_ids = trn_qa_ids.sample(300, random_state=71)
            val_qa_ids = val_qa_ids.sample(300, random_state=71)

        trn_dataset = QUESTDataset(
            mode='train',
            qa_ids=trn_qa_ids,
            augment=[],
            pretrained_model_name_or_path=PRETRAIN,
            data_path=f'{MNT_DIR}/inputs/origin/')
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            mode='valid',
            qa_ids=val_qa_ids,
            augment=[],
            pretrained_model_name_or_path='bert-base-uncased',
            data_path=f'{MNT_DIR}/inputs/origin/')
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = soft_binary_cross_entropy
        model = BertModelForBinaryMultiLabelClassifier(num_labels=30,
                                                       pretrained_model_name_or_path=PRETRAIN
                                                       ).to(DEVICE)
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            val_loss, val_metric, val_y_preds, val_y_trues, val_qa_ids = test(
                model, val_loader)

            scheduler.step()
            histories['trn_loss'].append(trn_loss)
            histories['val_loss'].append(val_loss)
            histories['val_metric'].append(val_metric)
            sel_log(
                f'{trn_loss.detach().to("cpu").numpy()} -- '
                f'{val_loss.detach().to("cpu").numpy()} -- '
                f'{val_metric}',
                logger)
            save_checkpoint(
                f'{MNT_DIR}/checkpoints',
                EXP_ID,
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric)
        del model
        send_line_notification(f'finished fold {fold}')
    sel_log('now saving best checkpoints...', logger)
    save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/e002/', val_dataset.tokenizer)
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    tst_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/test.csv')
    trn_df = pd.concat([trn_df, tst_df], axis=0).fillna(-1)
    trn_df['is_original'] = 1
    # raw_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/raw_pseudo_tst_df.csv')
    # half_opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/half_opt_pseudo_tst_df.csv')
    # opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/opt_pseudo_tst_df.csv')

    # clean texts
    # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])

    # load additional tokens
    # with open('./mnt/inputs/nes_info/trn_over_10_vocab.pkl', 'rb') as fin:
    #     additional_tokens = pickle.load(fin)

    gkf = GroupKFold(
        n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold > 0:
            break
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ', logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(
            ['is_original', 'question_body_le'], axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(
            ['is_original', 'question_body_le'], axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            trn_df = trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(list(itertools.chain.from_iterable(
            fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
            fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
            fold_trn_df.answer.apply(lambda x: x.split(' '))
        ))).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]#  + additional_tokens
        fold_trn_df = trn_df.drop(['is_original', 'question_body_le'], axis=1)

        # fold_trn_df = pd.concat([fold_trn_df, raw_pseudo_df, opt_pseudo_df, half_opt_pseudo_df], axis=0)

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='</s>',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            use_category=False,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        model = RobertaForMaskedLM.from_pretrained(MODEL_PRETRAIN)

        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue

            # model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch_ML(model, optimizer, trn_loader, DEVICE)

            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [trn_loss, ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(trn_loss)
            else:
                histories['val_loss'][fold] = [trn_loss, ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(trn_loss)
            else:
                histories['val_metric'][fold] = [trn_loss, ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(trn_loss)
            else:
                histories['val_metric_raws'][fold] = [trn_loss, ]

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- ',
                logger)
            model = model.to('cpu')
            # model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                [],
                [],
                [],
                fold,
                epoch,
                trn_loss,
                trn_loss,
                )
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
            trn_dataset.tokenizer,
            clean=False)
        del model

    send_line_notification('fini!')

    sel_log('now saving best checkpoints...', logger)
Example #8
0
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1
    # aug_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/ContextualWordEmbsAug_sub_df.pkl')
    # aug_df['is_original'] = 0

    # trn_df = pd.concat([trn_df, aug_df], axis=0).reset_index(drop=True)

    # convert target to labels
    target_cols = ['question_asker_intent_understanding',
                   'question_body_critical',
                   'question_conversational',
                   'question_expect_short_answer',
                   'question_fact_seeking',
                   'question_has_commonly_accepted_answer',
                   'question_interestingness_others',
                   'question_interestingness_self',
                   'question_multi_intent',
                   'question_not_really_a_question',
                   'question_opinion_seeking',
                   'question_type_choice',
                   'question_type_compare',
                   'question_type_consequence',
                   'question_type_definition',
                   'question_type_entity',
                   'question_type_instructions',
                   'question_type_procedure',
                   'question_type_reason_explanation',
                   'question_type_spelling',
                   'question_well_written',
                   'answer_helpful',
                   'answer_level_of_information',
                   'answer_plausible',
                   'answer_relevance',
                   'answer_satisfaction',
                   'answer_type_instructions',
                   'answer_type_procedure',
                   'answer_type_reason_explanation',
                   'answer_well_written']
    for target_col in target_cols:
        map_dict = trn_df[target_col]\
            .drop_duplicates()\
            .sort_values()\
            .reset_index(drop=True)\
            .reset_index()\
            .set_index(target_col)\
            .to_dict()['index']
        trn_df.loc[:, target_col] = trn_df[target_col].map(map_dict).values

    gkf = GroupKFold(
        n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    # calc max_seq_len using quest dataset
    # max_seq_len = QUESTDataset(
    #     df=trn_df,
    #     mode='train',
    #     tokens=[],
    #     augment=[],
    #     pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
    # ).MAX_SEQUENCE_LENGTH
    # max_seq_len = 9458
    # max_seq_len = 1504
    max_seq_len = 512

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ', logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(
            ['is_original', 'question_body_le'], axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(
            ['is_original', 'question_body_le'], axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(list(itertools.chain.from_iterable(
            fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
            fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
            fold_trn_df.answer.apply(lambda x: x.split(' '))
        ))).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        # fobj = BCEWithLogitsLoss()
        # fobj = MSELoss()
        fobj = CrossEntropyLoss()
        model = BertModelForBinaryMultiLabelClassifier(num_labels=30,
                                                       pretrained_model_name_or_path=MODEL_PRETRAIN,
                                                       # cat_num=5,
                                                       token_size=len(
                                                           trn_dataset.tokenizer),
                                                       MAX_SEQUENCE_LENGTH=max_seq_len,
                                                       )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader)

            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [trn_loss, ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [val_loss, ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [val_metric, ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [val_metric_raws, ]

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}',
                logger)
            model = model.to('cpu')
            model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric)
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
            trn_dataset.tokenizer)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
def run(name, feats, params, fit_params, fill=-9999):
    logger = getLogger(name)
    logger.setLevel(DEBUG)

    ch = StreamHandler()
    ch.setLevel(DEBUG)

    handler = StreamHandler()
    handler.setLevel(DEBUG)
    logger.setLevel(DEBUG)
    logger.addHandler(handler)

    train = pd.read_feather(str(TRAIN))

    with timer('load datasets'):
        X_train, y_train, X_test, cv = load_dataset(feats)
        print('train:', X_train.shape)
        print('test :', X_test.shape)

    with timer('clean datasets'):
        # drop id cols
        id_cols = X_train.filter(regex='(SK_ID_CURR|SK_ID_PREV)').columns
        print('drop id:', id_cols.tolist())
        X_train.drop(id_cols, axis=1, inplace=True)
        X_test.drop(id_cols, axis=1, inplace=True)

        # drop columns which contains many NaN
        ref_train = X_train.isnull().mean() > 0.95
        ref_test = X_test.isnull().mean() > 0.95
        nan_cols = X_train.columns[ref_train | ref_test]
        print('drop many nan:', nan_cols.tolist())
        X_train.drop(nan_cols, axis=1, inplace=True)
        X_test.drop(nan_cols, axis=1, inplace=True)

        print('train:', X_train.shape)
        print('test :', X_test.shape)

    with timer('impute missing'):
        if fill == 'mean':
            assert X_train.mean().isnull().sum() == 0
            print('fill nan with mean')
            X_train.fillna(X_train.mean(), inplace=True)
            X_test.fillna(X_train.mean(), inplace=True)
        else:
            print(f'fill nan with {fill}')
            X_train.fillna(fill, inplace=True)
            X_test.fillna(fill, inplace=True)
        assert X_train.isnull().sum().sum() == 0
        assert X_test.isnull().sum().sum() == 0

    if 'colsample_bytree' in params and params['colsample_bytree'] == 'auto':
        n_samples = X_train.shape[1]
        params['colsample_bytree'] = np.sqrt(n_samples) / n_samples
        print(f'set colsample_bytree = {params["colsample_bytree"]}')

    with timer('training'):
        cv_results = []
        cv_df = pd.DataFrame(index=range(len(y_train)),
                             columns=range(cv.get_n_splits()))
        test_df = pd.DataFrame()
        feat_df = None
        for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
            X_trn = X_train.iloc[trn_idx].copy()
            y_trn = y_train[trn_idx]
            X_val = X_train.iloc[val_idx].copy()
            y_val = y_train[val_idx]
            X_tst = X_test.copy()
            print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)

            with timer('target encoding'):
                cat_cols = X_trn.select_dtypes(['object']).columns.tolist()
                te = TargetEncoder(cols=cat_cols)
                X_trn.loc[:,
                          cat_cols] = te.fit_transform(X_trn.loc[:, cat_cols],
                                                       y_trn)
                X_val.loc[:, cat_cols] = te.transform(X_val.loc[:, cat_cols])
                X_tst.loc[:, cat_cols] = te.transform(X_test.loc[:, cat_cols])

            with timer('fit'):
                model = lgb.LGBMClassifier(**params)
                model.fit(X_trn,
                          y_trn,
                          eval_set=[(X_val, y_val)],
                          **fit_params)

            p = model.predict_proba(X_val)[:, 1]
            cv_df.loc[val_idx, i] = p
            cv_results.append(roc_auc_score(y_val, p))
            test_df[i] = model.predict_proba(X_tst)[:, 1]
            if feat_df is None:
                feat_df = pd.DataFrame(index=X_trn.columns)
            feat_df[i] = model.feature_importances_

    valid_score = np.mean(cv_results)
    message = f"""cv: {valid_score:.5f}
scores: {[round(c, 4) for c in cv_results]}
feats: {feats}
model_params: {params}
fit_params: {fit_params}"""

    send_line_notification(message)

    with timer('output results'):
        RESULT_DIR = OUTPUT / (timestamp() + '_' + name)
        RESULT_DIR.mkdir()

        # output cv prediction
        tmp = pd.DataFrame({
            'SK_ID_CURR': train['SK_ID_CURR'],
            'TARGET': cv_df.mean(axis=1)
        })
        tmp.to_csv(RESULT_DIR / f'{name}_cv.csv', index=None)

        # output test prediction
        pred = test_df.mean(axis=1).ravel()
        generate_submit(pred,
                        f'{name}_{valid_score:.5f}',
                        RESULT_DIR,
                        compression=False)

        # output feature importances
        feat_df = (feat_df / feat_df.mean(axis=0)) * 100
        feat_df.mean(axis=1).sort_values(ascending=False).to_csv(RESULT_DIR /
                                                                 'feats.csv')
        imp = feat_df.mean(axis=1).sort_values(ascending=False)[:50]
        imp[::-1].plot.barh(figsize=(20, 15))
        plt.savefig(str(RESULT_DIR / 'feature_importances.pdf'),
                    bbox_inches='tight')

        print('=' * 60)
        print(message)
        print('=' * 60)