def notify_results():
    """Send a short run summary to LINE and echo it on stdout.

    The summary consists of the competition header, the module-level
    ``submission_file`` and the run version taken from the module-level
    ``args``; every part is followed by a newline.
    """
    header = 'SIIM\n'
    version_note = f'version={args.version}'
    # The header already ends with a newline, so the extra terminator added
    # here leaves a blank line between the header and the file name.
    parts = (header, submission_file, version_note)
    message = ''.join(part + '\n' for part in parts)
    send_line_notification(message=message)
    print(message)
def main(args, logger):
    """Run 5-fold grouped cross-validation training and report the results.

    The training frame is read from ``trn_df.pkl`` and split with
    ``GroupKFold`` using ``question_body_le`` as the group key.  For every
    fold a ``BertModelForBinaryMultiLabelClassifier`` is trained for
    ``MAX_EPOCH`` epochs with ``BCEWithLogitsLoss``; a checkpoint is written
    after each epoch.  After all folds, the mean/std of the per-fold best
    validation metric and the mean of the matching raw metrics are logged
    and sent via ``send_line_notification``.

    Args:
        args: parsed CLI arguments; ``args.checkpoint`` (path to resume from,
            falsy to start fresh) and ``args.debug`` (subsample each fold to
            100 rows) are read.
        logger: logger handed to ``sel_log`` and ``freeze_unfreeze_bert``.
    """
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    # Special tokens registered with the tokenizer.  The former whitespace
    # token frequency count was removed: its result was overwritten by this
    # list before ever being read.
    tokens = [
        name.casefold()
        for name in ('CAT_TECHNOLOGY', 'CAT_STACKOVERFLOW', 'CAT_CULTURE',
                     'CAT_SCIENCE', 'CAT_LIFE_ARTS')
    ]

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            # Fold finished in a previous run: reuse its recorded history.
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue

        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        helper_cols = ['is_original', 'question_body_le']
        fold_trn_df = trn_df.iloc[trn_idx].drop(helper_cols, axis=1)
        # Validate on original rows only.
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(helper_cols, axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)

        dataset_kwargs = dict(
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode='tq_a',
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        loader_kwargs = dict(
            batch_size=BATCH_SIZE,
            num_workers=os.cpu_count(),
            # Re-seed numpy in every worker process.
            worker_init_fn=lambda x: np.random.seed(),
            pin_memory=True,
        )

        trn_dataset = QUESTDataset(df=fold_trn_df, mode='train',
                                   **dataset_kwargs)
        trn_loader = DataLoader(trn_dataset,
                                sampler=RandomSampler(data_source=trn_dataset),
                                drop_last=True,
                                **loader_kwargs)

        val_dataset = QUESTDataset(df=fold_val_df, mode='valid',
                                   **dataset_kwargs)
        val_loader = DataLoader(val_dataset,
                                sampler=RandomSampler(data_source=val_dataset),
                                drop_last=False,
                                **loader_kwargs)

        fobj = BCEWithLogitsLoss()
        state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # Restore model / optimizer / scheduler state when resuming this fold.
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(range(MAX_EPOCH)):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue  # epoch already covered by the loaded checkpoint

            # BERT stays frozen during the first epoch only.
            model.freeze_unfreeze_bert(freeze=epoch < 1, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)

            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader,
                                       DEVICE)
            (val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues,
             val_qa_ids) = test(model, fobj, val_loader, DEVICE, mode='valid')
            scheduler.step()

            for key, value in (('trn_loss', trn_loss),
                               ('val_loss', val_loss),
                               ('val_metric', val_metric),
                               ('val_metric_raws', val_metric_raws)):
                histories[key].setdefault(fold, []).append(value)

            logging_val_metric_raws = ''.join(
                f'{float(val_metric_raw):.4f}, '
                for val_metric_raw in val_metric_raws)
            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}', logger)

            # Unwrap DataParallel and move to CPU before saving.
            model = model.to('cpu')
            model = model.module
            save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model,
                            optimizer, scheduler, histories, val_y_preds,
                            val_y_trues, val_qa_ids, fold, epoch, val_loss,
                            val_metric)

        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(
                histories["val_metric"][fold])])
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer,
                                      clean=False)
        del model

    # Cross-fold training statistics.
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''.join(
        f'{float(metric_stats_raw):.4f},'
        for metric_stats_raw in fold_best_metrics_raws_mean)
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
# NOTE(review): the statements before the ``__main__`` guard look like the
# tail of a training function whose header is outside this span (they rely on
# `fold`, `trn_dataset`, `model`, `fold_best_metrics`, ... already existing).
# The original indentation cannot be recovered from here -- confirm where
# they belong before relying on this layout.
save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                              trn_dataset.tokenizer)
del model

# Cross-fold training statistics.
fold_best_metric_mean = np.mean(fold_best_metrics)
fold_best_metric_std = np.std(fold_best_metrics)
fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
sel_log(fold_stats, logger)
send_line_notification(fold_stats)

fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
# Each value is followed by a comma; without a separator the formatted
# numbers run together (e.g. "0.12340.5678") and cannot be read back.
fold_raw_stats = ''.join(
    f'{float(metric_stats_raw):.4f},'
    for metric_stats_raw in fold_best_metrics_raws_mean)
sel_log(fold_raw_stats, logger)
send_line_notification(fold_raw_stats)

sel_log('now saving best checkpoints...', logger)


if __name__ == '__main__':
    # Script entry point: parse arguments, set up file logging, announce the
    # experiment and start training.
    args = parse_args(None)
    log_file = f'{EXP_ID}.log'
    logger = getLogger(__name__)
    logger = logInit(logger, f'{MNT_DIR}/logs/', log_file)
    sel_log(f'args: {sorted(vars(args).items())}', logger)
    send_line_notification(f' ------------- start {EXP_ID} ------------- ')
    main(args, logger)
def main(args, logger):
    """Run 5-fold grouped cross-validation training of the BERT classifier.

    The training frame is read from ``trn_df.pkl`` and split with
    ``GroupKFold`` using ``question_body_le`` as the group key.  For every
    fold a ``BertModelForBinaryMultiLabelClassifier`` (30 labels) is trained
    for ``MAX_EPOCH`` epochs with ``BCEWithLogitsLoss``; a checkpoint is
    written after each epoch and the mean/std of the fold's per-epoch
    validation metric is logged and sent via ``send_line_notification``.

    Args:
        args: parsed CLI arguments; ``args.checkpoint`` (path to resume from,
            falsy to start fresh) and ``args.debug`` (subsample each fold to
            100 rows) are read.
        logger: logger handed to ``sel_log`` and ``freeze_unfreeze_bert``.
    """
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    # No augmented rows are concatenated here, so every row is "original".
    trn_df['is_original'] = 1

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    # Special tokens registered with the tokenizer.  The former whitespace
    # token frequency count was removed: its result was overwritten by this
    # list before ever being read.
    tokens = [
        name.casefold()
        for name in ('CAT_TECHNOLOGY', 'CAT_STACKOVERFLOW', 'CAT_CULTURE',
                     'CAT_SCIENCE', 'CAT_LIFE_ARTS')
    ]

    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            continue  # fold finished in a previous run

        helper_cols = ['is_original', 'question_body_le']
        fold_trn_df = trn_df.iloc[trn_idx].drop(helper_cols, axis=1)
        # Validate on original rows only.
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(helper_cols, axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)

        loader_kwargs = dict(
            batch_size=BATCH_SIZE,
            num_workers=os.cpu_count(),
            # Re-seed numpy in every worker process.
            worker_init_fn=lambda x: np.random.seed(),
            pin_memory=True,
        )

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
        )
        trn_loader = DataLoader(trn_dataset,
                                sampler=RandomSampler(data_source=trn_dataset),
                                drop_last=True,
                                **loader_kwargs)

        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
        )
        val_loader = DataLoader(val_dataset,
                                sampler=RandomSampler(data_source=val_dataset),
                                drop_last=False,
                                **loader_kwargs)

        fobj = BCEWithLogitsLoss()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=30,
            pretrained_model_name_or_path=MODEL_PRETRAIN,
        )
        # The tokenizer was extended with the special tokens above.
        model.resize_token_embeddings(len(trn_dataset.tokenizer))
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # Restore model / optimizer / scheduler state when resuming this fold.
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(range(MAX_EPOCH)):
            # BERT stays frozen during the first epoch only.  This runs even
            # for epochs skipped below, as in the original control flow.
            model.freeze_unfreeze_bert(freeze=epoch < 1, logger=logger)
            model = model.to(DEVICE)
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue  # epoch already covered by the loaded checkpoint

            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            val_loss, val_metric, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader)
            scheduler.step()

            for key, value in (('trn_loss', trn_loss),
                               ('val_loss', val_loss),
                               ('val_metric', val_metric)):
                histories[key].setdefault(fold, []).append(value)

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f}',
                logger)

            model = model.to('cpu')
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric)

        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
            trn_dataset.tokenizer)

        # Mean / std of the validation metric over this fold's epochs.
        fold_metric = histories["val_metric"][fold]
        fold_metric_mean = np.mean(fold_metric)
        fold_metric_std = np.std(fold_metric)
        fold_stats = f'{fold_metric_mean:.4f} +- {fold_metric_std:.4f}'
        sel_log(fold_stats, logger)
        send_line_notification(fold_stats)
        del model

    sel_log('now saving best checkpoints...', logger)
test_df[i] = model.predict_proba(X_tst)[:, 1] if feat_df is None: feat_df = pd.DataFrame(index=X_trn.columns) feat_df[i] = model.feature_importances_ val_df = pd.DataFrame({ 'TARGET': y_train, 'p': val_series }).to_csv(OUTPUT / f'{NAME}_cv_pred.csv', index=False) valid_score = np.mean(cv_results) message = f"""cv: {valid_score: .5f} feats: {feats} model_params: {lgb_params} fit_params: {fit_params}""" send_line_notification(message) print('=' * 60) print(message) print('=' * 60) if rank_average: pred = test_df.apply(lambda x: rankdata(x) / len(x)).mean(axis=1).ravel() else: pred = test_df.mean(axis=1).ravel() generate_submit(pred, f'{NAME}_{valid_score:.5f}') print('output feature importances') feat_df.mean(axis=1).sort_values(ascending=False).to_csv(OUTPUT / f'{NAME}_feat.csv') imp = feat_df.mean(axis=1).sort_values(ascending=False)[:50] imp[::-1].plot.barh(figsize=(20, 15))
def main(args, logger):
    """Run 5-fold grouped cross-validation training driven by ``qa_id`` lists.

    ``train.csv`` is split with ``GroupKFold`` grouped on ``question_body``.
    For every fold the train/valid ``qa_id`` series are handed to
    ``QUESTDataset``, a ``BertModelForBinaryMultiLabelClassifier`` (30 labels)
    is trained for ``MAX_EPOCH`` epochs with ``soft_binary_cross_entropy``,
    and a checkpoint is written after each epoch.

    Args:
        args: parsed CLI arguments; ``args.checkpoint`` (path to resume from,
            falsy to start fresh) and ``args.debug`` (subsample each fold to
            300 ids) are read.
        logger: logger handed to ``sel_log``.
    """
    trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body,
    )

    # Flat per-epoch histories shared by all folds.
    histories = {
        'trn_loss': [],
        'val_loss': [],
        'val_metric': [],
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    data_path = f'{MNT_DIR}/inputs/origin/'
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            continue  # fold finished in a previous run

        trn_qa_ids = trn_df.iloc[trn_idx].qa_id
        val_qa_ids = trn_df.iloc[val_idx].qa_id
        if args.debug:
            trn_qa_ids = trn_qa_ids.sample(300, random_state=71)
            val_qa_ids = val_qa_ids.sample(300, random_state=71)

        loader_kwargs = dict(
            batch_size=BATCH_SIZE,
            num_workers=os.cpu_count(),
            # Re-seed numpy in every worker process.
            worker_init_fn=lambda x: np.random.seed(),
            pin_memory=True,
        )

        trn_dataset = QUESTDataset(
            mode='train',
            qa_ids=trn_qa_ids,
            augment=[],
            pretrained_model_name_or_path=PRETRAIN,
            data_path=data_path)
        trn_loader = DataLoader(trn_dataset,
                                sampler=RandomSampler(data_source=trn_dataset),
                                drop_last=True,
                                **loader_kwargs)

        # Use the same pretrained name as the training dataset and the model;
        # the previously hard-coded 'bert-base-uncased' silently diverged
        # whenever PRETRAIN pointed at a different checkpoint.
        val_dataset = QUESTDataset(
            mode='valid',
            qa_ids=val_qa_ids,
            augment=[],
            pretrained_model_name_or_path=PRETRAIN,
            data_path=data_path)
        val_loader = DataLoader(val_dataset,
                                sampler=RandomSampler(data_source=val_dataset),
                                drop_last=False,
                                **loader_kwargs)

        fobj = soft_binary_cross_entropy
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=30,
            pretrained_model_name_or_path=PRETRAIN,
        ).to(DEVICE)
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # Restore model / optimizer / scheduler state when resuming this fold.
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(range(MAX_EPOCH)):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue  # epoch already covered by the loaded checkpoint

            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            # Keep the ids returned by test() under their own name instead of
            # shadowing the fold's validation id series.
            (val_loss, val_metric, val_y_preds, val_y_trues,
             epoch_val_qa_ids) = test(model, val_loader)
            scheduler.step()

            histories['trn_loss'].append(trn_loss)
            histories['val_loss'].append(val_loss)
            histories['val_metric'].append(val_metric)
            sel_log(
                f'{trn_loss.detach().to("cpu").numpy()} -- '
                f'{val_loss.detach().to("cpu").numpy()} -- '
                f'{val_metric}',
                logger)
            save_checkpoint(
                f'{MNT_DIR}/checkpoints',
                EXP_ID,
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                epoch_val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric)

        del model
        send_line_notification(f'finished fold {fold}')

    sel_log('now saving best checkpoints...', logger)
    # NOTE(review): the experiment directory is hard-coded as 'e002' while
    # save_checkpoint receives EXP_ID -- confirm they are meant to match.
    save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/e002/',
                                  val_dataset.tokenizer)
def main(args, logger):
    """Fine-tune ``RobertaForMaskedLM`` on the combined train and test frames.

    The pickled training frame and ``test.csv`` are concatenated (missing
    values filled with -1) and the whole frame is fed to ``QUESTDataset`` in
    ``'train'`` mode.  Only the first ``GroupKFold`` split index is processed
    (the loop breaks for ``fold > 0``); the split indices themselves are not
    used because training runs on the full frame.  A checkpoint is written
    after each epoch.

    Args:
        args: parsed CLI arguments; ``args.checkpoint`` (path to resume from,
            falsy to start fresh) and ``args.debug`` (subsample the frame to
            100 rows) are read.
        logger: logger handed to ``sel_log``.
    """
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    tst_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/test.csv')
    trn_df = pd.concat([trn_df, tst_df], axis=0).fillna(-1)
    trn_df['is_original'] = 1

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    # Special tokens registered with the tokenizer.
    tokens = [
        name.casefold()
        for name in ('CAT_TECHNOLOGY', 'CAT_STACKOVERFLOW', 'CAT_CULTURE',
                     'CAT_SCIENCE', 'CAT_LIFE_ARTS')
    ]

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, _split in enumerate(gkf):
        if fold > 0:
            break  # a single pass over the full data is all that is needed
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
            continue

        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)

        # The per-fold train/valid frames and the token frequency count that
        # used to be built here were removed: none of them was read before
        # being overwritten or dropped.
        if args.debug:
            trn_df = trn_df.sample(100, random_state=71)
        fold_trn_df = trn_df.drop(['is_original', 'question_body_le'], axis=1)

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='</s>',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            use_category=False,
        )
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                # Re-seed numpy in every worker process.
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)

        model = RobertaForMaskedLM.from_pretrained(MODEL_PRETRAIN)
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # Restore model / optimizer / scheduler state when resuming this fold.
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(range(MAX_EPOCH)):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue  # epoch already covered by the loaded checkpoint

            model = model.to(DEVICE)
            trn_loss = train_one_epoch_ML(model, optimizer, trn_loader, DEVICE)
            scheduler.step()

            # There is no validation pass here; the training loss is stored
            # under every history key.
            for key in ('trn_loss', 'val_loss', 'val_metric',
                        'val_metric_raws'):
                histories[key].setdefault(fold, []).append(trn_loss)

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- ',
                logger)

            model = model.to('cpu')
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                [],
                [],
                [],
                fold,
                epoch,
                trn_loss,
                trn_loss,
            )

        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
            trn_dataset.tokenizer,
            clean=False)
        del model

    send_line_notification('fini!')
    sel_log('now saving best checkpoints...', logger)
def main(args, logger):
    """Run 5-fold grouped CV training with targets converted to class indices.

    Every target column of the pickled training frame is replaced by the
    position of its value among that column's sorted distinct values.  The
    frame is then split with ``GroupKFold`` (grouped on ``question_body_le``)
    and, per fold, a ``BertModelForBinaryMultiLabelClassifier`` (30 labels) is
    trained for ``MAX_EPOCH`` epochs with ``CrossEntropyLoss``; a checkpoint
    is written after each epoch.  After all folds, the mean/std of the
    per-fold best validation metric and the mean of the matching raw metrics
    are logged and sent via ``send_line_notification``.

    Args:
        args: parsed CLI arguments; ``args.checkpoint`` (path to resume from,
            falsy to start fresh) and ``args.debug`` (subsample each fold to
            100 rows) are read.
        logger: logger handed to ``sel_log`` and ``freeze_unfreeze_bert``.
    """
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    # No augmented rows are concatenated here, so every row is "original".
    trn_df['is_original'] = 1

    # Convert targets to labels.
    target_cols = [
        'question_asker_intent_understanding',
        'question_body_critical',
        'question_conversational',
        'question_expect_short_answer',
        'question_fact_seeking',
        'question_has_commonly_accepted_answer',
        'question_interestingness_others',
        'question_interestingness_self',
        'question_multi_intent',
        'question_not_really_a_question',
        'question_opinion_seeking',
        'question_type_choice',
        'question_type_compare',
        'question_type_consequence',
        'question_type_definition',
        'question_type_entity',
        'question_type_instructions',
        'question_type_procedure',
        'question_type_reason_explanation',
        'question_type_spelling',
        'question_well_written',
        'answer_helpful',
        'answer_level_of_information',
        'answer_plausible',
        'answer_relevance',
        'answer_satisfaction',
        'answer_type_instructions',
        'answer_type_procedure',
        'answer_type_reason_explanation',
        'answer_well_written',
    ]
    for target_col in target_cols:
        # value -> rank among the sorted distinct values of the column
        map_dict = (
            trn_df[target_col]
            .drop_duplicates()
            .sort_values()
            .reset_index(drop=True)
            .reset_index()
            .set_index(target_col)
            .to_dict()['index']
        )
        trn_df.loc[:, target_col] = trn_df[target_col].map(map_dict).values

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    max_seq_len = 512

    # Special tokens registered with the tokenizer.  The former whitespace
    # token frequency count was removed: its result was overwritten by this
    # list before ever being read.
    tokens = [
        name.casefold()
        for name in ('CAT_TECHNOLOGY', 'CAT_STACKOVERFLOW', 'CAT_CULTURE',
                     'CAT_SCIENCE', 'CAT_LIFE_ARTS')
    ]

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            # Fold finished in a previous run: reuse its recorded history.
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
            continue

        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        helper_cols = ['is_original', 'question_body_le']
        fold_trn_df = trn_df.iloc[trn_idx].drop(helper_cols, axis=1)
        # Validate on original rows only.
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(helper_cols, axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)

        dataset_kwargs = dict(
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        loader_kwargs = dict(
            batch_size=BATCH_SIZE,
            num_workers=os.cpu_count(),
            # Re-seed numpy in every worker process.
            worker_init_fn=lambda x: np.random.seed(),
            pin_memory=True,
        )

        trn_dataset = QUESTDataset(df=fold_trn_df, mode='train',
                                   **dataset_kwargs)
        trn_loader = DataLoader(trn_dataset,
                                sampler=RandomSampler(data_source=trn_dataset),
                                drop_last=True,
                                **loader_kwargs)

        val_dataset = QUESTDataset(df=fold_val_df, mode='valid',
                                   **dataset_kwargs)
        val_loader = DataLoader(val_dataset,
                                sampler=RandomSampler(data_source=val_dataset),
                                drop_last=False,
                                **loader_kwargs)

        fobj = CrossEntropyLoss()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=30,
            pretrained_model_name_or_path=MODEL_PRETRAIN,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # Restore model / optimizer / scheduler state when resuming this fold.
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(range(MAX_EPOCH)):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue  # epoch already covered by the loaded checkpoint

            # BERT stays frozen during the first epoch only.
            model.freeze_unfreeze_bert(freeze=epoch < 1, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)

            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            (val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues,
             val_qa_ids) = test(model, fobj, val_loader)
            scheduler.step()

            for key, value in (('trn_loss', trn_loss),
                               ('val_loss', val_loss),
                               ('val_metric', val_metric),
                               ('val_metric_raws', val_metric_raws)):
                histories[key].setdefault(fold, []).append(value)

            logging_val_metric_raws = ''.join(
                f'{float(val_metric_raw):.4f}, '
                for val_metric_raw in val_metric_raws)
            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}',
                logger)

            # Unwrap DataParallel and move to CPU before saving.
            model = model.to('cpu')
            model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric)

        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
            trn_dataset.tokenizer)
        del model

    # Cross-fold training statistics.
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''.join(
        f'{float(metric_stats_raw):.4f},'
        for metric_stats_raw in fold_best_metrics_raws_mean)
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
def run(name, feats, params, fit_params, fill=-9999):
    """Cross-validate a LightGBM classifier and write predictions and reports.

    Loads the feature set, drops id and mostly-NaN columns, imputes missing
    values, then for every CV split target-encodes the object columns, fits
    an ``lgb.LGBMClassifier`` and collects out-of-fold predictions, test
    predictions and feature importances.  The mean AUC is sent via
    ``send_line_notification`` and all artefacts are written to a new
    timestamped directory under ``OUTPUT``.

    Args:
        name: experiment name; used for the logger, the result directory and
            the output file names.
        feats: feature specification passed to ``load_dataset``.
        params: keyword arguments for ``lgb.LGBMClassifier``.  A
            ``'colsample_bytree'`` value of ``'auto'`` is replaced by
            ``sqrt(n_features) / n_features``.  The caller's dict is not
            modified.
        fit_params: extra keyword arguments for ``model.fit``.
        fill: value used to fill NaNs, or ``'mean'`` to use the training
            column means.
    """
    logger = getLogger(name)
    logger.setLevel(DEBUG)
    handler = StreamHandler()
    handler.setLevel(DEBUG)
    logger.addHandler(handler)

    # Work on a copy: resolving 'auto' below must not leak into the caller's
    # dict, otherwise a later run with a different feature count would reuse
    # the stale value.
    params = dict(params)

    train = pd.read_feather(str(TRAIN))

    with timer('load datasets'):
        X_train, y_train, X_test, cv = load_dataset(feats)
        print('train:', X_train.shape)
        print('test :', X_test.shape)

    with timer('clean datasets'):
        # drop id cols
        id_cols = X_train.filter(regex='(SK_ID_CURR|SK_ID_PREV)').columns
        print('drop id:', id_cols.tolist())
        X_train.drop(id_cols, axis=1, inplace=True)
        X_test.drop(id_cols, axis=1, inplace=True)

        # drop columns that are more than 95% NaN in train or test
        ref_train = X_train.isnull().mean() > 0.95
        ref_test = X_test.isnull().mean() > 0.95
        nan_cols = X_train.columns[ref_train | ref_test]
        print('drop many nan:', nan_cols.tolist())
        X_train.drop(nan_cols, axis=1, inplace=True)
        X_test.drop(nan_cols, axis=1, inplace=True)

        print('train:', X_train.shape)
        print('test :', X_test.shape)

    with timer('impute missing'):
        if fill == 'mean':
            # Compute the training means once and apply them to both frames.
            train_mean = X_train.mean()
            assert train_mean.isnull().sum() == 0
            print('fill nan with mean')
            X_train.fillna(train_mean, inplace=True)
            X_test.fillna(train_mean, inplace=True)
        else:
            print(f'fill nan with {fill}')
            X_train.fillna(fill, inplace=True)
            X_test.fillna(fill, inplace=True)
        assert X_train.isnull().sum().sum() == 0
        assert X_test.isnull().sum().sum() == 0

    if params.get('colsample_bytree') == 'auto':
        n_features = X_train.shape[1]
        params['colsample_bytree'] = np.sqrt(n_features) / n_features
        print(f'set colsample_bytree = {params["colsample_bytree"]}')

    with timer('training'):
        cv_results = []
        cv_df = pd.DataFrame(index=range(len(y_train)),
                             columns=range(cv.get_n_splits()))
        test_df = pd.DataFrame()
        feat_df = None
        for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
            X_trn = X_train.iloc[trn_idx].copy()
            y_trn = y_train[trn_idx]
            X_val = X_train.iloc[val_idx].copy()
            y_val = y_train[val_idx]
            X_tst = X_test.copy()
            print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)

            with timer('target encoding'):
                # The encoder is fitted on the training part of the fold only.
                cat_cols = X_trn.select_dtypes(['object']).columns.tolist()
                te = TargetEncoder(cols=cat_cols)
                X_trn.loc[:, cat_cols] = te.fit_transform(X_trn.loc[:, cat_cols], y_trn)
                X_val.loc[:, cat_cols] = te.transform(X_val.loc[:, cat_cols])
                X_tst.loc[:, cat_cols] = te.transform(X_test.loc[:, cat_cols])

            with timer('fit'):
                model = lgb.LGBMClassifier(**params)
                model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], **fit_params)

            p = model.predict_proba(X_val)[:, 1]
            cv_df.loc[val_idx, i] = p
            cv_results.append(roc_auc_score(y_val, p))
            test_df[i] = model.predict_proba(X_tst)[:, 1]

            if feat_df is None:
                feat_df = pd.DataFrame(index=X_trn.columns)
            feat_df[i] = model.feature_importances_

    valid_score = np.mean(cv_results)
    message = f"""cv: {valid_score:.5f} scores: {[round(c, 4) for c in cv_results]} feats: {feats} model_params: {params} fit_params: {fit_params}"""
    send_line_notification(message)

    with timer('output results'):
        RESULT_DIR = OUTPUT / (timestamp() + '_' + name)
        # parents=True: do not fail when OUTPUT itself does not exist yet.
        RESULT_DIR.mkdir(parents=True)

        # output cv prediction
        tmp = pd.DataFrame({
            'SK_ID_CURR': train['SK_ID_CURR'],
            'TARGET': cv_df.mean(axis=1)
        })
        tmp.to_csv(RESULT_DIR / f'{name}_cv.csv', index=None)

        # output test prediction (mean over folds, as a 1-D array)
        pred = test_df.mean(axis=1).values
        generate_submit(pred, f'{name}_{valid_score:.5f}', RESULT_DIR,
                        compression=False)

        # output feature importances, scaled so each fold's mean is 100
        feat_df = (feat_df / feat_df.mean(axis=0)) * 100
        mean_importance = feat_df.mean(axis=1).sort_values(ascending=False)
        mean_importance.to_csv(RESULT_DIR / 'feats.csv')
        imp = mean_importance[:50]
        imp[::-1].plot.barh(figsize=(20, 15))
        plt.savefig(str(RESULT_DIR / 'feature_importances.pdf'),
                    bbox_inches='tight')

    print('=' * 60)
    print(message)
    print('=' * 60)