# Shared imports assumed by the functions below; TweetModel, get_test_loader,
# get_train_val_loaders, train_model, loss_fn, eval_fn, TweetDataset, Config,
# set_seed, train, ensemble_infer and utils are defined elsewhere in the repo.
import os

import numpy as np
import pandas as pd
import torch
import transformers
from sklearn.model_selection import StratifiedKFold
from torch import optim
from torch.utils.data import DataLoader
from tqdm import tqdm


def test_it(MODEL_PATH='roberta-base'):
    # Load every checkpoint found under type/<model_type>/ into an ensemble.
    models = []
    for t in os.listdir('type'):
        for model_file in os.listdir(os.path.join('type', t)):
            model = TweetModel(MODEL_PATH=t)
            # Keep models on CPU until they are needed, to limit GPU memory use.
            model.cpu()
            model.load_state_dict(
                torch.load(os.path.join('type', t, model_file)))
            model.eval()
            models.append(model)

    test_df = pd.read_csv('data/test.csv')
    test_df['text'] = test_df['text'].astype(str)
    test_loader = get_test_loader(test_df, MODEL_PATH=MODEL_PATH)

    predictions = []
    for data in test_loader:
        ids = data['ids'].cuda()
        masks = data['masks'].cuda()
        tweet = data['tweet']
        offsets = data['offsets'].numpy()
        sentiment = data['sentiment']

        start_logits = []
        end_logits = []
        for model in models:
            with torch.no_grad():
                # Move each model to the GPU only for its forward pass.
                model.cuda()
                output = model(ids, masks)
                start_logits.append(
                    torch.softmax(output[0], dim=1).cpu().detach().numpy())
                end_logits.append(
                    torch.softmax(output[1], dim=1).cpu().detach().numpy())
                model.cpu()

        # Average the per-model probabilities over the ensemble.
        start_logits = np.mean(start_logits, axis=0)
        end_logits = np.mean(end_logits, axis=0)

        for i in range(len(ids)):
            start_pred = np.argmax(start_logits[i])
            end_pred = np.argmax(end_logits[i])
            sentiment_val = sentiment[i]
            original_tweet = tweet[i]
            if start_pred > end_pred:
                # An inverted span is unusable; fall back to the whole tweet.
                pred = original_tweet
            else:
                pred = get_selected_text(tweet[i], start_pred, end_pred, offsets[i])
            # Neutral and very short tweets score best when returned unchanged.
            if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
                pred = original_tweet
            predictions.append(pred)

    sub_df = pd.read_csv('data/sample_submission.csv')
    sub_df['selected_text'] = predictions
    # Post-process single-word predictions. '...' must be collapsed before '..',
    # otherwise the '...' rule can never match.
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace('!!!!', '!') if len(x.split()) == 1 else x)
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace('...', '.') if len(x.split()) == 1 else x)
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace('..', '.') if len(x.split()) == 1 else x)
    sub_df.to_csv('submission.csv', index=False)
    sub_df.head()
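# `get_selected_text` is used by both inference paths in this repo but is not
# defined in this file. A minimal sketch of the assumed behavior, reconstructed
# from the call sites (not necessarily the author's exact implementation): slice
# the original tweet from the first character of the predicted start token to the
# last character of the predicted end token, using the tokenizer's offsets.
def get_selected_text(text, start_idx, end_idx, offsets):
    selected_text = ""
    for ix in range(start_idx, end_idx + 1):
        # Append the characters covered by token ix.
        selected_text += text[offsets[ix][0]:offsets[ix][1]]
        # Re-insert a space when the next token is not adjacent in the raw text.
        if (ix + 1) < len(offsets) and offsets[ix][1] < offsets[ix + 1][0]:
            selected_text += " "
    return selected_text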
def main():
    seed = 42
    seed_everything(seed)

    num_epochs = 3
    batch_size = 32
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

    train_df = pd.read_csv('data/train.csv')
    train_df['text'] = train_df['text'].astype(str)
    train_df['selected_text'] = train_df['selected_text'].astype(str)

    # Train one model per fold, stratified on sentiment.
    for fold, (train_idx, val_idx) in enumerate(
            skf.split(train_df, train_df.sentiment), start=1):
        print(f'Fold: {fold}')
        model = TweetModel()
        optimizer = optim.AdamW(model.parameters(), lr=3e-5, betas=(0.9, 0.999))
        criterion = loss_fn
        dataloaders_dict = get_train_val_loaders(
            train_df, train_idx, val_idx, batch_size)
        train_model(model, dataloaders_dict, criterion, optimizer,
                    num_epochs, f'roberta_fold{fold}.pth')

    # Inference: average the softmax outputs of all fold checkpoints.
    test_df = pd.read_csv('data/test.csv')
    test_df['text'] = test_df['text'].astype(str)
    test_loader = get_test_loader(test_df)
    predictions = []
    models = []
    for fold in range(skf.n_splits):
        model = TweetModel()
        model.cuda()
        model.load_state_dict(torch.load(f'roberta_fold{fold+1}.pth'))
        model.eval()
        models.append(model)

    for data in test_loader:
        ids = data['ids'].cuda()
        masks = data['masks'].cuda()
        tweet = data['tweet']
        offsets = data['offsets'].numpy()

        start_logits = []
        end_logits = []
        for model in models:
            with torch.no_grad():
                output = model(ids, masks)
                start_logits.append(
                    torch.softmax(output[0], dim=1).cpu().detach().numpy())
                end_logits.append(
                    torch.softmax(output[1], dim=1).cpu().detach().numpy())

        start_logits = np.mean(start_logits, axis=0)
        end_logits = np.mean(end_logits, axis=0)

        for i in range(len(ids)):
            start_pred = np.argmax(start_logits[i])
            end_pred = np.argmax(end_logits[i])
            if start_pred > end_pred:
                pred = tweet[i]
            else:
                pred = get_selected_text(tweet[i], start_pred, end_pred, offsets[i])
            predictions.append(pred)

    # Submission. As above, collapse '...' before '..' so both rules can fire.
    sub_df = pd.read_csv('data/sample_submission.csv')
    sub_df['selected_text'] = predictions
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace('!!!!', '!') if len(x.split()) == 1 else x)
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace('...', '.') if len(x.split()) == 1 else x)
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace('..', '.') if len(x.split()) == 1 else x)
    sub_df.to_csv('submission.csv', index=False)
    sub_df.head()
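# `seed_everything(seed)` above is assumed to be the usual determinism helper;
# a minimal sketch under that assumption (the repo's own version may differ):
import random


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True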
def predict(df_test):
    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True

    # Load the five fold checkpoints (model_0.bin .. model_4.bin) into an ensemble.
    models = []
    for i in range(5):
        model = TweetModel(conf=model_config)
        model.to(device)
        model.load_state_dict(torch.load(f"model_{i}.bin"))
        model.eval()
        models.append(model)

    final_output = []
    test_dataset = TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        selected_text=df_test.selected_text.values
    )
    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    jaccards = utils.AverageMeter()
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d["ids"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            targets_start = d["targets_start"].to(device, dtype=torch.long)
            targets_end = d["targets_end"].to(device, dtype=torch.long)
            offsets = d["offsets"].numpy()

            # Average the raw start/end logits over the ensemble.
            outputs_start = 0
            outputs_end = 0
            for model in models:
                o_start, o_end = model(
                    ids=ids, mask=mask, token_type_ids=token_type_ids)
                outputs_start = outputs_start + o_start
                outputs_end = outputs_end + o_end
            outputs_start = outputs_start / len(models)
            outputs_end = outputs_end / len(models)

            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

            jaccard_scores = []
            for px, tweet in enumerate(orig_tweet):
                selected_tweet = orig_selected[px]
                tweet_sentiment = sentiment[px]
                jaccard_score, output_sentence = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selected_tweet,
                    sentiment_val=tweet_sentiment,
                    idx_start=np.argmax(outputs_start[px, :]),
                    idx_end=np.argmax(outputs_end[px, :]),
                    offsets=offsets[px]
                )
                jaccard_scores.append(jaccard_score)
                final_output.append(output_sentence)
            jaccards.update(np.mean(jaccard_scores), ids.size(0))

    return final_output, jaccards.avg
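# `calculate_jaccard_score` (not shown in this file) is assumed to decode the
# predicted span back into a string and score it with the competition's
# word-level Jaccard. For reference, the metric itself:
def jaccard(str1, str2):
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))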
def main(args, mode):
    config = Config(
        train_dir='/mfs/renxiangyuan/tweets/data/train_folds.csv',  # original data
        # train_dir='/mfs/renxiangyuan/tweets/data/train_folds_extra.csv',  # adds extra sentiment-classification data
        model_save_dir=f'/mfs/renxiangyuan/tweets/output/{args.model_type}-5-fold-ak',
        # model_save_dir=f'/mfs/renxiangyuan/tweets/output/shuffle/{args.model_type}-5-fold-ak',
        model_type=args.model_type,
        batch_size=args.bs,
        seed=args.seed,
        lr=args.lr * 1e-5,
        max_seq_length=args.max_seq_length,
        num_hidden_layers=args.num_hidden_layers,
        cat_n_layers=args.cat_n_layers,
        froze_n_layers=args.froze_n_layers,
        # conv_head=True,
        # eps=args.eps,
        shuffle_seed=args.shuffle_seed,
        init_seed=args.init_seed,
        epochs=args.epochs,  # defaults to epochs=3
        warmup_samples=args.warmup_samples,
        # frozen_warmup=False,
        warmup_scheduler=args.scheduler,
        mask_pad_loss=args.mask_pad_loss,
        smooth=args.smooth,
        # fp16=False,
        io_loss_ratio=args.io_loss_ratio,
        io_loss_type=args.io_loss_type,
        # multi_sent_loss_ratio=0,
        # clean_data=True,  # the released models were trained with clean_data=False
    )
    config.print_info()
    set_seed(config.seed)

    # Training
    if "train" in mode:
        os.makedirs(config.MODEL_SAVE_DIR, exist_ok=True)
        jaccard_scores = []
        for i in args.train_folds:
            scores_i = train(fold=i, config=config)
            jaccard_scores.append(scores_i)
            # if i == 0 and max(scores_i) < 0.705:
            #     print("Fold 0 Too Weak, Early Stop")
            #     break
        for i, res_i in enumerate(jaccard_scores):
            print(i, res_i)
        print("mean", np.mean([max(scores) for scores in jaccard_scores]))
        for i in range(1, config.EPOCHS):
            print(f"\tEpoch{i+1}: ",
                  np.mean([scores[i] for scores in jaccard_scores]))
        config.print_info()

    # Testing
    if "test" in mode:
        model_paths = [
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_11shufflesd/model_0_epoch_2.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_3shufflesd/model_1_epoch_3.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_18shufflesd/model_2_epoch_3.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_13shufflesd/model_3_epoch_2.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_19shufflesd/model_4_epoch_3.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_0_epoch_2.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_1_epoch_2.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_2_epoch_3.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_3_epoch_3.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_4_epoch_3.pth",
        ]
        ensemble_infer(model_paths, config)
        # ensemble_infer(model_paths=None, config=config)

    # Evaluation
    if "evaluate" in mode:
        device = torch.device("cuda")
        model = TweetModel(conf=config.model_config, config=config)
        model.to(device)
        res = [[] for _ in range(5)]
        for fold in range(5):
            dfx = pd.read_csv(config.TRAINING_FILE)
            df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)
            valid_dataset = TweetDataset(
                tweet=df_valid.text.values,
                sentiment=df_valid.sentiment.values,
                selected_text=df_valid.selected_text.values,
                config=config,
            )
            valid_data_loader = DataLoader(
                valid_dataset,
                batch_size=config.VALID_BATCH_SIZE,
                num_workers=8)
            # Evaluate the epoch-2 .. epoch-EPOCHS checkpoints (epoch 1 is skipped).
            for ep in range(1, config.EPOCHS):
                state_dict_dir = os.path.join(
                    config.MODEL_SAVE_DIR, f"model_{fold}_epoch_{ep+1}.pth")
                print(state_dict_dir)
                model.load_state_dict(torch.load(state_dict_dir))
                model.eval()
                jaccards = eval_fn(valid_data_loader, model, device, config)
                print(jaccards)
                res[fold].append(jaccards)
        for i, res_i in enumerate(res):
            print(i, res_i)
        print("mean", np.mean([max(scores) for scores in res]))
        # res[fold][i] holds the epoch-(i+2) checkpoint score, since evaluation
        # starts from epoch 2.
        for i in range(2):
            print(f"\tEpoch{i + 2}: ", np.mean([scores[i] for scores in res]))
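# How `main(args, mode)` is driven is not shown in this file. A hypothetical
# entry point covering every `args.*` attribute accessed above; the argument
# names are taken from those accesses, while the defaults are guesses inferred
# from the checkpoint paths (4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_*).
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_type", default="roberta-squad")
    parser.add_argument("--bs", type=int, default=32)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--lr", type=float, default=4)  # scaled by 1e-5 in main()
    parser.add_argument("--max_seq_length", type=int, default=128)
    parser.add_argument("--num_hidden_layers", type=int, default=12)
    parser.add_argument("--cat_n_layers", type=int, default=1)
    parser.add_argument("--froze_n_layers", type=int, default=-1)
    parser.add_argument("--shuffle_seed", type=int, default=None)
    parser.add_argument("--init_seed", type=int, default=None)
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--warmup_samples", type=int, default=0)
    parser.add_argument("--scheduler", default="linear")
    parser.add_argument("--mask_pad_loss", action="store_true")
    parser.add_argument("--smooth", type=float, default=0.0)
    parser.add_argument("--io_loss_ratio", type=float, default=0.0)
    parser.add_argument("--io_loss_type", default="ce")
    parser.add_argument("--train_folds", type=int, nargs="+",
                        default=[0, 1, 2, 3, 4])
    args = parser.parse_args()
    main(args, mode=["train", "evaluate"])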