コード例 #1
0
def test_it(MODEL_PATH='roberta-base'):
    """Ensemble inference over every checkpoint under 'type/<model_type>/'.

    Each subdirectory of 'type/' names a model type; every file inside it
    is loaded as one ensemble member. Softmaxed start/end logits are
    averaged across the ensemble, spans are decoded per tweet, and the
    result is written to 'submission.csv'.

    Parameters:
        MODEL_PATH: tokenizer identifier forwarded to get_test_loader
            (default 'roberta-base').
    """
    models = []
    for model_type in os.listdir('type'):
        type_dir = os.path.join('type', model_type)
        for model_file in os.listdir(type_dir):
            model = TweetModel(MODEL_PATH=model_type)
            # Keep the ensemble on CPU between batches to bound GPU memory;
            # each member is moved onto the GPU only while it runs (below).
            model.cpu()
            # map_location='cpu': checkpoints saved on GPU must land on the
            # CPU-resident model instead of re-allocating GPU tensors.
            model.load_state_dict(
                torch.load(os.path.join(type_dir, model_file),
                           map_location='cpu'))
            model.eval()
            models.append(model)

    test_df = pd.read_csv('data/test.csv')
    test_df['text'] = test_df['text'].astype(str)
    test_loader = get_test_loader(test_df, MODEL_PATH=MODEL_PATH)
    predictions = []

    for data in test_loader:
        ids = data['ids'].cuda()
        masks = data['masks'].cuda()
        tweet = data['tweet']
        offsets = data['offsets'].numpy()
        sentiment = data['sentiment']

        start_logits = []
        end_logits = []
        for model in models:
            with torch.no_grad():
                model.cuda()
                output = model(ids, masks)
                start_logits.append(
                    torch.softmax(output[0], dim=1).cpu().detach().numpy())
                end_logits.append(
                    torch.softmax(output[1], dim=1).cpu().detach().numpy())
                model.cpu()
        # Average probabilities over the ensemble members.
        start_logits = np.mean(start_logits, axis=0)
        end_logits = np.mean(end_logits, axis=0)
        for i in range(len(ids)):
            start_pred = np.argmax(start_logits[i])
            end_pred = np.argmax(end_logits[i])
            sentiment_val = sentiment[i]
            original_tweet = tweet[i]
            if start_pred > end_pred:
                # Inconsistent span prediction: fall back to the whole tweet.
                pred = original_tweet
            else:
                pred = get_selected_text(tweet[i], start_pred, end_pred,
                                         offsets[i])
            if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
                # Neutral or very short tweets: whole tweet scores best.
                pred = original_tweet
            predictions.append(pred)

    def _postprocess(text):
        """Collapse noisy punctuation in single-token predictions.

        BUG FIX: '...' must be replaced before '..'. The original chain
        replaced '..' first, which turned '...' into '..' and made the
        three-dot rule unreachable.
        """
        if len(text.split()) != 1:
            return text
        return (text.replace('!!!!', '!')
                    .replace('...', '.')
                    .replace('..', '.'))

    sub_df = pd.read_csv('data/sample_submission.csv')
    sub_df['selected_text'] = predictions
    sub_df['selected_text'] = sub_df['selected_text'].apply(_postprocess)
    sub_df.to_csv('submission.csv', index=False)
    sub_df.head()
コード例 #2
0
ファイル: main.py プロジェクト: sangwon79/nlp_final_project
def main():
    """Train 10-fold RoBERTa span-selection models, run ensemble inference
    on the test set, and write 'submission.csv'.

    Pipeline: stratified 10-fold training (checkpoints saved per fold),
    then ensemble inference averaging softmaxed start/end logits over all
    fold models, then punctuation post-processing of the submission.
    """
    seed = 42
    seed_everything(seed)

    num_epochs = 3
    batch_size = 32
    # Stratify on sentiment so every fold sees the same label balance.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

    train_df = pd.read_csv('data/train.csv')
    train_df['text'] = train_df['text'].astype(str)
    train_df['selected_text'] = train_df['selected_text'].astype(str)

    for fold, (train_idx,
               val_idx) in enumerate(skf.split(train_df, train_df.sentiment),
                                     start=1):
        print(f'Fold: {fold}')

        model = TweetModel()
        optimizer = optim.AdamW(model.parameters(),
                                lr=3e-5,
                                betas=(0.9, 0.999))
        criterion = loss_fn
        dataloaders_dict = get_train_val_loaders(train_df, train_idx, val_idx,
                                                 batch_size)

        train_model(model, dataloaders_dict, criterion, optimizer, num_epochs,
                    f'roberta_fold{fold}.pth')

    # inference

    test_df = pd.read_csv('data/test.csv')
    test_df['text'] = test_df['text'].astype(str)
    test_loader = get_test_loader(test_df)
    predictions = []
    models = []

    for fold in range(skf.n_splits):
        model = TweetModel()
        model.cuda()
        model.load_state_dict(torch.load(f'roberta_fold{fold+1}.pth'))
        model.eval()
        models.append(model)

    for data in test_loader:
        ids = data['ids'].cuda()
        masks = data['masks'].cuda()
        tweet = data['tweet']
        offsets = data['offsets'].numpy()

        start_logits = []
        end_logits = []
        for model in models:
            with torch.no_grad():
                output = model(ids, masks)
                start_logits.append(
                    torch.softmax(output[0], dim=1).cpu().detach().numpy())
                end_logits.append(
                    torch.softmax(output[1], dim=1).cpu().detach().numpy())

        # Average probabilities over the fold models.
        start_logits = np.mean(start_logits, axis=0)
        end_logits = np.mean(end_logits, axis=0)
        for i in range(len(ids)):
            start_pred = np.argmax(start_logits[i])
            end_pred = np.argmax(end_logits[i])
            if start_pred > end_pred:
                # Inconsistent span prediction: fall back to the whole tweet.
                pred = tweet[i]
            else:
                pred = get_selected_text(tweet[i], start_pred, end_pred,
                                         offsets[i])
            predictions.append(pred)

    #submission

    def _postprocess(text):
        """Collapse noisy punctuation in single-token predictions.

        BUG FIX: '...' must be replaced before '..'. The original chain
        replaced '..' first, which turned '...' into '..' and made the
        three-dot rule unreachable.
        """
        if len(text.split()) != 1:
            return text
        return (text.replace('!!!!', '!')
                    .replace('...', '.')
                    .replace('..', '.'))

    sub_df = pd.read_csv('data/sample_submission.csv')
    sub_df['selected_text'] = predictions
    sub_df['selected_text'] = sub_df['selected_text'].apply(_postprocess)
    sub_df.to_csv('submission.csv', index=False)
    sub_df.head()
コード例 #3
0
ファイル: test.py プロジェクト: zelcookie/madmo_sber
def predict(df_test):
    """Ensemble-predict selected-text spans for df_test with 5 fold models.

    Loads checkpoints 'model_0.bin'..'model_4.bin' (previously five
    copy-pasted load blocks, now one loop), averages the five models'
    raw start/end logits per batch, softmaxes the average, decodes each
    span, and tracks the mean Jaccard score against the ground truth.

    Also removes the dead targets_start/targets_end reads: the original
    moved them to the device but never used them.

    Args:
        df_test: DataFrame with 'text', 'sentiment' and 'selected_text'
            columns.

    Returns:
        (final_output, avg_jaccard): decoded sentences in loader order and
        the running-average Jaccard over the whole set.
    """
    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True

    # One ensemble member per saved fold checkpoint.
    models = []
    for fold in range(5):
        model = TweetModel(conf=model_config)
        model.to(device)
        model.load_state_dict(torch.load(f"model_{fold}.bin"))
        model.eval()
        models.append(model)

    final_output = []

    test_dataset = TweetDataset(
            tweet=df_test.text.values,
            sentiment=df_test.sentiment.values,
            selected_text=df_test.selected_text.values
    )

    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )
    jaccards = utils.AverageMeter()
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d["ids"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            offsets = d["offsets"].numpy()

            # Sum each member's raw logits, then divide: identical to the
            # original explicit (o1 + ... + o5) / 5.
            outputs_start = None
            outputs_end = None
            for model in models:
                o_start, o_end = model(
                    ids=ids,
                    mask=mask,
                    token_type_ids=token_type_ids
                )
                if outputs_start is None:
                    outputs_start, outputs_end = o_start, o_end
                else:
                    outputs_start = outputs_start + o_start
                    outputs_end = outputs_end + o_end
            outputs_start = outputs_start / len(models)
            outputs_end = outputs_end / len(models)

            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

            jaccard_scores = []
            for px, tweet in enumerate(orig_tweet):
                selected_tweet = orig_selected[px]
                tweet_sentiment = sentiment[px]
                jaccard_score, output_sentence = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selected_tweet,
                    sentiment_val=tweet_sentiment,
                    idx_start=np.argmax(outputs_start[px, :]),
                    idx_end=np.argmax(outputs_end[px, :]),
                    offsets=offsets[px]
                )
                jaccard_scores.append(jaccard_score)
                final_output.append(output_sentence)
            jaccards.update(np.mean(jaccard_scores), ids.size(0))
    return final_output, jaccards.avg
コード例 #4
0
def main(args, mode):
    """Experiment driver for the tweet-extraction models.

    Builds a Config from CLI args, then runs the phases selected by
    `mode` (membership-tested, so e.g. "train_test" triggers both):
      - "train": train the folds in args.train_folds, print per-fold and
        per-epoch mean Jaccard scores.
      - "test": ensemble inference over a hard-coded checkpoint list.
      - "evaluate": re-score saved per-fold/per-epoch checkpoints on
        their validation splits.

    Args:
        args: parsed CLI namespace (model_type, bs, seed, lr, ...).
        mode: string or collection naming the phases to run.
    """
    config = Config(
        train_dir='/mfs/renxiangyuan/tweets/data/train_folds.csv',  # original data
        # train_dir='/mfs/renxiangyuan/tweets/data/train_folds_extra.csv',  # adds extra sentiment-classification data
        model_save_dir=
        f'/mfs/renxiangyuan/tweets/output/{args.model_type}-5-fold-ak',
        # model_save_dir=f'/mfs/renxiangyuan/tweets/output/shuffle/{args.model_type}-5-fold-ak',
        model_type=args.model_type,
        batch_size=args.bs,
        seed=args.seed,
        lr=args.lr * 1e-5,  # args.lr is scaled here, i.e. given in units of 1e-5
        max_seq_length=args.max_seq_length,
        num_hidden_layers=args.num_hidden_layers,
        cat_n_layers=args.cat_n_layers,
        froze_n_layers=args.froze_n_layers,

        # conv_head=True,
        # eps=args.eps,
        shuffle_seed=args.shuffle_seed,
        init_seed=args.init_seed,
        epochs=args.epochs,  # default epochs=3
        warmup_samples=args.warmup_samples,
        # frozen_warmup=False,
        warmup_scheduler=args.scheduler,
        mask_pad_loss=args.mask_pad_loss,
        smooth=args.smooth,
        # fp16=False,
        io_loss_ratio=args.io_loss_ratio,
        io_loss_type=args.io_loss_type,
        # multi_sent_loss_ratio=0,
        # clean_data=True,  # the saved models used clean_data=False
    )

    config.print_info()

    set_seed(config.seed)

    # Training phase
    if "train" in mode:
        os.makedirs(config.MODEL_SAVE_DIR, exist_ok=True)
        jaccard_scores = []
        for i in args.train_folds:
            scores_i = train(fold=i, config=config)
            jaccard_scores.append(scores_i)
            # if i == 0 and max(scores_i) < 0.705:
            #     print("Fold 0 Too Weak, Early Stop")
            #     break
        for i, res_i in enumerate(jaccard_scores):
            print(i, res_i)
        # Best epoch per fold, averaged over the trained folds.
        print("mean", np.mean([max(scores) for scores in jaccard_scores]))
        for i in range(1, config.EPOCHS):
            print(f"\tEpoch{i+1}: ",
                  np.mean([scores[i] for scores in jaccard_scores]))
        config.print_info()

    # Test phase: ensemble inference over hand-picked checkpoints
    if "test" in mode:
        model_paths = [
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_11shufflesd/model_0_epoch_2.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_3shufflesd/model_1_epoch_3.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_18shufflesd/model_2_epoch_3.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_13shufflesd/model_3_epoch_2.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_19shufflesd/model_4_epoch_3.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_0_epoch_2.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_1_epoch_2.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_2_epoch_3.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_3_epoch_3.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_4_epoch_3.pth",
        ]
        ensemble_infer(model_paths, config)
        # ensemble_infer(model_paths=None, config=config)

    # Evaluation phase: re-score saved checkpoints on validation folds
    if "evaluate" in mode:
        device = torch.device("cuda")
        model = TweetModel(conf=config.model_config, config=config)
        model.to(device)
        res = [[] for _ in range(5)]
        for fold in range(5):
            dfx = pd.read_csv(config.TRAINING_FILE)
            df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

            valid_dataset = TweetDataset(
                tweet=df_valid.text.values,
                sentiment=df_valid.sentiment.values,
                selected_text=df_valid.selected_text.values,
                config=config,
            )

            valid_data_loader = DataLoader(valid_dataset,
                                           batch_size=config.VALID_BATCH_SIZE,
                                           num_workers=8)

            # Epochs 2..EPOCHS: checkpoints are named model_{fold}_epoch_{ep+1}.pth
            for ep in range(1, config.EPOCHS):
                state_dict_dir = os.path.join(
                    config.MODEL_SAVE_DIR, f"model_{fold}_epoch_{ep+1}.pth")
                print(state_dict_dir)
                model.load_state_dict(torch.load(state_dict_dir))
                model.eval()

                jaccards = eval_fn(valid_data_loader, model, device, config)
                print(jaccards)
                res[fold].append(jaccards)

        for i, res_i in enumerate(res):
            print(i, res_i)
        print("mean", np.mean([max(scores) for scores in res]))

        # NOTE(review): hard-coded range(2) assumes EPOCHS == 3 (two scored
        # epochs per fold); keep in sync with the epoch loop above.
        for i in range(2):
            print(f"\tEpoch{i + 1}: ", np.mean([scores[i] for scores in res]))