Example #1
import os

import pandas as pd
from torch.utils.data import DataLoader
from transformers import BertTokenizer, RobertaTokenizer

seed_everything(args.seed)
# load the data
train_df = pd.read_csv(os.path.join(args.data_path, "train.csv"))
test_df = pd.read_csv(os.path.join(args.data_path, "test.csv"))
submission = pd.read_csv(os.path.join(args.data_path, "sample_submission.csv"))

if args.model_type == "bert":
    tokenizer = BertTokenizer.from_pretrained(
        args.bert_model, do_lower_case=("uncased" in args.bert_model))
elif args.model_type == "roberta":
    tokenizer = RobertaTokenizer.from_pretrained(args.bert_model)

test_set = get_test_set(args, test_df, tokenizer)
test_loader = DataLoader(test_set, batch_size=args.batch_size, shuffle=False)

for fold, train_set, valid_set, train_fold_df, val_fold_df in cross_validation_split(
        args, train_df, tokenizer):

    print()
    print("Fold:", fold)
    print()

    valid_loader = DataLoader(valid_set,
                              batch_size=args.batch_size,
                              shuffle=False,
                              drop_last=False)

    fold_checkpoints = os.path.join(experiment.checkpoints,
                                    "fold{}".format(fold))
    fold_predictions = os.path.join(experiment.predictions,
                                    "fold{}".format(fold))
Example #2
test_loader = DataLoader(
    test_set,
    batch_sampler=BucketingSampler(
        test_set.lengths,
        batch_size=original_args.batch_size,
        maxlen=original_args.max_sequence_length,
    ),
    collate_fn=make_collate_fn(),
)

val_dfs = []

for (
        fold,
        train_set,
        valid_set,
        train_fold_df,
        val_fold_df,
) in cross_validation_split(original_args,
                            train_df,
                            tokenizer,
                            ignore_train=True):

    print()
    print("Fold:", fold)
    print()

    valid_loader = DataLoader(
        valid_set,
        batch_sampler=BucketingSampler(
            valid_set.lengths,
            batch_size=original_args.batch_size,
            maxlen=original_args.max_sequence_length,
        ),
        collate_fn=make_collate_fn(),
    )
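
BucketingSampler is project code, not a torch built-in. A minimal sketch of one plausible implementation, assuming it sorts samples by length and packs each batch up to a budget of batch_size * maxlen total tokens (the comment in Example #4 below describes exactly this invariant); the internals here are illustrative, not the project's actual code:

import numpy as np
from torch.utils.data import Sampler


class BucketingSampler(Sampler):
    def __init__(self, lengths, batch_size, maxlen):
        self.lengths = np.asarray(lengths)
        budget = batch_size * maxlen  # token budget per batch
        self.batches, batch, total = [], [], 0
        for idx in np.argsort(self.lengths):  # group similar lengths together
            n = int(self.lengths[idx])
            if batch and total + n > budget:
                self.batches.append(batch)
                batch, total = [], 0
            batch.append(int(idx))
            total += n
        if batch:
            self.batches.append(batch)

    def __iter__(self):
        # yields lists of indices, one list per batch, as DataLoader's
        # batch_sampler argument expects
        return iter(self.batches)

    def __len__(self):
        return len(self.batches)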
Example #3
if args.pseudo_file:
    # optional pseudo-labeled data for self-training
    pseudo_df = pd.read_csv(args.pseudo_file)
else:
    pseudo_df = None

tokenizer = BartTokenizer.from_pretrained(args.bert_model)

test_set = get_test_set(args, test_df, tokenizer)
test_loader = DataLoader(test_set, batch_size=args.batch_size, shuffle=False)

best_val_dfs = []

for fold, (train_set, valid_set, train_fold_df, val_fold_df) in enumerate(
        cross_validation_split(
            args,
            train_df,
            tokenizer,
            pseudo_df=pseudo_df,
            split_pseudo=args.split_pseudo,
        )):

    print()
    print("Fold:", fold)
    print()

    train_loader = DataLoader(
        train_set,
        batch_size=args.batch_size,
        num_workers=args.workers,
        drop_last=True,
        shuffle=True,
    )
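
cross_validation_split is also project code. A minimal sketch of what Example #3's variant might look like, assuming a scikit-learn KFold split in which pseudo-labeled rows are appended to each training fold (get_train_set, get_test_set, args.folds, and the exact pseudo-splitting rule are assumptions; the variant in Examples #1 and #2 additionally yields the fold index itself):

import pandas as pd
from sklearn.model_selection import KFold


def cross_validation_split(args, train_df, tokenizer,
                           pseudo_df=None, split_pseudo=False):
    kfold = KFold(n_splits=args.folds, shuffle=True, random_state=args.seed)
    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_df)):
        train_fold_df = train_df.iloc[train_idx]
        val_fold_df = train_df.iloc[valid_idx]
        if pseudo_df is not None:
            # take every fold-th pseudo row when splitting, else all of them
            extra = pseudo_df.iloc[fold::args.folds] if split_pseudo else pseudo_df
            train_fold_df = pd.concat([train_fold_df, extra], ignore_index=True)
        train_set = get_train_set(args, train_fold_df, tokenizer)
        valid_set = get_test_set(args, val_fold_df, tokenizer)
        yield train_set, valid_set, train_fold_df, val_fold_df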
Example #4
tokenizer = BertTokenizer.from_pretrained(
    args.bert_model, do_lower_case=("uncased" in args.bert_model))

test_dataset = get_test_dataset(args, test_df, tokenizer)

test_loader = DataLoader(
    test_dataset,
    # optimization: sum(per_sample_length) = batch_size * max_length per batch
    batch_sampler=BucketingSampler(
        test_dataset.lengths,
        batch_size=args.batch_size,
        maxlen=args.max_sequence_length),
    collate_fn=make_collate_fn(),
)
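
# make_collate_fn is also project code. A minimal sketch (assumption): it
# returns a collate function that pads each batch to that batch's longest
# sequence, which is why it pairs with BucketingSampler; batches of
# similar-length samples waste little padding. The (tokens, label) item
# layout is likewise an assumption.
import torch


def make_collate_fn(pad_token_id=0):
    def collate(batch):
        tokens, labels = zip(*batch)
        maxlen = max(len(t) for t in tokens)
        padded = torch.full((len(tokens), maxlen), pad_token_id,
                            dtype=torch.long)
        for i, t in enumerate(tokens):
            padded[i, :len(t)] = torch.as_tensor(t, dtype=torch.long)
        return padded, torch.as_tensor(labels)
    return collate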

for fold, train_dataset, valid_dataset, train_fold_df, val_fold_df in (
        cross_validation_split(args, train_df, tokenizer)):

    print()
    print("Fold:", fold)
    print()

    valid_loader = DataLoader(
        valid_dataset,
        batch_sampler=BucketingSampler(valid_dataset.lengths,
                                       batch_size=args.batch_size,
                                       maxlen=args.max_sequence_length),
        collate_fn=make_collate_fn(),
    )

    # output file configuration
    fold_checkpoints = os.path.join(args.checkpoints_path,