Example no. 1

submission = pd.read_csv(
    os.path.join(
        args.data_path,
        "sample_submission_toy.csv"
        if args.toy in ["True", "toy"] else "sample_submission.csv",
    ))

tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                          do_lower_case=("uncased"
                                                         in args.bert_model))

test_set = get_test_set(args, test_df, tokenizer)
test_loader = DataLoader(
    test_set,
    batch_sampler=BucketingSampler(test_set.lengths,
                                   batch_size=args.batch_size,
                                   maxlen=args.max_sequence_length),
    collate_fn=make_collate_fn(),
)
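
None of the examples show make_collate_fn; a minimal sketch of what such a padding collate could look like, assuming each dataset item is a (token_ids, target) tensor pair — the item layout and pad id are assumptions, not taken from these snippets:

import torch

def make_collate_fn(pad_token_id=0):
    # Pads every sequence in the batch to the longest one, then stacks.
    def collate(batch):
        seqs, targets = zip(*batch)
        max_len = max(seq.size(0) for seq in seqs)
        padded = torch.full((len(seqs), max_len), pad_token_id, dtype=torch.long)
        for i, seq in enumerate(seqs):
            padded[i, :seq.size(0)] = seq
        return padded, torch.stack(targets)
    return collate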

for fold, train_set, valid_set, train_fold_df, val_fold_df in cross_validation_split(
        args, train_df, tokenizer):

    print()
    print("Fold:", fold)
    print()

    valid_loader = DataLoader(
        valid_set,
        batch_sampler=BucketingSampler(
            valid_set.lengths,
            batch_size=args.batch_size,
            maxlen=args.max_sequence_length),
        collate_fn=make_collate_fn(),
    )

Example no. 2

## load the data
train_df = pd.read_csv(os.path.join(args.data_path, "train.csv"))
test_df = pd.read_csv(os.path.join(args.data_path, "test.csv"))
submission = pd.read_csv(os.path.join(args.data_path, "sample_submission.csv"))

tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                          do_lower_case=("uncased"
                                                         in args.bert_model))

test_dataset = get_test_dataset(args, test_df, tokenizer)

test_loader = DataLoader(
    test_dataset,
    batch_sampler=BucketingSampler(
        # for optimization: sum(per_sample_length) ≈ batch_size * max_length per batch
        test_dataset.lengths,
        batch_size=args.batch_size,
        maxlen=args.max_sequence_length),
    collate_fn=make_collate_fn(),
)
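
The comment above pins down the sampler's contract: each batch's summed token count stays near batch_size * maxlen, so padding waste is roughly constant per batch. A minimal sketch of a sampler with that behavior — an illustration under assumptions, not the implementation these examples import:

import random
from torch.utils.data import Sampler

class BucketingSampler(Sampler):
    """Yields lists of indices whose summed sequence length fits a token budget."""

    def __init__(self, lengths, batch_size, maxlen, shuffle=False):
        self.shuffle = shuffle
        budget = batch_size * maxlen  # token budget per batch
        # Sort by length so each batch holds similarly sized samples and
        # per-batch padding stays small.
        order = sorted(range(len(lengths)), key=lengths.__getitem__)
        self.batches, batch, total = [], [], 0
        for i in order:
            if batch and total + lengths[i] > budget:
                self.batches.append(batch)
                batch, total = [], 0
            batch.append(i)
            total += lengths[i]
        if batch:
            self.batches.append(batch)

    def __iter__(self):
        if self.shuffle:
            random.shuffle(self.batches)
        return iter(self.batches)

    def __len__(self):
        return len(self.batches)

Passed as batch_sampler, this makes DataLoader yield variable-size batches; note that batch_size, shuffle, and drop_last must then be left unset on the DataLoader itself, since they are mutually exclusive with batch_sampler.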

for fold, train_dataset, valid_dataset, train_fold_df, val_fold_df in (
        cross_validation_split(args, train_df, tokenizer)):

    print()
    print("Fold:", fold)
    print()

    valid_loader = DataLoader(
        valid_dataset,
        batch_sampler=BucketingSampler(valid_dataset.lengths,
                                       batch_size=args.batch_size,
                                       maxlen=args.max_sequence_length),
        collate_fn=make_collate_fn(),
    )

Example no. 3

    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    SOS_token = 70
    EOS_token = 71
    PAD_token = 0

    device = torch.device('cuda' if args.cuda else 'cpu')

    batch_size = args.batch_size

    train_dataset = SpectrogramDataset(dataset_path=args.dataset_path,
                                       data_list=args.rootpath +
                                       "train_list.csv")
    train_sampler = BucketingSampler(train_dataset, batch_size=batch_size)
    train_loader = AudioDataLoader(train_dataset,
                                   num_workers=4,
                                   batch_sampler=train_sampler)

    test_dataset = SpectrogramDataset(dataset_path=args.dataset_path,
                                      data_list=args.rootpath +
                                      "valid_list.csv")
    test_loader = AudioDataLoader(test_dataset, num_workers=4, batch_size=1)

    input_size = 80
    enc = EncoderRNN(input_size,
                     args.encoder_size,
                     n_layers=args.encoder_layers,
                     dropout_p=args.dropout_rate,
                     bidirectional=args.bidirectional)
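
The SOS_token and EOS_token constants defined above point at the usual seq2seq decoding loop. A greedy-decoding sketch under that assumption — the decoder call signature here is hypothetical, not taken from this snippet:

import torch

SOS_token, EOS_token = 70, 71  # as defined in the snippet above

def greedy_decode(decoder, encoder_outputs, max_len=128):
    # Feed SOS, then repeatedly feed back the argmax token until EOS appears.
    token = torch.tensor([[SOS_token]], device=encoder_outputs.device)
    hidden = None
    result = []
    for _ in range(max_len):
        logits, hidden = decoder(token, hidden, encoder_outputs)  # hypothetical signature
        token = logits.argmax(dim=-1)
        if token.item() == EOS_token:
            break
        result.append(token.item())
    return result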
Example no. 4

    max_question_length=getattr(config, "max_question_length", 260),
    max_answer_length=getattr(config, "max_answer_length", 210),
    head_tail=getattr(config, "head_tail", True),
    use_folds=None,
)

tokenizer = BertTokenizer.from_pretrained(
    original_args.bert_model,
    do_lower_case=("uncased" in original_args.bert_model))

test_set = get_test_set(original_args, test_df, tokenizer)
test_loader = DataLoader(
    test_set,
    batch_sampler=BucketingSampler(
        test_set.lengths,
        batch_size=original_args.batch_size,
        maxlen=original_args.max_sequence_length,
    ),
    collate_fn=make_collate_fn(),
)
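
The per-fold loop below loads one checkpoint per fold and predicts on test_loader. One subtlety when filling the submission: BucketingSampler visits rows out of order, so predictions must be mapped back to the original row order first. A sketch of that pattern, assuming the sampler iterates deterministically; load_fold_model and the submission column layout are hypothetical:

import numpy as np
import torch

# Record the visit order once so bucketed predictions can be restored
# to the original row order.
order = [i for batch in test_loader.batch_sampler for i in batch]

fold_preds = []
for fold in range(config.folds):
    model = load_fold_model(fold)  # hypothetical checkpoint-loading helper
    model.eval()
    out = []
    with torch.no_grad():
        for batch in test_loader:  # batch layout depends on make_collate_fn
            out.append(torch.sigmoid(model(batch)).cpu().numpy())
    preds = np.concatenate(out)
    restored = np.empty_like(preds)
    restored[order] = preds  # undo the bucketed ordering
    fold_preds.append(restored)

# Standard k-fold ensembling: average the fold predictions.
submission.iloc[:, 1:] = np.mean(fold_preds, axis=0)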

if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

for fold in range(config.folds):

    print()
    print("Fold:", fold)
    print()

    fold_checkpoints = os.path.join(experiment.checkpoints,