submission = pd.read_csv(
    os.path.join(
        args.data_path,
        "sample_submission_toy.csv" if args.toy in ["True", "toy"] else "sample_submission.csv",
    )
)
tokenizer = BertTokenizer.from_pretrained(
    args.bert_model, do_lower_case=("uncased" in args.bert_model)
)
test_set = get_test_set(args, test_df, tokenizer)
test_loader = DataLoader(
    test_set,
    batch_sampler=BucketingSampler(
        test_set.lengths,
        batch_size=args.batch_size,
        maxlen=args.max_sequence_length,
    ),
    collate_fn=make_collate_fn(),
)

for fold, train_set, valid_set, train_fold_df, val_fold_df in cross_validation_split(
    args, train_df, tokenizer
):
    print()
    print("Fold:", fold)
    print()

    valid_loader = DataLoader(
        valid_set,
        batch_sampler=BucketingSampler(
            valid_set.lengths,
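# make_collate_fn is referenced above but not defined in these snippets. What
# follows is a minimal sketch of what such a factory might return, assuming
# each dataset item is a (token_ids, labels) pair that needs dynamic padding
# to the longest sequence in the batch; the item layout and the pad_token_id
# default are assumptions, not the original implementation.
import torch

def make_collate_fn(pad_token_id=0):
    def collate(batch):
        token_ids, labels = zip(*batch)  # assumed item layout
        max_len = max(len(ids) for ids in token_ids)
        input_ids = torch.full((len(batch), max_len), pad_token_id, dtype=torch.long)
        attention_mask = torch.zeros(len(batch), max_len, dtype=torch.long)
        for i, ids in enumerate(token_ids):
            input_ids[i, : len(ids)] = torch.as_tensor(ids, dtype=torch.long)
            attention_mask[i, : len(ids)] = 1
        return input_ids, attention_mask, torch.tensor(labels, dtype=torch.float)
    return collate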
## load the data
train_df = pd.read_csv(os.path.join(args.data_path, "train.csv"))
test_df = pd.read_csv(os.path.join(args.data_path, "test.csv"))
submission = pd.read_csv(os.path.join(args.data_path, "sample_submission.csv"))

tokenizer = BertTokenizer.from_pretrained(
    args.bert_model, do_lower_case=("uncased" in args.bert_model)
)
test_dataset = get_test_dataset(args, test_df, tokenizer)
test_loader = DataLoader(
    test_dataset,
    # for optimization: sum(per_sample_length) = batch_size * max_length
    batch_sampler=BucketingSampler(
        test_dataset.lengths,
        batch_size=args.batch_size,
        maxlen=args.max_sequence_length,
    ),
    collate_fn=make_collate_fn(),
)

for fold, train_dataset, valid_dataset, train_fold_df, val_fold_df in (
    cross_validation_split(args, train_df, tokenizer)
):
    print()
    print("Fold:", fold)
    print()

    valid_loader = DataLoader(
        valid_dataset,
        batch_sampler=BucketingSampler(
            valid_dataset.lengths,
            batch_size=args.batch_size,
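# BucketingSampler is also not shown here. Going by the comment above
# (sum(per_sample_length) = batch_size * max_length), one plausible sketch
# sorts indices by length and packs them into batches under a fixed token
# budget, which keeps padding waste low. This is an assumption about the
# original class; note that the speech snippet further down constructs a
# BucketingSampler with a different signature (dataset, batch_size).
import random
from torch.utils.data import Sampler

class BucketingSampler(Sampler):
    def __init__(self, lengths, batch_size, maxlen, shuffle=True):
        self.shuffle = shuffle
        budget = batch_size * maxlen  # token budget per batch
        order = sorted(range(len(lengths)), key=lengths.__getitem__)
        self.batches, batch, total = [], [], 0
        for i in order:
            if batch and total + lengths[i] > budget:
                self.batches.append(batch)
                batch, total = [], 0
            batch.append(i)
            total += lengths[i]
        if batch:
            self.batches.append(batch)

    def __iter__(self):
        if self.shuffle:
            random.shuffle(self.batches)  # shuffle batch order, keep the packing
        return iter(self.batches)

    def __len__(self):
        return len(self.batches)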
torch.cuda.manual_seed_all(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)

SOS_token = 70
EOS_token = 71
PAD_token = 0

device = torch.device('cuda' if args.cuda else 'cpu')
batch_size = args.batch_size

train_dataset = SpectrogramDataset(
    dataset_path=args.dataset_path,
    data_list=args.rootpath + "train_list.csv",
)
train_sampler = BucketingSampler(train_dataset, batch_size=batch_size)
train_loader = AudioDataLoader(
    train_dataset, num_workers=4, batch_sampler=train_sampler
)

test_dataset = SpectrogramDataset(
    dataset_path=args.dataset_path,
    data_list=args.rootpath + "valid_list.csv",
)
test_loader = AudioDataLoader(test_dataset, num_workers=4, batch_size=1)

input_size = 80
enc = EncoderRNN(
    input_size,
    args.encoder_size,
    n_layers=args.encoder_layers,
    dropout_p=args.dropout_rate,
    bidirectional=args.bidirectional,
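# AudioDataLoader is not defined in this snippet either. In deepspeech-style
# codebases it is usually a plain DataLoader with a collate_fn that zero-pads
# variable-length spectrograms and transcripts; the sketch below assumes each
# dataset item is a (spectrogram, transcript) pair with the spectrogram shaped
# (freq, time). The item layout and return format are assumptions.
import torch
from torch.utils.data import DataLoader

PAD_token = 0  # matches the PAD_token defined above

def _audio_collate(batch):
    # sort longest-first so downstream pack_padded_sequence-style code works
    batch = sorted(batch, key=lambda item: item[0].size(1), reverse=True)
    freq, max_frames = batch[0][0].size(0), batch[0][0].size(1)
    max_target = max(len(item[1]) for item in batch)
    inputs = torch.zeros(len(batch), 1, freq, max_frames)
    targets = torch.full((len(batch), max_target), PAD_token, dtype=torch.long)
    input_lengths, target_lengths = [], []
    for i, (spect, transcript) in enumerate(batch):
        inputs[i, 0, :, : spect.size(1)] = spect
        targets[i, : len(transcript)] = torch.as_tensor(transcript, dtype=torch.long)
        input_lengths.append(spect.size(1))
        target_lengths.append(len(transcript))
    return inputs, targets, input_lengths, target_lengths

class AudioDataLoader(DataLoader):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, collate_fn=_audio_collate, **kwargs)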
    max_question_length=getattr(config, "max_question_length", 260),
    max_answer_length=getattr(config, "max_answer_length", 210),
    head_tail=getattr(config, "head_tail", True),
    use_folds=None,
)

tokenizer = BertTokenizer.from_pretrained(
    original_args.bert_model,
    do_lower_case=("uncased" in original_args.bert_model),
)
test_set = get_test_set(original_args, test_df, tokenizer)
test_loader = DataLoader(
    test_set,
    batch_sampler=BucketingSampler(
        test_set.lengths,
        batch_size=original_args.batch_size,
        maxlen=original_args.max_sequence_length,
    ),
    collate_fn=make_collate_fn(),
)

if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

for fold in range(config.folds):
    print()
    print("Fold:", fold)
    print()

    fold_checkpoints = os.path.join(experiment.checkpoints,
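# The fold loop above is cut off at the checkpoint path, so the rest is not
# shown. In inference scripts of this shape, what typically follows is loading
# each fold's weights and averaging per-fold predictions over the test set.
# The sketch below uses hypothetical names (get_model, fold_checkpoint_path);
# neither appears in the snippet above.
import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fold_predictions = []
for fold in range(config.folds):
    model = get_model(original_args)  # hypothetical model factory
    state = torch.load(fold_checkpoint_path, map_location="cpu")  # path assumed
    model.load_state_dict(state)
    model.to(device).eval()

    outputs = []
    with torch.no_grad():
        for input_ids, attention_mask, _ in test_loader:
            logits = model(input_ids.to(device), attention_mask.to(device))
            outputs.append(torch.sigmoid(logits).cpu().numpy())
    # if the batch sampler reorders samples, map predictions back to the
    # original row order here before stacking
    fold_predictions.append(np.concatenate(outputs))

test_predictions = np.mean(fold_predictions, axis=0)  # simple fold average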