def loadData(args):
    '''Load the train/valid/test sequences, build the vocabulary, optionally load
    pre-trained embeddings, and attach the resulting loaders to `args`.'''
    __SequenceDataset = data.CharSequence if args.chars else data.TokenSequence
    print(__SequenceDataset.__name__)

    index = Index(initwords=['<unk>'], unkindex=0)
    train_ = __SequenceDataset(args.data, subset='train.txt', index=index, seqlen=args.bptt, skip=args.bptt).to(args.device)
    index.freeze(silent=True).tofile(os.path.join(args.data, 'vocab_chars.txt' if args.chars else 'vocab_tokens.txt'))
    test_ = __SequenceDataset(args.data, subset='test.txt', index=index, seqlen=args.bptt, skip=args.bptt).to(args.device)
    valid_ = __SequenceDataset(args.data, subset='valid.txt', index=index, seqlen=args.bptt, skip=args.bptt).to(args.device)

    # load pre-trained embedding
    if args.init_weights:
        # determine the type of embedding by checking its suffix
        if args.init_weights.endswith('bin'):
            preemb = FastTextEmbedding(args.init_weights, normalize=True).load()
            if args.emsize != preemb.dim():
                raise ValueError('emsize must match embedding size. Expected %d but got %d.' % (args.emsize, preemb.dim()))
        elif args.init_weights.endswith('txt'):
            preemb = TextEmbedding(args.init_weights, vectordim=args.emsize).load(normalize=True)
        elif args.init_weights.endswith('rand'):
            preemb = RandomEmbedding(vectordim=args.emsize)
        else:
            raise ValueError('Type of embedding cannot be inferred.')
        preemb = Embedding.filteredEmbedding(index.vocabulary(), preemb, fillmissing=True)
        preemb_weights = torch.Tensor(preemb.weights)
    else:
        preemb_weights = None

    eval_batch_size = 10

    __ItemSampler = RandomSampler if args.shuffle_samples else SequentialSampler
    __BatchSampler = BatchSampler if args.sequential_sampling else EvenlyDistributingSampler

    train_loader = torch.utils.data.DataLoader(
        train_,
        batch_sampler=ShufflingBatchSampler(
            __BatchSampler(__ItemSampler(train_), batch_size=args.batch_size, drop_last=True),
            shuffle=args.shuffle_batches,
            seed=args.seed),
        num_workers=0)
    test_loader = torch.utils.data.DataLoader(
        test_,
        batch_sampler=__BatchSampler(__ItemSampler(test_), batch_size=eval_batch_size, drop_last=True),
        num_workers=0)
    valid_loader = torch.utils.data.DataLoader(
        valid_,
        batch_sampler=__BatchSampler(__ItemSampler(valid_), batch_size=eval_batch_size, drop_last=True),
        num_workers=0)

    print(__ItemSampler.__name__)
    print(__BatchSampler.__name__)
    print('Shuffle training batches: ', args.shuffle_batches)

    # expose everything the training loop needs on the args namespace
    setattr(args, 'index', index)
    setattr(args, 'ntokens', len(index))
    setattr(args, 'trainloader', train_loader)
    setattr(args, 'testloader', test_loader)
    setattr(args, 'validloader', valid_loader)
    setattr(args, 'preembweights', preemb_weights)
    setattr(args, 'eval_batch_size', eval_batch_size)

    return args
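
# Usage sketch (hypothetical, not part of the original script): loadData expects
# an argparse-style namespace that already carries the attributes it reads above
# (data, chars, bptt, device, init_weights, emsize, batch_size, seed, and the
# sampling/shuffling flags). The argument names and defaults below are
# illustrative assumptions, not this project's actual CLI definition.
#
#   import argparse
#   import torch
#
#   parser = argparse.ArgumentParser()
#   parser.add_argument('--data', type=str, default='./data')
#   parser.add_argument('--chars', action='store_true')
#   parser.add_argument('--bptt', type=int, default=35)
#   parser.add_argument('--batch_size', type=int, default=20)
#   parser.add_argument('--emsize', type=int, default=200)
#   parser.add_argument('--init_weights', type=str, default='')
#   parser.add_argument('--shuffle_samples', action='store_true')
#   parser.add_argument('--sequential_sampling', action='store_true')
#   parser.add_argument('--shuffle_batches', action='store_true')
#   parser.add_argument('--seed', type=int, default=1111)
#   parser.add_argument('--cuda', action='store_true')
#   args = parser.parse_args()
#   args.device = torch.device('cuda' if args.cuda else 'cpu')
#
#   args = loadData(args)
#   # afterwards args also provides: index, ntokens, trainloader, testloader,
#   # validloader, preembweights and eval_batch_size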
device = torch.device("cuda" if args.cuda else "cpu")

###############################################################################
# Load data
###############################################################################

__SequenceDataset = CharSequence if args.chars else TokenSequence
print(__SequenceDataset.__name__)

index = Index(initwords=['<unk>'], unkindex=0)
train_ = __SequenceDataset(args.data, subset='train.txt', index=index, seqlen=args.bptt, skip=args.bptt).to(device)
index.freeze(silent=True).tofile(
    os.path.join(args.data, 'vocab_chars.txt' if args.chars else 'vocab_tokens.txt'))
test_ = __SequenceDataset(args.data, subset='test.txt', index=index, seqlen=args.bptt, skip=args.bptt).to(device)
valid_ = __SequenceDataset(args.data, subset='valid.txt', index=index, seqlen=args.bptt, skip=args.bptt).to(device)

# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐