def build_dataloaders(self):
    """Build the test dataloader for the char-level chat task.

    Reads data/config locations from ``self.args`` and ``self.model_config``;
    the collate fn is MLM-aware when the auxiliary task is 'MLM'.
    """
    args = self.args
    model_config = self.model_config
    is_mlm = self.model_config.auxiliary_task == 'MLM'

    def collate(batch):
        # Batch assembly needs both vocabularies plus the MLM flag.
        return datasets.generate_batch(
            batch, self.vocab, self.persona_vocab, is_mlm)

    processer = datasets.ChatDataProcesser(
        limit_length=args.limit_example_length,
        max_seq_length=model_config.max_seq_length,
        max_context_size=model_config.max_context_size,
        vocab=self.vocab,
        persona_vocab=self.persona_vocab,
        tokenizer=self.tokenizer)
    dataset = utils.PersonaDataset(
        self.vocab,
        model_config.max_seq_length,
        args.limit_example_length,
        data_path=args.data_path,
        cache_path=args.cache_path,
        data_processer=processer,
        mode='test_char')
    # Evaluation data is never shuffled.
    self.test_iter = DataLoader(
        dataset,
        batch_size=args.batch_size,
        collate_fn=collate,
        shuffle=False)
def build_dataloaders(self):
    """Build the train dataloader, plus an early-stage loader when enabled.

    The early-stage loader is only constructed while
    ``args.n_epochs_early_stage > 0``; the regular train loader is always
    constructed.
    """
    args = self.args

    def collate(batch):
        return datasets.generate_batch(batch, self.pad_idx)

    if args.n_epochs_early_stage > 0:
        early_dataset = datasets.PersonaDataset(
            self.vocab,
            args.max_seq_length,
            data_path=args.data_path,
            cache_path=args.cache_path,
            limit_length=args.limit_example_length,
            mode='early_stage_train')
        self.early_stage_train_iter = DataLoader(
            early_dataset,
            batch_size=args.batch_size,
            collate_fn=collate,
            shuffle=args.shuffle_data)

    train_dataset = datasets.PersonaDataset(
        self.vocab,
        args.max_seq_length,
        data_path=args.data_path,
        cache_path=args.cache_path,
        limit_length=args.limit_example_length,
        mode='train')
    self.train_iter = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        collate_fn=collate,
        shuffle=args.shuffle_data)
def build_dataloaders(self):
    """Build the test dataloader for the char-level chat task."""
    args = self.args
    model_config = self.model_config

    def collate(batch):
        return datasets.generate_batch(batch, self.pad_idx)

    processer = datasets.ChatDataProcesser(
        limit_length=args.limit_example_length,
        max_seq_length=args.max_seq_length,
        max_context_size=model_config.max_context_size)
    dataset = utils.PersonaDataset(
        self.vocab,
        model_config.max_seq_length,
        args.limit_example_length,
        data_path=args.data_path,
        cache_path=args.cache_path,
        data_processer=processer,
        mode='test_char')
    # Evaluation data is never shuffled.
    self.test_iter = DataLoader(
        dataset,
        batch_size=args.batch_size,
        collate_fn=collate,
        shuffle=False)
def build_dataloaders(self):
    """Build train, valid and test dataloaders.

    When ``args.n_epochs_early_stage > 0`` the train loader serves LM
    pre-training ('train_lm' mode with the LM collate fn); otherwise it
    serves the chat task ('train_char'). The valid/test loaders always
    serve the chat task.
    """
    args = self.args
    gb = lambda batch: datasets.generate_batch(batch, self.pad_idx)
    gb_lm = lambda batch: datasets.generate_lm_batch(batch, self.pad_idx)

    def _chat_processer():
        # One-line purpose: chat-task processer used by the chat
        # train/valid/test datasets.
        return datasets.ChatDataProcesser(
            limit_length=args.limit_example_length,
            max_seq_length=args.max_seq_length,
            max_context_size=args.max_context_size)

    if args.n_epochs_early_stage > 0:
        dp = datasets.LMDataProcesser(
            limit_length=args.limit_example_length,
            max_seq_length=args.max_seq_length)
        ds = utils.PersonaDataset(
            self.vocab, args.max_seq_length, args.limit_example_length,
            data_path=args.data_path, cache_path=args.cache_path,
            data_processer=dp, mode='train_lm')
        self.train_iter = DataLoader(
            ds, batch_size=args.batch_size,
            collate_fn=gb_lm, shuffle=True)
    else:
        ds = utils.PersonaDataset(
            self.vocab, args.max_seq_length, args.limit_example_length,
            data_path=args.data_path, cache_path=args.cache_path,
            data_processer=_chat_processer(), mode='train_char')
        self.train_iter = DataLoader(
            ds, batch_size=args.batch_size,
            collate_fn=gb, shuffle=args.shuffle_data)

    self.valid_iter = None
    self.test_iter = None

    # BUG FIX: the valid/test datasets previously reused whatever `dp`
    # the branch above created — an LMDataProcesser in the early-stage
    # branch — even though they are loaded in 'valid_char'/'test_char'
    # mode with the chat collate fn. Always build a chat processer here.
    ds = utils.PersonaDataset(
        self.vocab, args.max_seq_length, args.limit_example_length,
        data_path=args.data_path, cache_path=args.cache_path,
        data_processer=_chat_processer(), mode='valid_char')
    # Evaluation loaders are not shuffled, so metrics are reproducible
    # (consistent with the other trainers in this file).
    self.valid_iter = DataLoader(
        ds, batch_size=args.batch_size,
        collate_fn=gb, shuffle=False)

    ds = utils.PersonaDataset(
        self.vocab, args.max_seq_length, args.limit_example_length,
        data_path=args.data_path, cache_path=args.cache_path,
        data_processer=_chat_processer(), mode='test_char')
    self.test_iter = DataLoader(
        ds, batch_size=args.batch_size,
        collate_fn=gb, shuffle=False)
def build_dataloaders(self):
    """Construct train/valid/test dataloaders for the current stage.

    The train loader serves LM pre-training when
    ``args.n_epochs_early_stage > 0``, otherwise the char-level chat
    task; valid/test always serve the chat task and are unshuffled.
    """
    args = self.args
    is_mlm = self.args.auxiliary_task == 'MLM'

    def collate(batch):
        return datasets.generate_batch(
            batch, self.vocab, self.persona_vocab, is_mlm)

    def collate_lm(batch):
        return datasets.generate_lm_batch(batch, self.vocab, is_mlm)

    def chat_processer():
        # Fresh chat-task processer; built once per dataset below.
        return datasets.ChatDataProcesser(
            limit_length=args.limit_example_length,
            max_seq_length=args.max_seq_length,
            max_context_size=args.max_context_size,
            vocab=self.vocab,
            persona_vocab=self.persona_vocab,
            tokenizer=self.tokenizer)

    if args.n_epochs_early_stage > 0:
        lm_processer = datasets.LMDataProcesser(
            limit_length=args.limit_example_length,
            max_seq_length=args.max_seq_length,
            tokenizer=self.tokenizer)
        train_dataset = utils.PersonaDataset(
            self.vocab, args.max_seq_length, args.limit_example_length,
            data_path=args.data_path, cache_path=args.cache_path,
            data_processer=lm_processer, mode='train_lm')
        self.logger.info('---------------------------------')
        self.logger.info('datasets len: %s' % len(train_dataset))
        self.train_iter = DataLoader(
            train_dataset, batch_size=args.batch_size,
            collate_fn=collate_lm, shuffle=True)
    else:
        train_dataset = utils.PersonaDataset(
            self.vocab, args.max_seq_length, args.limit_example_length,
            data_path=args.data_path, cache_path=args.cache_path,
            data_processer=chat_processer(), mode='train_char')
        self.logger.info('---------------------------------')
        self.logger.info('datasets len: %s' % len(train_dataset))
        # If the Dataset becomes a stream, consider utils.DataLoaderX
        # (prefetch_generator):
        # https://github.com/IgorSusmelj/pytorch-styleguide/issues/5
        self.train_iter = DataLoader(
            train_dataset, batch_size=args.batch_size,
            collate_fn=collate, shuffle=args.shuffle_data)

    valid_dataset = utils.PersonaDataset(
        self.vocab, args.max_seq_length, args.limit_example_length,
        data_path=args.data_path, cache_path=args.cache_path,
        data_processer=chat_processer(), mode='valid_char')
    self.valid_iter = DataLoader(
        valid_dataset, batch_size=args.batch_size,
        collate_fn=collate, shuffle=False)

    test_dataset = utils.PersonaDataset(
        self.vocab, args.max_seq_length, args.limit_example_length,
        data_path=args.data_path, cache_path=args.cache_path,
        data_processer=chat_processer(), mode='test_char')
    self.test_iter = DataLoader(
        test_dataset, batch_size=args.batch_size,
        collate_fn=collate, shuffle=False)