Example 1
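(These `build_dataloaders` variants are methods of a trainer class; `DataLoader` is presumably `torch.utils.data.DataLoader`, while `datasets`, `utils`, and names like `ChatDataProcesser` and `PersonaDataset` are project-specific modules not shown here.)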
    def build_dataloaders(self):
        args = self.args
        model_config = self.model_config
        is_mlm = self.model_config.auxiliary_task == 'MLM'
        gb = lambda batch: datasets.generate_batch(
            batch, self.vocab, self.persona_vocab, is_mlm)

        dp = datasets.ChatDataProcesser(
            limit_length=args.limit_example_length,
            max_seq_length=model_config.max_seq_length,
            max_context_size=model_config.max_context_size,
            vocab=self.vocab,
            persona_vocab=self.persona_vocab,
            tokenizer=self.tokenizer)
        ds = utils.PersonaDataset(self.vocab,
                                  model_config.max_seq_length,
                                  args.limit_example_length,
                                  data_path=args.data_path,
                                  cache_path=args.cache_path,
                                  data_processer=dp,
                                  mode='test_char')
        self.test_iter = DataLoader(ds,
                                    batch_size=args.batch_size,
                                    collate_fn=gb,
                                    shuffle=False)
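The `is_mlm` flag suggests the collate function optionally applies masked-language-model corruption before batching. `datasets.generate_batch` itself is not shown, so the following is only a sketch of standard BERT-style masking; `mask_tokens_sketch`, `mask_idx`, `vocab_size`, and the 80/10/10 split are assumptions, not code from this project:

import random

def mask_tokens_sketch(ids, mask_idx, vocab_size, mlm_prob=0.15):
    # BERT-style masking sketch; the project's actual MLM handling inside
    # generate_batch is not shown, so names and ratios here are assumptions.
    ids = list(ids)
    labels = [-100] * len(ids)  # -100 is ignored by nn.CrossEntropyLoss
    for i in range(len(ids)):
        if random.random() < mlm_prob:
            labels[i] = ids[i]
            r = random.random()
            if r < 0.8:
                ids[i] = mask_idx                      # 80%: replace with [MASK]
            elif r < 0.9:
                ids[i] = random.randrange(vocab_size)  # 10%: random token
            # remaining 10%: keep the original token
    return ids, labels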
Example 2
    def build_dataloaders(self):
        args = self.args
        gb = lambda batch: datasets.generate_batch(batch, self.pad_idx)

        if args.n_epochs_early_stage > 0:
            ds = datasets.PersonaDataset(
                self.vocab, args.max_seq_length,
                data_path=args.data_path, cache_path=args.cache_path,
                limit_length=args.limit_example_length, mode='early_stage_train')
            self.early_stage_train_iter = DataLoader(
                ds, batch_size=args.batch_size,
                collate_fn=gb, shuffle=args.shuffle_data)

        ds = datasets.PersonaDataset(
            self.vocab, args.max_seq_length,
            data_path=args.data_path, cache_path=args.cache_path,
            limit_length=args.limit_example_length, mode='train')
        self.train_iter = DataLoader(
            ds, batch_size=args.batch_size,
            collate_fn=gb, shuffle=args.shuffle_data)
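In this variant `generate_batch` receives only `self.pad_idx`, which suggests a plain pad-and-stack collate. A minimal sketch under that assumption; `generate_batch_sketch` is a hypothetical stand-in, and the real field layout of each example is unknown:

import torch

def generate_batch_sketch(batch, pad_idx):
    # Assumes each example is a (src_ids, tgt_ids) pair of token-id lists;
    # the project's real generate_batch may return a richer structure.
    srcs, tgts = zip(*batch)

    def pad(seqs):
        n = max(len(s) for s in seqs)
        return torch.tensor([list(s) + [pad_idx] * (n - len(s)) for s in seqs],
                            dtype=torch.long)

    return pad(srcs), pad(tgts)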
Example 3
    def build_dataloaders(self):
        args = self.args
        model_config = self.model_config
        gb = lambda batch: datasets.generate_batch(batch, self.pad_idx)

        dp = datasets.ChatDataProcesser(
            limit_length=args.limit_example_length,
            max_seq_length=args.max_seq_length,
            max_context_size=model_config.max_context_size)

        ds = utils.PersonaDataset(self.vocab,
                                  model_config.max_seq_length,
                                  args.limit_example_length,
                                  data_path=args.data_path,
                                  cache_path=args.cache_path,
                                  data_processer=dp,
                                  mode='test_char')
        self.test_iter = DataLoader(ds,
                                    batch_size=args.batch_size,
                                    collate_fn=gb,
                                    shuffle=False)
Example 4
    def build_dataloaders(self):
        args = self.args
        gb = lambda batch: datasets.generate_batch(batch, self.pad_idx)
        gb_lm = lambda batch: datasets.generate_lm_batch(batch, self.pad_idx)

        if args.n_epochs_early_stage > 0:
            dp = datasets.LMDataProcesser(
                limit_length=args.limit_example_length,
                max_seq_length=args.max_seq_length)
            ds = utils.PersonaDataset(self.vocab,
                                      args.max_seq_length,
                                      args.limit_example_length,
                                      data_path=args.data_path,
                                      cache_path=args.cache_path,
                                      data_processer=dp,
                                      mode='train_lm')
            self.train_iter = DataLoader(ds,
                                         batch_size=args.batch_size,
                                         collate_fn=gb_lm,
                                         shuffle=True)
        else:
            dp = datasets.ChatDataProcesser(
                limit_length=args.limit_example_length,
                max_seq_length=args.max_seq_length,
                max_context_size=args.max_context_size)
            ds = utils.PersonaDataset(self.vocab,
                                      args.max_seq_length,
                                      args.limit_example_length,
                                      data_path=args.data_path,
                                      cache_path=args.cache_path,
                                      data_processer=dp,
                                      mode='train_char')
            self.train_iter = DataLoader(ds,
                                         batch_size=args.batch_size,
                                         collate_fn=gb,
                                         shuffle=args.shuffle_data)

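        # NOTE: dp below is whichever processer the branch above created; in
        # the early-stage path the LMDataProcesser would also be used for the
        # 'valid_char'/'test_char' datasets, which looks unintended.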
        ds = utils.PersonaDataset(self.vocab,
                                  args.max_seq_length,
                                  args.limit_example_length,
                                  data_path=args.data_path,
                                  cache_path=args.cache_path,
                                  data_processer=dp,
                                  mode='valid_char')
        self.valid_iter = DataLoader(ds,
                                     batch_size=args.batch_size,
                                     collate_fn=gb,
                                     shuffle=False)

        ds = utils.PersonaDataset(self.vocab,
                                  args.max_seq_length,
                                  args.limit_example_length,
                                  data_path=args.data_path,
                                  cache_path=args.cache_path,
                                  data_processer=dp,
                                  mode='test_char')
        self.test_iter = DataLoader(ds,
                                    batch_size=args.batch_size,
                                    collate_fn=gb,
                                    shuffle=False)
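For context, a hypothetical consumer of the loaders built above; `train_epoch`, the `(src, tgt)` unpacking, and a model that returns its own loss are all assumptions, not code from this project:

    def train_epoch(self):
        # Hypothetical sketch: the actual batch structure depends on what
        # generate_batch returns in this codebase.
        self.model.train()
        for step, (src, tgt) in enumerate(self.train_iter):
            self.optimizer.zero_grad()
            loss = self.model(src, tgt)  # assumes the model returns a loss
            loss.backward()
            self.optimizer.step()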
Example 5
    def build_dataloaders(self):
        args = self.args
        is_mlm = self.args.auxiliary_task == 'MLM'
        gb = lambda batch: datasets.generate_batch(
            batch, self.vocab, self.persona_vocab, is_mlm)
        gb_lm = lambda batch: datasets.generate_lm_batch(
            batch, self.vocab, is_mlm)

        if args.n_epochs_early_stage > 0:
            dp = datasets.LMDataProcesser(
                limit_length=args.limit_example_length,
                max_seq_length=args.max_seq_length,
                tokenizer=self.tokenizer)
            ds = utils.PersonaDataset(self.vocab,
                                      args.max_seq_length,
                                      args.limit_example_length,
                                      data_path=args.data_path,
                                      cache_path=args.cache_path,
                                      data_processer=dp,
                                      mode='train_lm')
            self.logger.info('---------------------------------')
            self.logger.info('datasets len: %s' % len(ds))
            self.train_iter = DataLoader(ds,
                                         batch_size=args.batch_size,
                                         collate_fn=gb_lm,
                                         shuffle=True)
        else:
            dp = datasets.ChatDataProcesser(
                limit_length=args.limit_example_length,
                max_seq_length=args.max_seq_length,
                max_context_size=args.max_context_size,
                vocab=self.vocab,
                persona_vocab=self.persona_vocab,
                tokenizer=self.tokenizer)
            ds = utils.PersonaDataset(self.vocab,
                                      args.max_seq_length,
                                      args.limit_example_length,
                                      data_path=args.data_path,
                                      cache_path=args.cache_path,
                                      data_processer=dp,
                                      mode='train_char')
            self.logger.info('---------------------------------')
            self.logger.info('datasets len: %s' % len(ds))
            # when the Dataset is a stream, try utils.DataLoaderX (prefetch_generator); see https://github.com/IgorSusmelj/pytorch-styleguide/issues/5
            self.train_iter = DataLoader(ds,
                                         batch_size=args.batch_size,
                                         collate_fn=gb,
                                         shuffle=args.shuffle_data)

            dp = datasets.ChatDataProcesser(
                limit_length=args.limit_example_length,
                max_seq_length=args.max_seq_length,
                max_context_size=args.max_context_size,
                vocab=self.vocab,
                persona_vocab=self.persona_vocab,
                tokenizer=self.tokenizer)
            ds = utils.PersonaDataset(self.vocab,
                                      args.max_seq_length,
                                      args.limit_example_length,
                                      data_path=args.data_path,
                                      cache_path=args.cache_path,
                                      data_processer=dp,
                                      mode='valid_char')
            self.valid_iter = DataLoader(ds,
                                         batch_size=args.batch_size,
                                         collate_fn=gb,
                                         shuffle=False)

            dp = datasets.ChatDataProcesser(
                limit_length=args.limit_example_length,
                max_seq_length=args.max_seq_length,
                max_context_size=args.max_context_size,
                vocab=self.vocab,
                persona_vocab=self.persona_vocab,
                tokenizer=self.tokenizer)
            ds = utils.PersonaDataset(self.vocab,
                                      args.max_seq_length,
                                      args.limit_example_length,
                                      data_path=args.data_path,
                                      cache_path=args.cache_path,
                                      data_processer=dp,
                                      mode='test_char')
            self.test_iter = DataLoader(ds,
                                        batch_size=args.batch_size,
                                        collate_fn=gb,
                                        shuffle=False)
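The comment in Example 5 points at `utils.DataLoaderX` for streaming datasets. The linked issue describes a `DataLoader` subclass that wraps iteration in `prefetch_generator.BackgroundGenerator`, so the next batch is collated in a background thread while the current one trains; a sketch of that pattern follows (whether `utils.DataLoaderX` matches it exactly is an assumption):

from prefetch_generator import BackgroundGenerator
from torch.utils.data import DataLoader

class DataLoaderX(DataLoader):
    # Drop-in DataLoader replacement that prefetches batches in a
    # background thread so the GPU does not wait on collation.
    def __iter__(self):
        return BackgroundGenerator(super().__iter__())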