def process(self):
    """Tokenize and numericalize the corpus, then build the dataset splits.

    Builds the vocabulary over the *entire* corpus (train text plus test
    text, when a test CSV is present) so that test-only tokens are not
    mapped to unknown, then produces ``(tokens, labels)`` pairs for the
    train / validation / test splits according to ``self.valid_pct`` and
    ``self.test_csv``.

    Returns:
        tuple: ``(vocab, trn_ds, vld_ds, tst_ds)`` where each ``*_ds`` is a
        ``(numericalized_tokens, labels)`` pair; unused splits are
        ``([], [])`` and the test split carries no labels.
    """
    tok = Tokenizer()
    # Consider the entire corpus as text (train + test text columns).
    # NOTE(review): this first check uses truthiness while the branches
    # below compare ``self.test_csv is None`` — an empty-string test_csv
    # would behave inconsistently; confirm intended semantics upstream.
    if self.test_csv:
        # BUG FIX: ``.values`` was missing on the test selection; with a
        # list of text columns, iterating the raw DataFrame yields column
        # labels instead of rows. ``.values`` matches the train side.
        text = list(self.df.loc[:, self.text_cols].values) + list(
            self.test_df.loc[:, self.text_cols].values)
    else:
        text = list(self.df.loc[:, self.text_cols].values)

    self.tokens = [tok.tokenizer(x) for x in text]
    self.vocab = Vocab.create(self.tokens, self.max_vocab, self.min_freq)
    self.ntokens = [self.vocab.numericalize(t) for t in self.tokens]

    # Train labels, shared by every branch below.
    labels = self.df.loc[:, self.label_cols].values

    # only full training
    if self.valid_pct == 0 and self.test_csv is None:
        self.trn_ds = (self.ntokens, labels)
        self.vld_ds = ([], [])
        # BUG FIX: was ``self.test_ds``, so the returned ``self.tst_ds``
        # raised AttributeError whenever this branch was taken.
        self.tst_ds = ([], [])
    # holdout
    elif self.valid_pct > 0 and self.test_csv is None:
        # First ``self.cut`` rows become the validation set.
        self.trn_ds = (self.ntokens[self.cut:], labels[self.cut:])
        self.vld_ds = (self.ntokens[:self.cut], labels[:self.cut])
        self.tst_ds = ([], [])
    # holdout and test prediction
    elif self.valid_pct > 0 and self.test_csv is not None:
        # Train rows come first in the combined corpus; everything past
        # ``len(self.df)`` is unlabeled test text.
        self.trn_tokens = self.ntokens[:len(self.df)]
        self.tst_ds = (self.ntokens[len(self.df):], [])
        self.trn_ds = (self.trn_tokens[self.cut:], labels[self.cut:])
        self.vld_ds = (self.trn_tokens[:self.cut], labels[:self.cut])
    # full training and test prediction
    else:
        self.trn_ds = (self.ntokens[:len(self.df)], labels)
        self.vld_ds = ([], [])
        self.tst_ds = (self.ntokens[len(self.df):], [])

    return self.vocab, self.trn_ds, self.vld_ds, self.tst_ds