コード例 #1
0
    def process(self):
        """Tokenize the corpus, build the vocabulary and assemble the splits.

        The corpus is the text column(s) of ``self.df`` followed, when
        ``self.test_csv`` is set, by the same column(s) of ``self.test_df``.
        Every entry is tokenized, a vocabulary is created from all tokens
        (bounded by ``self.max_vocab`` / ``self.min_freq``), and the tokens
        are numericalized with that vocabulary.

        Side effects: sets ``self.tokens``, ``self.vocab``, ``self.ntokens``,
        ``self.trn_tokens``, ``self.trn_ds``, ``self.vld_ds`` and
        ``self.tst_ds``.

        Returns:
            A tuple ``(vocab, trn_ds, vld_ds, tst_ds)``. Each dataset is a
            ``(numericalized_tokens, labels)`` pair:

            * ``trn_ds`` -- training rows; when ``self.valid_pct > 0`` the
              first ``self.cut`` rows are held out and excluded from it.
            * ``vld_ds`` -- the first ``self.cut`` training rows when
              ``self.valid_pct > 0``, otherwise ``([], [])``.
            * ``tst_ds`` -- the test rows with an empty label list, or
              ``([], [])`` when no test data was tokenized.
        """
        tok = Tokenizer()

        # Consider the entire corpus as text (train + test text columns).
        text = list(self.df.loc[:, self.text_cols].values)
        if self.test_csv:
            # `.values` is required here as well: iterating a DataFrame
            # directly yields its column labels, not its rows.
            text += list(self.test_df.loc[:, self.text_cols].values)

        self.tokens = [tok.tokenizer(x) for x in text]
        self.vocab = Vocab.create(self.tokens, self.max_vocab, self.min_freq)

        self.ntokens = [self.vocab.numericalize(t) for t in self.tokens]

        # Training rows come first in the corpus; whatever follows is test
        # data (an empty list when no test text was appended above).
        n_train = len(self.df)
        self.trn_tokens = self.ntokens[:n_train]
        labels = self.df.loc[:, self.label_cols].values

        if self.valid_pct > 0:
            # Holdout: the first `self.cut` rows form the validation set.
            self.trn_ds = (self.trn_tokens[self.cut:], labels[self.cut:])
            self.vld_ds = (self.trn_tokens[:self.cut], labels[:self.cut])
        else:
            # Full training: no validation split.
            self.trn_ds = (self.trn_tokens, labels)
            self.vld_ds = ([], [])

        # Test rows carry no labels.
        self.tst_ds = (self.ntokens[n_train:], [])

        return self.vocab, self.trn_ds, self.vld_ds, self.tst_ds