def process(self, ds):
        """Join and tokenize both text columns of ``ds.items`` in place.

        Each of the two columns is first collapsed to one string per row via
        ``_join_texts``; the result is then tokenized chunk by chunk and
        ``ds.items`` is replaced by the stacked per-column token lists.
        """
        # Collapse each column's fields into single strings.
        joined = [
            _join_texts(ds.items[:, col], self.mark_fields,
                        self.include_bos, self.include_eos)
            for col in (0, 1)
        ]
        ds.items = np.stack(joined, axis=-1)

        # Tokenize in chunks of `self.chunksize` rows to bound memory use.
        toks = ([], [])
        for start in progress_bar(range(0, len(ds), self.chunksize),
                                  leave=False):
            stop = start + self.chunksize
            for col in (0, 1):
                toks[col].extend(
                    self.tokenizer.process_all(ds.items[start:stop, col]))
        ds.items = np.stack(toks, axis=-1)
# Ejemplo n.º 2
# 0
    def process(self, ds):
        """Run tabular preprocessing on ``ds``, then tokenize and
        numericalize its text columns, building a vocab on first use.

        ``ds.preprocessed`` is held ``False`` until the text work is done,
        since the tabular superclass may have set it to ``True``.
        """
        super().process(ds)
        ds.preprocessed = False

        if not len(ds.text_cols):
            # No text columns at all: leave empty placeholders.
            ds.text, ds.vocab, ds.text_ids = None, None, []
        else:
            # Collapse the text column(s) to one string per row.
            texts = _join_texts(ds.inner_df[ds.text_cols].values,
                                (len(ds.text_cols) > 1))

            # Tokenize chunk by chunk to bound memory use.
            all_toks = []
            for start in progress_bar(range(0, len(ds), self.chunksize),
                                      leave=False):
                all_toks.extend(
                    self.tokenizer.process_all(
                        texts[start:start + self.chunksize]))
            ds.text = all_toks

            # Build the vocab only once; reuse it on subsequent datasets.
            if self.vocab is None:
                self.vocab = Vocab.create(ds.text, self.max_vocab,
                                          self.min_freq)
            ds.vocab = self.vocab

            # Map every token list to an int64 id array.
            ds.text_ids = [
                np.array(self.vocab.numericalize(doc), dtype=np.int64)
                for doc in ds.text
            ]

        ds.preprocessed = True
# Ejemplo n.º 3
# 0
    def process_one(self, item):
        """Turn one row (``item`` must be a ``pd.Series``) into a
        ``MixedTabularLine`` holding categorical codes, continuous values,
        column names and the numericalized text.
        """
        # --- tabular part (mirrors tabular.data) -------------------------
        # Duplicate the row so the procs operate on a 2-row DataFrame.
        df = pd.DataFrame([item, item])
        for proc in self.procs:
            proc(df, test=True)

        if len(self.cat_names) != 0:
            cat_cols = [c.cat.codes.values
                        for _, c in df[self.cat_names].items()]
            # +1 presumably shifts pandas' -1 "missing" code to 0 — verify
            # against how the embedding layer indexes categories.
            codes = np.stack(cat_cols, 1).astype(np.int64) + 1
        else:
            codes = [[]]

        if len(self.cont_names) != 0:
            cont_cols = [c.astype('float32').values
                         for _, c in df[self.cont_names].items()]
            conts = np.stack(cont_cols, 1)
        else:
            conts = [[]]

        classes = None
        col_names = (list(df[self.cat_names].columns.values) +
                     list(df[self.cont_names].columns.values))

        # --- text part (custom addition) ---------------------------------
        if len(self.txt_col_names) != 0:
            txt = _join_texts(df[self.txt_col_names].values,
                              (len(self.txt_col_names) > 1))
            txt_toks = self.tokenizer._process_all_1(txt)[0]
            txt_ids = np.array(self.vocab.numericalize(txt_toks),
                               dtype=np.int64)
        else:
            txt_toks, txt_ids = None, [[]]

        # Only the first duplicated row's codes/conts are kept.
        return MixedTabularLine(codes[0], conts[0], classes, col_names,
                                txt_ids, self.txt_col_names, txt_toks)