def transform(self, data_pack: DataPack, verbose=1) -> DataPack:
    """
    Build the `letter-ngram` representation for a data pack.

    All processing happens on ``data_pack.copy()``; every step below is
    applied in place on that copy, which is what gets returned.

    Steps, in order:

    1. the units from ``_default_processor_units`` chained together;
    2. the left fixed-length unit on ``mode='left'`` text, then the right
       fixed-length unit on ``mode='right'`` text;
    3. ``NgramLetterUnit(reduce_dim=False)``, followed by a
       ``WordHashingUnit`` when ``self._with_word_hashing`` is truthy.

    The word-hashing step reads ``term_index`` from the vocab unit stored
    in ``self._context['vocab_unit']``, which `fit` populates.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity, forwarded to every ``apply_on_text`` call.
    :return: Transformed data as :class:`DataPack` object.
    """
    result = data_pack.copy()
    # Options shared by every in-place pass over the copied pack.
    in_place = dict(inplace=True, verbose=verbose)

    # Pass 1: shared default pipeline on all text.
    base_pipeline = chain_transform(self._default_processor_units())
    result.apply_on_text(base_pipeline, **in_place)

    # Pass 2: side-specific fixed-length units, left first and then right.
    fix_left = self._left_fixedlength_unit.transform
    result.apply_on_text(fix_left, mode='left', **in_place)
    fix_right = self._right_fixedlength_unit.transform
    result.apply_on_text(fix_right, mode='right', **in_place)

    # Pass 3: letter n-grams with reduce_dim=False, optionally followed by
    # word hashing against the fitted vocabulary.
    tail_units = [processor_units.NgramLetterUnit(reduce_dim=False)]
    if self._with_word_hashing:
        vocab_state = self._context['vocab_unit'].state
        tail_units.append(
            processor_units.WordHashingUnit(vocab_state['term_index'])
        )
    result.apply_on_text(chain_transform(tail_units), **in_place)

    return result
def _default_processor_units(cls) -> list:
    """
    Prepare the processor units shared by `fit` and `transform`.

    Letter n-gram extraction is intentionally NOT part of this list,
    because both callers add their own n-gram step:

    * `fit` appends ``NgramLetterUnit()`` before building the vocabulary;
    * `transform` runs ``NgramLetterUnit(reduce_dim=False)`` only after
      the fixed-length units have been applied.

    Including an ``NgramLetterUnit`` here as well would make both callers
    extract n-grams twice and would feed the fixed-length units the output
    of the first n-gram pass.

    :return: A new list of processor units on every call, so callers may
        extend it freely.
    """
    # NOTE(review): the first parameter is named `cls`, but no
    # `@classmethod` decorator is visible at this point in the file.
    # Confirm the intended binding.
    return [
        processor_units.TokenizeUnit(),
        processor_units.LowercaseUnit(),
        processor_units.PuncRemovalUnit(),
        processor_units.StopRemovalUnit(),
    ]
def fit(self, data_pack: DataPack, verbose=1):
    """
    Fit the pre-processing context used later by `transform`.

    The units from ``_default_processor_units`` followed by an
    ``NgramLetterUnit`` are chained and applied to the text of
    `data_pack`. A vocabulary unit is then built from the result and
    stored in ``self._context['vocab_unit']``.
    ``self._context['input_shapes']`` is set to one
    ``(fixed_length, vocab_size)`` pair for the left text and one for
    the right text.

    :param data_pack: Data pack to fit the context on.
    :param verbose: Verbosity, forwarded to ``apply_on_text`` and
        ``build_vocab_unit``.
    :return: This preprocessor instance (``self``).
    """
    pipeline = [
        *self._default_processor_units(),
        processor_units.NgramLetterUnit(),
    ]
    # `inplace` is not passed here; the returned pack is what the
    # vocabulary is built from.
    processed = data_pack.apply_on_text(
        chain_transform(pipeline), verbose=verbose
    )

    fitted_vocab = build_vocab_unit(processed, verbose=verbose)
    self._context['vocab_unit'] = fitted_vocab

    # One more than the number of indexed terms; presumably this reserves
    # a slot for padding/unknown terms (TODO confirm).
    vocab_size = len(fitted_vocab.state['term_index']) + 1
    self._context['input_shapes'] = [
        (fixed_length, vocab_size)
        for fixed_length in (
            self._fixed_length_left,
            self._fixed_length_right,
        )
    ]
    return self