def transform(self, data_pack: DataPack, verbose=1) -> DataPack:
    """
    Build the `letter-ngram` representation of the texts in a data pack.

    Every step is applied in place on a copy obtained from
    ``data_pack.copy()``; that copy is what gets returned.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity, forwarded to every ``apply_on_text`` call.
    :return: Transformed data as :class:`DataPack` object.
    """
    packed = data_pack.copy()
    # Keyword arguments shared by every in-place step below.
    in_place = {'inplace': True, 'verbose': verbose}

    # 1) Default processor units, chained into a single callable.
    base_units = self._default_processor_units()
    packed.apply_on_text(chain_transform(base_units), **in_place)

    # 2) Fixed-length units, one per side.
    packed.apply_on_text(
        self._left_fixedlength_unit.transform, mode='left', **in_place
    )
    packed.apply_on_text(
        self._right_fixedlength_unit.transform, mode='right', **in_place
    )

    # 3) Letter n-grams, optionally followed by word hashing driven by the
    #    term index of the vocabulary unit stored in the context.
    letter_stage = [processor_units.NgramLetterUnit(reduce_dim=False)]
    if self._with_word_hashing:
        vocabulary = self._context['vocab_unit'].state['term_index']
        letter_stage.append(processor_units.WordHashingUnit(vocabulary))
    packed.apply_on_text(chain_transform(letter_stage), **in_place)

    return packed
def fit(self, data_pack: DataPack, verbose=1):
    """
    Fit the pre-processing context used later for transformation.

    Stores four entries in ``self._context``: ``'filter_unit'``,
    ``'vocab_unit'``, ``'vocab_size'`` and ``'input_shapes'``.

    :param data_pack: data_pack to be preprocessed.
    :param verbose: Verbosity.
    :return: This preprocessor instance (``self``).
    """
    pipeline = chain_transform(self._default_processor_units())
    cleaned = data_pack.apply_on_text(pipeline, verbose=verbose)

    # The filter unit is built from, and then applied to, the right side only.
    right_filter = build_unit_from_data_pack(
        self._filter_unit,
        cleaned,
        flatten=False,
        mode='right',
        verbose=verbose,
    )
    filtered = cleaned.apply_on_text(
        right_filter.transform, mode='right', verbose=verbose
    )
    self._context['filter_unit'] = right_filter

    # The vocabulary is built after filtering.
    vocabulary = build_vocab_unit(filtered, verbose=verbose)
    self._context['vocab_unit'] = vocabulary
    term_count = len(vocabulary.state['term_index'])
    self._context['vocab_size'] = term_count + 1

    self._context['input_shapes'] = [
        (length,)
        for length in (self._fixed_length_left, self._fixed_length_right)
    ]
    return self
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Apply the default units, the vocabulary unit and fixed-length padding.

    The chain is: default processor units, then the ``'vocab_unit'`` stored
    in the context, then a ``FixedLengthUnit`` with ``text_length=30`` and
    ``pad_mode='post'``.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.
    :return: Result of ``data_pack.apply_on_text`` as :class:`DataPack`
        object.
    """
    pipeline = self._default_processor_units()
    vocabulary = self._context['vocab_unit']
    padder = processor_units.FixedLengthUnit(text_length=30, pad_mode='post')
    for extra_unit in (vocabulary, padder):
        pipeline.append(extra_unit)

    return data_pack.apply_on_text(chain_transform(pipeline), verbose=verbose)
def fit(self, data_pack: DataPack, verbose: int = 1):
    """
    Fit the pre-processing context used later for transformation.

    Runs the default processor units over the texts, builds a vocabulary
    unit from the result and stores it under ``'vocab_unit'`` in
    ``self._context``.

    :param data_pack: data_pack to be preprocessed.
    :param verbose: Verbosity.
    :return: This preprocessor instance (``self``).
    """
    pipeline = chain_transform(self._default_processor_units())
    processed = data_pack.apply_on_text(pipeline, verbose=verbose)
    self._context['vocab_unit'] = build_vocab_unit(processed, verbose=verbose)
    return self
def fit(self, data_pack: DataPack, verbose=1):
    """
    Fit the pre-processing context used later for transformation.

    Stores the fitted vocabulary unit under ``'vocab_unit'`` and a pair of
    identical one-dimensional shapes under ``'input_shapes'``.

    :param data_pack: data_pack to be preprocessed.
    :param verbose: Verbosity.
    :return: This preprocessor instance (``self``).
    """
    pipeline = chain_transform(self._default_processor_units())
    processed = data_pack.apply_on_text(pipeline, verbose=verbose)

    vocabulary = build_vocab_unit(processed, verbose=verbose)
    self._context['vocab_unit'] = vocabulary

    # Input dimension is the number of indexed terms plus one.
    input_dim = len(vocabulary.state['term_index']) + 1
    self._context['input_shapes'] = [(input_dim,), (input_dim,)]
    return self
def transform(self, data_pack: DataPack, verbose=1) -> DataPack:
    """
    Apply the default units, optionally followed by word hashing.

    The work is done in place on a copy obtained from ``data_pack.copy()``;
    that copy is what gets returned.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.
    :return: Transformed data as :class:`DataPack` object.
    """
    packed = data_pack.copy()
    pipeline = self._default_processor_units()

    if self._with_word_hashing:
        # Word hashing uses the term index of the vocabulary unit stored
        # in the context.
        vocabulary = self._context['vocab_unit'].state['term_index']
        pipeline.append(processor_units.WordHashingUnit(vocabulary))

    packed.apply_on_text(
        chain_transform(pipeline), inplace=True, verbose=verbose
    )
    return packed
def fit(self, data_pack: DataPack, verbose=1):
    """
    Fit the pre-processing context used later for transformation.

    The vocabulary is built after the default units and an
    ``NgramLetterUnit`` have been applied. ``'input_shapes'`` holds one
    ``(fixed_length, vocab_size)`` pair per side.

    :param data_pack: Data_pack to be preprocessed.
    :param verbose: Verbosity.
    :return: This preprocessor instance (``self``).
    """
    pipeline = self._default_processor_units()
    pipeline.append(processor_units.NgramLetterUnit())
    processed = data_pack.apply_on_text(
        chain_transform(pipeline), verbose=verbose
    )

    vocabulary = build_vocab_unit(processed, verbose=verbose)
    self._context['vocab_unit'] = vocabulary

    # Vocabulary size is the number of indexed terms plus one.
    vocab_size = len(vocabulary.state['term_index']) + 1
    self._context['input_shapes'] = [
        (length, vocab_size)
        for length in (self._fixed_length_left, self._fixed_length_right)
    ]
    return self
def transform(self, data_pack: DataPack, verbose=1) -> DataPack:
    """
    Apply transformation on data, create fixed length representation.

    Every step is applied in place on a copy obtained from
    ``data_pack.copy()``. After the fixed-length units have run, the
    ``length_left`` / ``length_right`` columns are capped at the
    corresponding fixed length.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity, forwarded to every in-place step.
    :return: Transformed data as :class:`DataPack` object.
    """

    def _capped_at(limit):
        # Keep values that are <= limit; replace everything else by limit.
        return lambda length: length if length <= limit else limit

    packed = data_pack.copy()
    # Keyword arguments shared by every in-place step below.
    in_place = {'inplace': True, 'verbose': verbose}

    base_units = self._default_processor_units()
    packed.apply_on_text(chain_transform(base_units), **in_place)

    # Filter the right side, then run the vocabulary unit over both sides.
    packed.apply_on_text(
        self._context['filter_unit'].transform, mode='right', **in_place
    )
    packed.apply_on_text(
        self._context['vocab_unit'].transform, mode='both', **in_place
    )

    # Text lengths are appended before the fixed-length units are applied.
    packed.append_text_length(**in_place)
    packed.apply_on_text(
        self._left_fixedlength_unit.transform, mode='left', **in_place
    )
    packed.apply_on_text(
        self._right_fixedlength_unit.transform, mode='right', **in_place
    )

    left_limit = self._fixed_length_left
    right_limit = self._fixed_length_right
    packed.left['length_left'] = packed.left['length_left'].apply(
        _capped_at(left_limit)
    )
    packed.right['length_right'] = packed.right['length_right'].apply(
        _capped_at(right_limit)
    )
    return packed