def transform(self, data_pack: DataPack, verbose=1) -> DataPack:
        """
        Build the `letter-ngram` representation of a data pack.

        The method works on the object returned by ``data_pack.copy()`` and
        applies every step in place on that copy:

        1. the default processor units, chained, on the text;
        2. the left / right fixed-length units on their respective side;
        3. letter n-gram extraction (``reduce_dim=False``), followed by word
           hashing when ``self._with_word_hashing`` is set.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity, forwarded to every `apply_on_text` call.

        :return: Transformed data as :class:`DataPack` object.
        """
        result = data_pack.copy()
        in_place = dict(inplace=True, verbose=verbose)

        # Step 1: shared default pipeline.
        base_units = self._default_processor_units()
        result.apply_on_text(chain_transform(base_units), **in_place)

        # Step 2: each side has its own fixed-length unit.
        result.apply_on_text(self._left_fixedlength_unit.transform,
                             mode='left', **in_place)
        result.apply_on_text(self._right_fixedlength_unit.transform,
                             mode='right', **in_place)

        # Step 3: letter n-grams, optionally hashed with the fitted
        # vocabulary's term index.
        ngram_units = [processor_units.NgramLetterUnit(reduce_dim=False)]
        if self._with_word_hashing:
            vocab_state = self._context['vocab_unit'].state
            hashing_unit = processor_units.WordHashingUnit(
                vocab_state['term_index'])
            ngram_units.append(hashing_unit)
        result.apply_on_text(chain_transform(ngram_units), **in_place)

        return result
Beispiel #2
0
    def fit(self, data_pack: DataPack, verbose=1):
        """
        Fit pre-processing context for transformation.

        Runs the default processor units over the text, fits the filter
        unit on the right side, applies it to the right side, and then
        builds the vocabulary from the filtered data. The following keys of
        ``self._context`` are written: ``'filter_unit'``, ``'vocab_unit'``,
        ``'vocab_size'`` and ``'input_shapes'``.

        :param data_pack: data_pack to be preprocessed.
        :param verbose: Verbosity.
        :return: :class:`BasicPreprocessor` instance.
        """
        default_pipeline = chain_transform(self._default_processor_units())
        processed = data_pack.apply_on_text(default_pipeline,
                                            verbose=verbose)

        # Fit the filter on the right-hand text only, then apply it there.
        filter_unit = build_unit_from_data_pack(
            self._filter_unit,
            processed,
            flatten=False,
            mode='right',
            verbose=verbose,
        )
        processed = processed.apply_on_text(filter_unit.transform,
                                            mode='right',
                                            verbose=verbose)
        self._context['filter_unit'] = filter_unit

        # The vocabulary is built after filtering.
        fitted_vocab = build_vocab_unit(processed, verbose=verbose)
        self._context['vocab_unit'] = fitted_vocab
        # One more than the number of indexed terms
        # (presumably reserves an index for padding -- TODO confirm).
        term_count = len(fitted_vocab.state['term_index'])
        self._context['vocab_size'] = term_count + 1

        left_shape = (self._fixed_length_left, )
        right_shape = (self._fixed_length_right, )
        self._context['input_shapes'] = [left_shape, right_shape]

        return self
Beispiel #3
0
    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """
        Apply transformation on data, create `tri-letter` representation.

        The default processor units are extended with the fitted vocabulary
        unit (read from ``self._context['vocab_unit']``) and a fixed-length
        unit of length 30 with ``pad_mode='post'``; the chained pipeline is
        then applied to the text.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        pipeline_units = self._default_processor_units()
        pipeline_units.extend((
            self._context['vocab_unit'],
            processor_units.FixedLengthUnit(text_length=30, pad_mode='post'),
        ))
        pipeline = chain_transform(pipeline_units)
        return data_pack.apply_on_text(pipeline, verbose=verbose)
Beispiel #4
0
    def fit(self, data_pack: DataPack, verbose: int = 1):
        """
        Fit pre-processing context for transformation.

        Applies the default processor units to the text and builds a
        vocabulary unit from the result, which is stored under
        ``self._context['vocab_unit']``.

        :param data_pack: data_pack to be preprocessed.
        :param verbose: Verbosity.
        :return: :class:`NaivePreprocessor` instance.
        """
        pipeline = chain_transform(self._default_processor_units())
        processed = data_pack.apply_on_text(pipeline, verbose=verbose)
        self._context['vocab_unit'] = build_vocab_unit(processed,
                                                       verbose=verbose)
        return self
Beispiel #5
0
    def fit(self, data_pack: DataPack, verbose=1):
        """
        Fit pre-processing context for transformation.

        Applies the default processor units, builds a vocabulary unit from
        the processed text and records it in ``self._context['vocab_unit']``.
        ``self._context['input_shapes']`` is set to two identical 1-tuples
        holding the term-index size plus one.

        :param data_pack: data_pack to be preprocessed.
        :param verbose: Verbosity.
        :return: :class:`DSSMPreprocessor` instance.
        """
        pipeline = chain_transform(self._default_processor_units())
        processed = data_pack.apply_on_text(pipeline, verbose=verbose)
        fitted_vocab = build_vocab_unit(processed, verbose=verbose)

        self._context['vocab_unit'] = fitted_vocab
        # Both inputs share the same dimensionality.
        input_dim = len(fitted_vocab.state['term_index']) + 1
        self._context['input_shapes'] = [(input_dim, ), (input_dim, )]
        return self
Beispiel #6
0
    def transform(self, data_pack: DataPack, verbose=1) -> DataPack:
        """
        Apply transformation on data, create `tri-letter` representation.

        Works on the object returned by ``data_pack.copy()``. The default
        processor units are applied in place on that copy; when
        ``self._with_word_hashing`` is set, a word-hashing unit built from
        the fitted vocabulary's term index is appended to the pipeline
        first.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        result = data_pack.copy()
        pipeline_units = self._default_processor_units()
        if self._with_word_hashing:
            vocab_state = self._context['vocab_unit'].state
            hashing_unit = processor_units.WordHashingUnit(
                vocab_state['term_index'])
            pipeline_units.append(hashing_unit)
        pipeline = chain_transform(pipeline_units)
        result.apply_on_text(pipeline, inplace=True, verbose=verbose)
        return result
    def fit(self, data_pack: DataPack, verbose=1):
        """
        Fit pre-processing context for transformation.

        The default processor units plus a letter n-gram unit are applied
        to the text, and a vocabulary unit is built from the result and
        stored in ``self._context['vocab_unit']``.
        ``self._context['input_shapes']`` is set to
        ``(fixed_length, vocab_size)`` for the left and the right side,
        where ``vocab_size`` is the term-index size plus one.

        :param data_pack: Data_pack to be preprocessed.
        :param verbose: Verbosity.
        :return: :class:`CDSSMPreprocessor` instance.
        """
        fit_units = self._default_processor_units()
        fit_units.append(processor_units.NgramLetterUnit())
        pipeline = chain_transform(fit_units)
        processed = data_pack.apply_on_text(pipeline, verbose=verbose)
        fitted_vocab = build_vocab_unit(processed, verbose=verbose)

        self._context['vocab_unit'] = fitted_vocab
        n_terms = len(fitted_vocab.state['term_index']) + 1
        left_shape = (self._fixed_length_left, n_terms)
        right_shape = (self._fixed_length_right, n_terms)
        self._context['input_shapes'] = [left_shape, right_shape]
        return self
Beispiel #8
0
    def transform(self, data_pack: DataPack, verbose=1) -> DataPack:
        """
        Apply transformation on data, create fixed length representation.

        Works on the object returned by ``data_pack.copy()`` and applies, in
        place and in this order: the default processor units, the fitted
        filter unit (right side only), the fitted vocabulary unit (both
        sides), text-length columns, and the left / right fixed-length
        units. Finally the ``length_left`` / ``length_right`` columns are
        capped at the configured fixed lengths.

        :param data_pack: Inputs to be preprocessed.
        :param verbose: Verbosity.

        :return: Transformed data as :class:`DataPack` object.
        """
        def _capped_at(limit):
            # Keep values that compare `<= limit`; anything else becomes
            # `limit`.
            return lambda length: length if length <= limit else limit

        result = data_pack.copy()
        in_place = dict(inplace=True, verbose=verbose)

        default_units = self._default_processor_units()
        result.apply_on_text(chain_transform(default_units), **in_place)

        # Fitted units come from the context populated by `fit`.
        result.apply_on_text(self._context['filter_unit'].transform,
                             mode='right', **in_place)
        result.apply_on_text(self._context['vocab_unit'].transform,
                             mode='both', **in_place)

        # Lengths are recorded before the text is forced to a fixed length.
        result.append_text_length(**in_place)

        result.apply_on_text(self._left_fixedlength_unit.transform,
                             mode='left', **in_place)
        result.apply_on_text(self._right_fixedlength_unit.transform,
                             mode='right', **in_place)

        # Recorded lengths must not exceed the fixed lengths.
        left_limit = self._fixed_length_left
        right_limit = self._fixed_length_right
        left_lengths = result.left['length_left']
        result.left['length_left'] = left_lengths.apply(
            _capped_at(left_limit))
        right_lengths = result.right['length_right']
        result.right['length_right'] = right_lengths.apply(
            _capped_at(right_limit))
        return result