Exemple #1
0
    def get_default_preprocessor(
        cls,
        truncated_mode: str = 'pre',
        truncated_length_left: typing.Optional[int] = None,
        truncated_length_right: typing.Optional[int] = None,
        filter_mode: str = 'df',
        filter_low_freq: float = 1,
        filter_high_freq: float = float('inf'),
        remove_stop_words: bool = False,
        ngram_size: typing.Optional[int] = 3,
    ) -> BasePreprocessor:
        """
        Build the default preprocessor for this model.

        Every option is forwarded unchanged, by keyword, to
        ``preprocessors.BasicPreprocessor``; refer to that class for the
        meaning of each setting. The resulting preprocessor's transform is
        expected to yield a data pack shaped correctly for training.

        :param truncated_mode: Passed through as ``truncated_mode``.
        :param truncated_length_left: Passed through as
            ``truncated_length_left``.
        :param truncated_length_right: Passed through as
            ``truncated_length_right``.
        :param filter_mode: Passed through as ``filter_mode``.
        :param filter_low_freq: Passed through as ``filter_low_freq``.
        :param filter_high_freq: Passed through as ``filter_high_freq``.
        :param remove_stop_words: Passed through as ``remove_stop_words``.
        :param ngram_size: Passed through as ``ngram_size``.
        :return: Default preprocessor.
        """
        # Gather the options once so the constructor call stays a one-liner.
        options = dict(
            truncated_mode=truncated_mode,
            truncated_length_left=truncated_length_left,
            truncated_length_right=truncated_length_right,
            filter_mode=filter_mode,
            filter_low_freq=filter_low_freq,
            filter_high_freq=filter_high_freq,
            remove_stop_words=remove_stop_words,
            ngram_size=ngram_size,
        )
        return preprocessors.BasicPreprocessor(**options)
Exemple #2
0
def test_drmm_padding(train_raw):
    """Check the batch shapes produced by ``callbacks.DRMMPadding``.

    Two configurations are exercised on the same dataset: fixed-length
    'pre' padding, and 'post' padding with no fixed lengths, where each
    batch is expected to be as wide as its longest sequence.
    """
    batch_size = 5
    fixed_length = 5
    bin_size = 30

    basic_preprocessor = preprocessors.BasicPreprocessor()
    processed_pack = basic_preprocessor.fit_transform(train_raw, verbose=0)

    # The histogram callback takes an embedding matrix built against the
    # term index of the fitted vocabulary.
    glove_embedding = load_from_file(embeddings.EMBED_10_GLOVE, mode='glove')
    vocab_unit = basic_preprocessor.context['vocab_unit']
    matrix = glove_embedding.build_matrix(vocab_unit.state['term_index'])
    histogram_callback = callbacks.Histogram(
        embedding_matrix=matrix,
        bin_size=bin_size,
        hist_mode='LCH',
    )
    dataset = Dataset(
        processed_pack,
        mode='point',
        callbacks=[histogram_callback],
    )

    # Case 1: fixed lengths with 'pre' padding -> every batch has one shape.
    fixed_pre_padding = callbacks.DRMMPadding(
        fixed_length_left=fixed_length,
        fixed_length_right=fixed_length,
        pad_mode='pre',
    )
    dataloader = DataLoader(
        dataset, batch_size=batch_size, callback=fixed_pre_padding)
    for batch in dataloader:
        inputs = batch[0]
        assert inputs['text_left'].shape == (batch_size, fixed_length)
        assert inputs['text_right'].shape == (batch_size, fixed_length)
        assert inputs['match_histogram'].shape == (
            batch_size, fixed_length, bin_size)

    # Case 2: 'post' padding without fixed lengths -> the width of each
    # batch is compared against the longest length reported in that batch.
    dynamic_post_padding = callbacks.DRMMPadding(pad_mode='post')
    dataloader = DataLoader(
        dataset, batch_size=batch_size, callback=dynamic_post_padding)
    for batch in dataloader:
        inputs = batch[0]
        left_lengths = inputs['length_left'].detach().cpu().numpy()
        right_lengths = inputs['length_right'].detach().cpu().numpy()
        longest_left = max(left_lengths)
        longest_right = max(right_lengths)
        assert inputs['text_left'].shape == (batch_size, longest_left)
        assert inputs['text_right'].shape == (batch_size, longest_right)
        assert inputs['match_histogram'].shape == (
            batch_size, longest_left, bin_size)
    def get_default_preprocessor(cls) -> BasePreprocessor:
        """
        Build the default preprocessor for this model.

        The returned preprocessor is created with no arguments, so it uses
        whatever defaults ``preprocessors.BasicPreprocessor`` defines. Its
        transform is expected to yield a data pack shaped correctly for
        training.

        :return: Default preprocessor.
        """
        default_preprocessor = preprocessors.BasicPreprocessor()
        return default_preprocessor
Exemple #4
0
 def get_default_preprocessor(
     cls,
     truncated_mode: str = 'pre',
     truncated_length_left: int = 10,
     truncated_length_right: int = 40,
     filter_mode: str = 'df',
     filter_low_freq: float = 1,
     filter_high_freq: float = float('inf'),
     remove_stop_words: bool = False,
     ngram_size: int = 3,
 ):
     """
     Build the default preprocessor for this model.

     All options are handed, by keyword and unmodified, to
     ``preprocessors.BasicPreprocessor``; see that class for what each
     setting controls.

     :return: Default preprocessor.
     """
     # Keyword name -> value, forwarded verbatim to the constructor.
     forwarded = {
         'truncated_mode': truncated_mode,
         'truncated_length_left': truncated_length_left,
         'truncated_length_right': truncated_length_right,
         'filter_mode': filter_mode,
         'filter_low_freq': filter_low_freq,
         'filter_high_freq': filter_high_freq,
         'remove_stop_words': remove_stop_words,
         'ngram_size': ngram_size,
     }
     return preprocessors.BasicPreprocessor(**forwarded)
Exemple #5
0
def test_basic_padding(train_raw):
    """Check the batch shapes produced by ``callbacks.BasicPadding``.

    Two configurations are exercised on the same dataset: fixed-length
    'pre' padding, and 'post' padding with no fixed lengths, where each
    batch is expected to be as wide as its longest sequence.
    """
    preprocessor = preprocessors.BasicPreprocessor()
    data_preprocessed = preprocessor.fit_transform(train_raw, verbose=0)
    dataset = Dataset(data_preprocessed, mode='point')

    # Fixed lengths with 'pre' padding: every batch must be exactly 5 wide.
    pre_fixed_padding = callbacks.BasicPadding(fixed_length_left=5,
                                               fixed_length_right=5,
                                               pad_mode='pre')
    dataloader = DataLoader(dataset, batch_size=5, callback=pre_fixed_padding)
    for batch in dataloader:
        assert batch[0]['text_left'].shape == (5, 5)
        assert batch[0]['text_right'].shape == (5, 5)

    # 'post' padding without fixed lengths: batch width is compared with the
    # longest length reported in that batch.
    post_padding = callbacks.BasicPadding(pad_mode='post')
    dataloader = DataLoader(dataset, batch_size=5, callback=post_padding)
    for batch in dataloader:
        # Detach and move to CPU before converting: Tensor.numpy() raises for
        # tensors that live on another device or that require grad.
        max_left_len = max(batch[0]['length_left'].detach().cpu().numpy())
        max_right_len = max(batch[0]['length_right'].detach().cpu().numpy())
        assert batch[0]['text_left'].shape == (5, max_left_len)
        assert batch[0]['text_right'].shape == (5, max_right_len)