def get_default_preprocessor(
    cls,
    truncated_mode: str = 'pre',
    truncated_length_left: typing.Optional[int] = None,
    truncated_length_right: typing.Optional[int] = None,
    filter_mode: str = 'df',
    filter_low_freq: float = 1,
    filter_high_freq: float = float('inf'),
    remove_stop_words: bool = False,
    ngram_size: typing.Optional[int] = 3,
) -> BasePreprocessor:
    """
    Build the default preprocessor for this model.

    The returned preprocessor's transform is expected to produce a
    correctly shaped data pack that can be used for training.

    Every argument is forwarded unchanged to
    :class:`preprocessors.BasicPreprocessor` under the keyword of the
    same name.

    :param truncated_mode: Forwarded as `truncated_mode`.
    :param truncated_length_left: Forwarded as `truncated_length_left`;
        defaults to `None`.
    :param truncated_length_right: Forwarded as `truncated_length_right`;
        defaults to `None`.
    :param filter_mode: Forwarded as `filter_mode`.
    :param filter_low_freq: Forwarded as `filter_low_freq`.
    :param filter_high_freq: Forwarded as `filter_high_freq`.
    :param remove_stop_words: Forwarded as `remove_stop_words`.
    :param ngram_size: Forwarded as `ngram_size`.
    :return: Default preprocessor.
    """
    # Collect the settings first so the constructor call stays compact.
    options = dict(
        truncated_mode=truncated_mode,
        truncated_length_left=truncated_length_left,
        truncated_length_right=truncated_length_right,
        filter_mode=filter_mode,
        filter_low_freq=filter_low_freq,
        filter_high_freq=filter_high_freq,
        remove_stop_words=remove_stop_words,
        ngram_size=ngram_size,
    )
    return preprocessors.BasicPreprocessor(**options)
def test_drmm_padding(train_raw):
    """
    Check the batch shapes produced by `callbacks.DRMMPadding`.

    Two configurations are exercised on the same histogram-augmented
    dataset: fixed-length 'pre' padding and dynamic 'post' padding.

    :param train_raw: Raw training data passed to the preprocessor's
        `fit_transform`.
    """
    basic = preprocessors.BasicPreprocessor()
    processed_pack = basic.fit_transform(train_raw, verbose=0)

    # Build an embedding matrix from the fitted vocabulary's term index.
    glove = load_from_file(embeddings.EMBED_10_GLOVE, mode='glove')
    vocab_index = basic.context['vocab_unit'].state['term_index']
    matrix = glove.build_matrix(vocab_index)

    histogram_callback = callbacks.Histogram(
        embedding_matrix=matrix,
        bin_size=30,
        hist_mode='LCH',
    )
    dataset = Dataset(
        processed_pack,
        mode='point',
        callbacks=[histogram_callback],
    )

    # Fixed lengths: both texts are expected at length 5, and the
    # histogram's last axis matches bin_size (30).
    fixed_pre = callbacks.DRMMPadding(
        fixed_length_left=5,
        fixed_length_right=5,
        pad_mode='pre',
    )
    for batch in DataLoader(dataset, batch_size=5, callback=fixed_pre):
        inputs = batch[0]
        assert inputs['text_left'].shape == (5, 5)
        assert inputs['text_right'].shape == (5, 5)
        assert inputs['match_histogram'].shape == (5, 5, 30)

    # No fixed lengths: each batch is expected to be padded to the longest
    # left/right length reported in that batch.
    dynamic_post = callbacks.DRMMPadding(pad_mode='post')
    for batch in DataLoader(dataset, batch_size=5, callback=dynamic_post):
        inputs = batch[0]
        longest_left = max(inputs['length_left'].detach().cpu().numpy())
        longest_right = max(inputs['length_right'].detach().cpu().numpy())
        assert inputs['text_left'].shape == (5, longest_left)
        assert inputs['text_right'].shape == (5, longest_right)
        assert inputs['match_histogram'].shape == (5, longest_left, 30)
def get_default_preprocessor(cls) -> BasePreprocessor:
    """
    Build the default preprocessor for this model.

    The returned preprocessor's transform is expected to produce a
    correctly shaped data pack that can be used for training.

    :return: A `preprocessors.BasicPreprocessor` constructed with no
        arguments.
    """
    default_preprocessor = preprocessors.BasicPreprocessor()
    return default_preprocessor
def get_default_preprocessor(
    cls,
    truncated_mode: str = 'pre',
    truncated_length_left: int = 10,
    truncated_length_right: int = 40,
    filter_mode: str = 'df',
    filter_low_freq: float = 1,
    filter_high_freq: float = float('inf'),
    remove_stop_words: bool = False,
    ngram_size: int = 3
):
    """
    Build the default preprocessor for this model.

    Every argument is passed straight through to
    :class:`preprocessors.BasicPreprocessor` under the keyword of the
    same name.

    :param truncated_mode: Forwarded as `truncated_mode`.
    :param truncated_length_left: Forwarded as `truncated_length_left`;
        defaults to 10.
    :param truncated_length_right: Forwarded as `truncated_length_right`;
        defaults to 40.
    :param filter_mode: Forwarded as `filter_mode`.
    :param filter_low_freq: Forwarded as `filter_low_freq`.
    :param filter_high_freq: Forwarded as `filter_high_freq`.
    :param remove_stop_words: Forwarded as `remove_stop_words`.
    :param ngram_size: Forwarded as `ngram_size`.
    :return: Default preprocessor.
    """
    default_preprocessor = preprocessors.BasicPreprocessor(
        ngram_size=ngram_size,
        remove_stop_words=remove_stop_words,
        filter_high_freq=filter_high_freq,
        filter_low_freq=filter_low_freq,
        filter_mode=filter_mode,
        truncated_length_right=truncated_length_right,
        truncated_length_left=truncated_length_left,
        truncated_mode=truncated_mode,
    )
    return default_preprocessor
def test_basic_padding(train_raw):
    """
    Check the batch shapes produced by `callbacks.BasicPadding`.

    Two configurations are exercised on the same dataset: fixed-length
    'pre' padding and dynamic 'post' padding.

    :param train_raw: Raw training data passed to the preprocessor's
        `fit_transform`.
    """
    preprocessor = preprocessors.BasicPreprocessor()
    data_preprocessed = preprocessor.fit_transform(train_raw, verbose=0)
    dataset = Dataset(data_preprocessed, mode='point')

    # Fixed lengths: both texts are expected at length 5 in every batch.
    pre_fixed_padding = callbacks.BasicPadding(
        fixed_length_left=5,
        fixed_length_right=5,
        pad_mode='pre',
    )
    dataloader = DataLoader(dataset, batch_size=5, callback=pre_fixed_padding)
    for batch in dataloader:
        assert batch[0]['text_left'].shape == (5, 5)
        assert batch[0]['text_right'].shape == (5, 5)

    # No fixed lengths: each batch is expected to be padded to the longest
    # left/right length reported in that batch.
    post_padding = callbacks.BasicPadding(pad_mode='post')
    dataloader = DataLoader(dataset, batch_size=5, callback=post_padding)
    for batch in dataloader:
        # detach().cpu() before numpy() so the conversion does not raise
        # if the tensors live on another device or track gradients.
        max_left_len = max(batch[0]['length_left'].detach().cpu().numpy())
        max_right_len = max(batch[0]['length_right'].detach().cpu().numpy())
        assert batch[0]['text_left'].shape == (5, max_left_len)
        assert batch[0]['text_right'].shape == (5, max_right_len)