def test_sanity(self):
    # Table of (positional args, keyword args, expected result) for
    # preprocess.separate; cases run in order and stop at the first failure.
    cases = (
        (('how are you?',), {'sep': '?'}, 'how are you ?'),
        (('how are you,man?',), {'sep': ('?', ','), 'between_char': True}, 'how are you , man ?'),
        # No `sep` given: exercises the function's default separators.
        (('how are! you?',), {}, 'how are ! you ?'),
    )
    for args, kwargs, expected in cases:
        self.assertEqual(preprocess.separate(*args, **kwargs), expected)
def separate(text: AnyStr, sep: Union[str, Sequence[str]] = ('!', '?', '.'), between_char=False) -> str:
    """Deprecated wrapper that forwards to ``_preprocess.separate``.

    Emits a ``DeprecationWarning`` and then returns whatever
    ``_preprocess.separate`` returns for the same arguments.

    :param text: text to process; forwarded unchanged.
    :param sep: a separator or a sequence of separators; forwarded unchanged.
    :param between_char: forwarded unchanged.
    :return: the result of ``_preprocess.separate``.
    """
    # stacklevel=2 attributes the warning to the caller's line rather than this
    # wrapper. The message now states the replacement explicitly; previously it
    # ended in a dangling "preprocess.separate" with no verb.
    warnings.warn(
        "This function will be deprecated in future versions. "
        "Use preprocess.separate instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return _preprocess.separate(text=text, sep=sep, between_char=between_char)
def test_sanity(self):
    # --- preprocess.separate -------------------------------------------------
    # (positional args, keyword args, expected result); checked in order.
    separate_cases = (
        (('how are you?',), {'sep': '?'}, 'how are you ?'),
        (('how are you,man?',), {'sep': ('?', ','), 'between_char': True}, 'how are you , man ?'),
        (('how are! you?',), {}, 'how are ! you ?'),
    )
    for args, kwargs, expected in separate_cases:
        self.assertEqual(preprocess.separate(*args, **kwargs), expected)

    # --- Freq.sample ---------------------------------------------------------
    counter = Freq([1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 'hi', 'o', 'a'])

    seen_once = {4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 'hi': 1, 'o': 1, 'a': 1}
    self.assertEqual(counter.sample(max_freq=1), seen_once)

    seen_twice = {1: 2, 3: 2, 2: 2}
    self.assertEqual(counter.sample(freq=2), seen_twice)

    # Passing `freq` together with a min/max bound is expected to trip an assertion.
    for extra_bound in ({'max_freq': 2}, {'min_freq': 2}):
        with self.assertRaises(AssertionError):
            counter.sample(freq=1, **extra_bound)

    # --- Freq.least_freq -----------------------------------------------------
    counter = Freq([1, 2, 3, 3, 4, 5, 6, 7, 6, 7, 12, 31, 123, 5, 3])
    expected_least = {
        123: 1, 31: 1, 12: 1, 4: 1, 2: 1, 1: 1, 7: 2, 6: 2, 5: 2, 3: 3
    }
    self.assertEqual(counter.least_freq(), expected_least)
def _preprocess(self, sentence, is_destructive: bool):
    """Run ``sentence`` through the normalisation steps and return the result.

    Steps that are gated by a ``self.config`` flag are also forced on when
    ``is_destructive`` is true. Extra-character removal, non-language-element
    removal and separation always run.

    NOTE(review): the bare name ``_preprocess`` used below resolves to the
    module-level name, not to this method -- presumably a helper module
    imported under that name; confirm against the file's imports.

    :param sentence: text to normalise; must support ``.lower()`` and ``.split()``.
    :param is_destructive: when true, apply every optional step regardless of config.
    :return: the output of the final ``remove_extra_chars`` pass.
    """
    text = sentence

    if is_destructive or self.config.to_lower:
        text = text.lower()

    # Unconditional clean-up.
    text = _preprocess.remove_extra_chars(text)
    text = _preprocess.remove_non_language_elements(text)

    # Contraction replacement only applies to the 'en' configuration.
    if self.config.name == 'en':
        text = _preprocess.replace_english_contractions(text)

    if is_destructive or self.config.is_remove_accent:
        text = _preprocess.accent_remove(text)

    text = _preprocess.separate(text)

    if is_destructive or self.config.is_remove_punctuation:
        text = _preprocess.remove_punctuation(text, self.config.punctuation)

    if is_destructive or self.config.is_remove_stop_words:
        stop_words = self.config.stop_words
        kept_tokens = [token for token in text.split() if token not in stop_words]
        text = ' '.join(kept_tokens)

    # Final pass after the steps above.
    return _preprocess.remove_extra_chars(text)