def test_sanskrit_nltk_tokenize_words(self):
    """Sanskrit tokenization via the PunktLanguageVars wrapper, danda split off."""
    expected = ['कृपया', '।']
    result = nltk_tokenize_words("कृपया।", attached_period=False, language='sanskrit')
    self.assertEqual(result, expected)
def test_sanskrit_nltk_tokenize_words_attached(self):
    """Sanskrit tokenization with attached_period=True: danda stays attached."""
    expected = ['कृपया।']
    result = nltk_tokenize_words("कृपया।", attached_period=True, language='sanskrit')
    self.assertEqual(result, expected)
def tokenize(self, mode='word'):
    """Tokenize the passage into a list of words or sentences.

    Splits the stored text into individual word tokens by default; when
    *mode* is ``'sentence'``, a list of sentence strings is returned
    instead.

    Args:
        mode (:obj:`str`) Mode of tokenization, either 'word' or 'sentence'

    Returns:
        :obj:`list` of :obj:`str` Tokenized words (or sentences)

    Example:
        >>> LatinText('Gallia est omnis divisa in partes tres').tokenize()
        ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres']

    """
    # Imports are deferred to call time, matching the module's style and
    # avoiding a hard dependency at class-definition time.
    from cltk.tokenize.sentence import TokenizeSentence
    from cltk.tokenize.word import nltk_tokenize_words
    if mode != 'sentence':
        return nltk_tokenize_words(self.data)
    sentence_tokenizer = TokenizeSentence(self.options['language'])
    return sentence_tokenizer.tokenize_sentences(self.data)
def test_nltk_tokenize_words_assert(self):
    """Passing a list instead of a string must trigger an AssertionError."""
    with self.assertRaises(AssertionError):
        nltk_tokenize_words(['Sentence', '1.'])
def test_nltk_tokenize_words_attached(self):
    """With attached_period=True the trailing periods remain on the tokens."""
    expected = ['Sentence', '1.', 'Sentence', '2.']
    result = nltk_tokenize_words("Sentence 1. Sentence 2.", attached_period=True)
    self.assertEqual(result, expected)
def test_nltk_tokenize_words(self):
    """Default tokenization splits trailing periods into their own tokens."""
    expected = ['Sentence', '1', '.', 'Sentence', '2', '.']
    result = nltk_tokenize_words("Sentence 1. Sentence 2.", attached_period=False)
    self.assertEqual(result, expected)
def tokenize(self, mode='word'):
    """Tokenize self.data into words (default) or, with mode='sentence', sentences."""
    if mode != 'sentence':
        return nltk_tokenize_words(self.data)
    # Sentence mode delegates to the language-aware sentence tokenizer.
    return TokenizeSentence(self.language).tokenize_sentences(self.data)