Example no. 1
0
 def test_sanskrit_nltk_tokenize_words(self):
     """Check that the Sanskrit danda is split off as a separate token."""
     expected = ['कृपया', '।']
     result = nltk_tokenize_words("कृपया।",
                                  attached_period=False,
                                  language='sanskrit')
     self.assertEqual(result, expected)
Example no. 2
0
 def test_sanskrit_nltk_tokenize_words_attached(self):
     """Check that attached_period=True leaves the danda joined to the word."""
     result = nltk_tokenize_words(
         "कृपया।", attached_period=True, language='sanskrit')
     self.assertEqual(result, ['कृपया।'])
Example no. 3
0
    def tokenize(self, mode='word'):
        """Split the passage into word or sentence tokens.

        By default each word becomes one string token. Passing
        mode='sentence' instead returns one string per sentence.

        Args:
            mode (:obj:`str`) Tokenization unit, either 'word' or 'sentence'

        Returns:
            :obj:`list` of :obj:`str` Tokenized words (or sentences)

        Example:
            >>> LatinText('Gallia est omnis divisa in partes tres').tokenize()
            ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres']

        """
        # Imported lazily so the heavy tokenizer machinery is only loaded
        # when tokenization is actually requested.
        from cltk.tokenize.sentence import TokenizeSentence
        from cltk.tokenize.word import nltk_tokenize_words
        if mode != 'sentence':
            return nltk_tokenize_words(self.data)
        sentence_tokenizer = TokenizeSentence(self.options['language'])
        return sentence_tokenizer.tokenize_sentences(self.data)
Example no. 4
0
 def test_nltk_tokenize_words_assert(self):
     """Non-string input must trigger the tokenizer's type assertion."""
     bad_input = ['Sentence', '1.']
     self.assertRaises(AssertionError, nltk_tokenize_words, bad_input)
Example no. 5
0
 def test_nltk_tokenize_words_attached(self):
     """attached_period=True must keep periods glued to their tokens."""
     expected = ['Sentence', '1.', 'Sentence', '2.']
     result = nltk_tokenize_words("Sentence 1. Sentence 2.",
                                  attached_period=True)
     self.assertEqual(result, expected)
Example no. 6
0
 def test_nltk_tokenize_words(self):
     """Default tokenization must split periods into their own tokens."""
     result = nltk_tokenize_words("Sentence 1. Sentence 2.",
                                  attached_period=False)
     self.assertEqual(result, ['Sentence', '1', '.', 'Sentence', '2', '.'])
Example no. 7
0
 def tokenize(self, mode='word'):
     """Return word tokens by default, or sentence tokens for mode='sentence'."""
     if mode != 'sentence':
         return nltk_tokenize_words(self.data)
     return TokenizeSentence(self.language).tokenize_sentences(self.data)
Example no. 8
0
 def test_nltk_tokenize_words(self):
     """Periods become standalone tokens when attached_period is False."""
     observed = nltk_tokenize_words("Sentence 1. Sentence 2.",
                                    attached_period=False)
     expected = ['Sentence', '1', '.', 'Sentence', '2', '.']
     self.assertEqual(observed, expected)
Example no. 9
0
 def test_nltk_tokenize_words_assert(self):
     """List input is rejected: the tokenizer asserts its argument is a str."""
     self.assertRaises(AssertionError, nltk_tokenize_words,
                       ['Sentence', '1.'])
Example no. 10
0
 def test_sanskrit_nltk_tokenize_words_attached(self):
     """With attached_period=True the danda stays joined to the word."""
     observed = nltk_tokenize_words("कृपया।",
                                    attached_period=True,
                                    language='sanskrit')
     self.assertEqual(observed, ['कृपया।'])
Example no. 11
0
 def test_sanskrit_nltk_tokenize_words(self):
     """The Sanskrit danda is split into its own token by default."""
     observed = nltk_tokenize_words("कृपया।",
                                    attached_period=False,
                                    language='sanskrit')
     self.assertEqual(observed, ['कृपया', '।'])
Example no. 12
0
 def test_nltk_tokenize_words_attached(self):
     """Trailing periods stay attached to words when attached_period=True."""
     expected = ['Sentence', '1.', 'Sentence', '2.']
     observed = nltk_tokenize_words("Sentence 1. Sentence 2.",
                                    attached_period=True)
     self.assertEqual(observed, expected)