Example #1
0
 def test_tokenCleaner(self):
     """Run tokenCleaner with four cleaning steps and compare the rebuilt sentence.

     The stopword list is loaded into ``functions.stopwords`` before the
     cleaner is invoked.
     """
     expected = "hello"
     raw_tokens = tokenize("Hello I'm very happy 1313")
     # The stopword list is installed on the functions module before cleaning.
     functions.stopwords = load_stopwords("etc/stopwords_en.txt")
     steps = ["stemming", "toLowerCase", "removePunctuationAndNumbers", "stopwording"]
     cleaned = tokenCleaner(raw_tokens, steps)
     self.assertEqual(sentenize(cleaned), expected)
Example #2
0
 def test_removeSingleChar(self):
   """Apply removeSingleChar to every token of a sample sentence.

   The processed tokens are rebuilt with sentenize and compared with the
   expected string.
   """
   source_text = "at 8 o'clock on (Thursday) morning Arthur didn't feel very good."
   expected = "at o'clock on Thursday morning Arthur did n't feel very good"
   processed = [removeSingleChar(tok) for tok in tokenize(source_text)]
   self.assertEqual(sentenize(processed), expected)
Example #3
0
 def test_removePunctuationAndNumbers(self):
   """Apply removePunctuationAndNumbers to every token of a sample sentence.

   The processed tokens are rebuilt with sentenize and compared with the
   expected string.
   """
   source_text = "at 8 o'clock on (thursday) morning Arthur didn't feel very good."
   expected = "at oclock on thursday morning Arthur did nt feel very good"
   # Map the function under test over the tokens, one token per call.
   processed = list(map(removePunctuationAndNumbers, tokenize(source_text)))
   self.assertEqual(sentenize(processed), expected)
Example #4
0
 def test_toLowerCase(self):
   """Apply toLowerCase to every token of a mixed-case sample sentence.

   The processed tokens are rebuilt with sentenize and compared with the
   expected all-lowercase string.
   """
   mixed_case = "bOys WOMEN Homes MeN cARs aRchIvES"
   expected = "boys women homes men cars archives"
   lowered = [toLowerCase(word) for word in tokenize(mixed_case)]
   self.assertEqual(sentenize(lowered), expected)
Example #5
0
 def test_stemming(self):
   """Apply stemming to every token of a sample sentence.

   The processed tokens are rebuilt with sentenize and compared with the
   expected string.
   """
   source_text = "boys and women want to have homes, not cars going and archives"
   expected = "boy and woman want to have home , not car go and archive"
   stems = [stemming(word) for word in tokenize(source_text)]
   self.assertEqual(sentenize(stems), expected)
Example #6
0
 def test_stopwording(self):
   """Apply stopwording to every token of a sample sentence.

   The processed tokens are rebuilt with sentenize and compared with the
   expected string.
   """
   sentence = "at eight not on thursday morning Arthur didn't feel very good"
   goldenSentence = "eight thursday morning Arthur n't feel good"
   # The loaded list used to be bound to a local that was never handed to
   # stopwording(), so the test relied on state left behind by another test.
   # Install it on the functions module, as test_tokenCleaner does.
   # NOTE(review): presumably stopwording() reads functions.stopwords -- confirm
   # against its implementation.
   functions.stopwords = load_stopwords('etc/stopwords_en.txt')
   newTokens = [stopwording(token) for token in tokenize(sentence)]
   self.assertEqual(sentenize(newTokens), goldenSentence)
Example #7
0
 def test_sentenize(self):
     """Tokenize a sample sentence and compare what sentenize rebuilds from it."""
     source_text = "Hello I'm very happy 1313"
     expected = "Hello I 'm very happy 1313"
     rebuilt = sentenize(tokenize(source_text))
     self.assertEqual(rebuilt, expected)