def test_tokenCleaner(self):
    sentence = "Hello I'm very happy 1313"
    goldenSentence = "hello"
    tokens = tokenize(sentence)
    functions.stopwords = load_stopwords("etc/stopwords_en.txt")
    newTokens = tokenCleaner(
        tokens,
        ["stemming", "toLowerCase", "removePunctuationAndNumbers", "stopwording"],
    )
    self.assertEqual(sentenize(newTokens), goldenSentence)
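
# A minimal sketch of what tokenCleaner() might look like internally; the
# dispatch-by-name pipeline, the helper name, and the stepFunctions mapping
# are assumptions for illustration, not the module's actual code.
def _tokenCleaner_sketch(tokens, steps, stepFunctions):
    # Apply each named cleaning step to every token, in order; steps that
    # drop a token are assumed to leave an empty text that sentenize skips.
    for step in steps:
        tokens = [stepFunctions[step](token) for token in tokens]
    return tokens
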
def test_removeSingleChar(self):
    sentence = "at 8 o'clock on (Thursday) morning Arthur didn't feel very good."
    goldenSentence = "at o'clock on Thursday morning Arthur did n't feel very good"
    tokens = tokenize(sentence)
    newTokens = [removeSingleChar(token) for token in tokens]
    self.assertEqual(sentenize(newTokens), goldenSentence)
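
# A sketch of the removeSingleChar contract implied by the golden sentence
# above: one-character tokens such as "8", "(", ")" and the trailing "." are
# dropped. Operating on the token's text alone is an assumption; the helper
# name is hypothetical.
def _removeSingleChar_sketch(text):
    # Keep the text only when it is longer than one character.
    return text if len(text) > 1 else ""
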
def test_removePunctuationAndNumbers(self):
    sentence = "at 8 o'clock on (thursday) morning Arthur didn't feel very good."
    goldenSentence = "at oclock on thursday morning Arthur did nt feel very good"
    tokens = tokenize(sentence)
    newTokens = [removePunctuationAndNumbers(token) for token in tokens]
    self.assertEqual(sentenize(newTokens), goldenSentence)
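
import string

# A sketch of removePunctuationAndNumbers as implied by the golden sentence:
# punctuation and digits are stripped inside tokens ("(thursday)" -> "thursday",
# "n't" -> "nt", "8" -> "") rather than whole tokens being dropped. This is an
# assumption about the implementation; the helper name is hypothetical.
def _removePunctuationAndNumbers_sketch(text):
    return "".join(ch for ch in text
                   if ch not in string.punctuation and not ch.isdigit())
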
def test_toLowerCase(self):
    sentence = "bOys WOMEN Homes MeN cARs aRchIvES"
    goldenSentence = "boys women homes men cars archives"
    tokens = tokenize(sentence)
    newTokens = [toLowerCase(token) for token in tokens]
    self.assertEqual(sentenize(newTokens), goldenSentence)
def test_stemming(self):
    sentence = "boys and women want to have homes, not cars going and archives"
    goldenSentence = "boy and woman want to have home , not car go and archive"
    tokens = tokenize(sentence)
    newTokens = [stemming(token) for token in tokens]
    self.assertEqual(sentenize(newTokens), goldenSentence)
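
# Note that the golden sentence above ("women" -> "woman", "archives" ->
# "archive", "going" -> "go") matches WordNet lemmatization rather than
# Porter-style stemming (which would yield e.g. "archiv"); a minimal sketch
# under that assumption, with a hypothetical helper name:
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()

def _stemming_sketch(text):
    # Try the verb reading first so "going" lemmatizes to "go", then fall
    # back to the noun reading so "women" becomes "woman".
    asVerb = _lemmatizer.lemmatize(text, pos="v")
    return asVerb if asVerb != text else _lemmatizer.lemmatize(text, pos="n")
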
def test_stopwording(self):
    sentence = "at eight not on thursday morning Arthur didn't feel very good"
    goldenSentence = "eight thursday morning Arthur n't feel good"
    # stopwording() reads the module-level stopword list, so set it the same
    # way test_tokenCleaner does instead of leaving it in an unused local.
    functions.stopwords = load_stopwords('etc/stopwords_en.txt')
    tokens = tokenize(sentence)
    newTokens = [stopwording(token) for token in tokens]
    self.assertEqual(sentenize(newTokens), goldenSentence)
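
# A minimal sketch of stopwording(), assuming it consults the module-level
# stopword list set above and empties matching tokens so that sentenize
# skips them; the signature and lowercase comparison are assumptions.
def _stopwording_sketch(text, stopwords):
    return "" if text.lower() in stopwords else text
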
def test_sentenize(self):
    sentence = "Hello I'm very happy 1313"
    goldenSentence = "Hello I 'm very happy 1313"
    tokens = tokenize(sentence)
    self.assertEqual(sentenize(tokens), goldenSentence)
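
# A minimal sketch of sentenize(), assuming each token's surface text sits
# at index 0 (as test_tokenize below suggests) and that tokens emptied by
# earlier cleaning steps are skipped when the sentence is rejoined with
# single spaces; the helper name is hypothetical.
def _sentenize_sketch(tokens):
    return " ".join(token[0] for token in tokens if token[0])
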
def test_tokenize(self):
    sentence = "Hello didn't very happy 1313"
    goldenTokens = ["Hello", "did", "n't", "very", "happy", "1313"]
    tokens = tokenize(sentence)
    # Check the count first so extra or missing tokens fail the assertion
    # cleanly instead of raising an IndexError.
    self.assertEqual(len(tokens), len(goldenTokens))
    for token, golden in zip(tokens, goldenTokens):
        self.assertEqual(token[0], golden)
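
# A sketch of tokenize() reproducing the behaviour these tests expect
# ("didn't" split into "did" and "n't", surface text at token[0]); using
# NLTK's word_tokenize and pos_tag here is an assumption about the
# implementation, and the helper name is hypothetical.
from nltk import pos_tag, word_tokenize

def _tokenize_sketch(sentence):
    # pos_tag yields (text, tag) pairs, so token[0] is the surface form.
    return pos_tag(word_tokenize(sentence))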