def test_cleanSentence(self):
    """Run cleanSentence on an English sentence and compare to the gold output.

    The sentence-level steps and the token-level steps are passed to
    ``cleanSentence`` in the order listed below; the result must equal
    the expected space-separated token string.
    """
    # Token-level steps, passed to cleanSentence in this order.
    token_steps = [
        "stemming",
        "toLowerCase",
        "removePunctuationAndNumbers",
        "stopwording",
        "removeSingleChar",
        "removeDoubleChar",
    ]
    # Sentence-level steps, passed to cleanSentence in this order.
    sentence_steps = ["removeUrl", "removeUserMention"]

    # The stopword list is installed on the `functions` module before
    # cleaning, since the "stopwording" step is requested above.
    functions.stopwords = load_stopwords("etc/stopwords_en.txt")

    raw_text = "At 8 o'clock on Thursday morning, the boys and girls didn't feel very good."
    expected = "oclock thursday morning boy girl feel good"

    cleaned = cleanSentence(raw_text, sentence_steps, token_steps)

    self.assertEqual(cleaned, expected)
def test_cleanSentenceUnicode(self):
    """Run cleanSentence on a non-ASCII sentence and compare to the gold output.

    The input contains accented characters, a user mention, hashtags and
    a URL; the expected result keeps the accented words intact.
    """
    # Token-level steps, passed to cleanSentence in this order.
    token_steps = [
        "stemming",
        "toLowerCase",
        "removePunctuationAndNumbers",
        "stopwording",
        "removeSingleChar",
        "removeDoubleChar",
    ]
    # Sentence-level steps, passed to cleanSentence in this order.
    sentence_steps = ["removeUrl", "removeUserMention"]

    # NOTE(review): the file name suggests an English stopword list even
    # though the input text is Spanish - confirm this is intentional.
    functions.stopwords = load_stopwords("etc/stopwords_en.txt")

    raw_text = u"Según @NWS_PTWC, no hay riesgo generalizado de #tsunami tras el #sismo de Japón http://t.co/icErcNfSCf"
    expected = u"según hay riesgo generalizado tsunami tras sismo japón"

    cleaned = cleanSentence(raw_text, sentence_steps, token_steps)

    self.assertEqual(cleaned, expected)