예제 #1
0
 def test_processFileUnicode(self):
     """Check that ``processFile`` cleans a record whose text is non-ASCII.

     The processor is configured to read the ``status`` field and write the
     result to ``status_clean``. The assertion compares the *input* record
     with the golden record, so the test can only pass if ``processFile``
     adds the new field to the records it is handed in place.
     """
     rawObject = {
         "date": u"Sun Aug 07 01:28:32 IST 2011",
         "id": u"100000335933878272",
         "user_id": u"71610408",
         "status": u"Según @NWS_PTWC, no hay riesgo generalizado de #tsunami tras el #sismo de Japón http://t.co/icErcNfSCf",
     }
     # Golden record: an independent copy of the input plus the expected
     # cleaned text. It is built before processing, so it never aliases
     # rawObject and cannot be changed by the call under test.
     goldenRawObject = dict(
         rawObject,
         status_clean=u"Según hay riesgo generalizado tsunami tras sismo Japón",
     )
     sentence_proc_list = {"removeUrl", "removeUserMention"}
     token_proc_list = {
         "stemming",
         "toLowerCase",
         "removePunctuationAndNumbers",
         "stopwording",
         "removeSingleChar",
         "removeDoubleChar",
     }
     # This rebinds an attribute on the ``functions`` module and does not
     # restore it afterwards.
     # NOTE(review): the English stopword file is loaded although the sample
     # text is Spanish -- confirm this is intended.
     functions.stopwords = load_stopwords("etc/stopwords_en.txt")
     proc = Processor("status", "status_clean", sentence_proc_list, token_proc_list)
     # The return value is intentionally not captured: the assertion below
     # inspects the input record itself.
     proc.processFile([rawObject])
     self.assertEqual(rawObject, goldenRawObject)
예제 #2
0
 def test_processFile(self):
     """Check that ``processFile`` cleans a plain English record.

     The processor is configured to read the ``status`` field and write the
     result to ``status_clean``. The assertion compares the *input* record
     with the golden record, so the test can only pass if ``processFile``
     adds the new field to the records it is handed in place.
     """
     rawObject = {
         "date": "Sun Aug 07 01:28:32 IST 2011",
         "id": "100000335933878272",
         "user_id": "71610408",
         "status": "@baloji you were so awesome, it was amazing and you were shining like the star that you are...MERCI!! #baloji i_i",
     }
     # Golden record: an independent copy of the input plus the expected
     # cleaned text. It is built before processing, so it never aliases
     # rawObject and cannot be changed by the call under test.
     goldenRawObject = dict(
         rawObject,
         status_clean="awesome amaze shin star merci baloji",
     )
     sentence_proc_list = {"removeUrl", "removeUserMention"}
     token_proc_list = {
         "stemming",
         "toLowerCase",
         "removePunctuationAndNumbers",
         "stopwording",
         "removeSingleChar",
         "removeDoubleChar",
     }
     # This rebinds an attribute on the ``functions`` module and does not
     # restore it afterwards.
     functions.stopwords = load_stopwords("etc/stopwords_en.txt")
     proc = Processor("status", "status_clean", sentence_proc_list, token_proc_list)
     # The return value is intentionally not captured: the assertion below
     # inspects the input record itself.
     proc.processFile([rawObject])
     self.assertEqual(rawObject, goldenRawObject)