def test_processFileUnicode(self):
    """Check processFile on a record whose status holds accented text.

    One record is pushed through a Processor configured with the
    "removeUrl" / "removeUserMention" sentence steps and six token steps.
    Afterwards the record is expected to carry a "status_clean" entry
    alongside its original, unchanged fields.
    """
    record = {
        "date": u"Sun Aug 07 01:28:32 IST 2011",
        "id": u"100000335933878272",
        "user_id": u"71610408",
        "status": u"Según @NWS_PTWC, no hay riesgo generalizado de #tsunami tras el #sismo de Japón http://t.co/icErcNfSCf",
    }
    # Golden record = input fields plus the cleaned text. It is copied
    # before processing so it does not share state with the input record.
    expected = dict(
        record,
        status_clean=u"Según hay riesgo generalizado tsunami tras sismo Japón",
    )

    # The stopword list is installed on the `functions` module before the
    # Processor is built, keeping the original statement order.
    functions.stopwords = load_stopwords("etc/stopwords_en.txt")
    processor = Processor(
        "status",
        "status_clean",
        {"removeUrl", "removeUserMention"},
        {
            "stemming",
            "toLowerCase",
            "removePunctuationAndNumbers",
            "stopwording",
            "removeSingleChar",
            "removeDoubleChar",
        },
    )
    processor.processFile([record])

    # The comparison is made on the input record itself; the value
    # returned by processFile is not inspected by this test.
    self.assertEqual(record, expected)
def test_processFile(self):
    """Check processFile on a single plain-text status record.

    One record is pushed through a Processor configured with the
    "removeUrl" / "removeUserMention" sentence steps and six token steps.
    Afterwards the record is expected to carry a "status_clean" entry
    alongside its original, unchanged fields.
    """
    record = {
        "date": "Sun Aug 07 01:28:32 IST 2011",
        "id": "100000335933878272",
        "user_id": "71610408",
        "status": "@baloji you were so awesome, it was amazing and you were shining like the star that you are...MERCI!! #baloji i_i",
    }
    # Golden record = input fields plus the cleaned text. It is copied
    # before processing so it does not share state with the input record.
    expected = dict(record, status_clean="awesome amaze shin star merci baloji")

    # The stopword list is installed on the `functions` module before the
    # Processor is built, keeping the original statement order.
    functions.stopwords = load_stopwords("etc/stopwords_en.txt")
    processor = Processor(
        "status",
        "status_clean",
        {"removeUrl", "removeUserMention"},
        {
            "stemming",
            "toLowerCase",
            "removePunctuationAndNumbers",
            "stopwording",
            "removeSingleChar",
            "removeDoubleChar",
        },
    )
    processor.processFile([record])

    # The comparison is made on the input record itself; the value
    # returned by processFile is not inspected by this test.
    self.assertEqual(record, expected)