Esempio n. 1
0
    def test_add_entities_no_dups(self):
        """Adding a second DocumentEntity for an entity that is already on
        the document must leave the document's entity list unchanged."""
        document = Document()

        entity = Entity()
        entity.name = 'name'
        entity.group = 'group'

        first = DocumentEntity()
        first.entity = entity
        first.count = 2
        first.relevance = 1.0

        document.add_entity(first)
        self.assertEqual([first], list(document.entities))

        # NOTE(review): this look-alike entity is built but never attached to
        # anything; the second DocumentEntity below reuses `entity` itself.
        # Confirm whether the duplicate check was meant to be exercised with
        # an equal-but-distinct Entity instead.
        lookalike = Entity()
        lookalike.name = 'name'
        lookalike.group = 'group'

        second = DocumentEntity()
        second.entity = entity
        second.count = 3
        second.relevance = 0.5

        # the duplicate must not be appended
        document.add_entity(second)
        self.assertEqual([first], list(document.entities))
Esempio n. 2
0
    def test_isolezwe(self):
        """Extract an Isolezwe article and check every populated field.

        ``self.doc_info`` holds the article payload (title, byline, HTML
        paragraphs, images).
        NOTE(review): presumably the crawler's fetch is stubbed elsewhere in
        this test class to return ``self.doc_info`` -- confirm in setUp.
        """
        self.doc_info = {
            "id": 1667768,
            "category": "isolezwe",
            "title": "EzikaMalema zethule abazo eKZN",
            "published": "2014-03-28T09:57",
            "byline": "CELANI SIKHAKHANE noSIMPHIWE NGUBANE",
            "paragraphs": [
                "<strong>CELANI SIKHAKHANE noSIMPHIWE NGUBANE</strong> ",
                "<strong>I</strong>-Economic Freedom Fighters (EFF) KwaZulu-Natal ifolosa ngabesifazane nentsha ohlwini lwayo lwamagama okuqala ayishumi abaholi abazoyimela esiShayamthetho ngemva kokhetho lukazwelonke oluzoba ngoMeyi 7. ",
                "Lokhu kugqame izolo ngenkathi i-EEF yethula abaholi bayo abasohlwini lokhetho esithangamini nabezindaba eThekwini. ",
                "Inxusa leqembu nokubhekwe ukuthi libe ngundunankulu wase-KZN uma i-EFF inqoba, nguMnuz Vusi Khoza. UKhoza uke waba yikhansela le-ANC eThekwini nonobhala weNFP esifundazweni. ",
                "UKhoza uthe uhla lwabo lumele lonke uhlobo lwabantu abakhona  e-KZN ngokobulili, ukuxuba intsha nabadala. ",
                "UNksz Magdalene Moonsamy, obengumholi we-ANC Youth League nojutshwe ubuholi bukazwelonke esifundazweni, uthe iningi labaholi babo abazobamela eSishayamthetho banamakhono adingekayo futhi bayabethemba. ",
                "&#x201C;Abanye baholi bethu banomlando wokusebenzela abantu kusuka kwi-ANC nakwezinye izinhlaka njengoKhoza okunguye osimele njengondunankulu wesifundazwe,&#x201D; kusho uNksz Moonsamy. ",
                "Okufike kwagqama wukuthi ngesikhathi bebiza uKhoza abaholi bale nhlangano bebembiza ngondunankulu, yize kungakayiwa okhethweni. ",
                "Phakathi kwamagama asohlwini lokuqala olubizwa nge-<em>Top 10</em> uKhoza,uNksz Thembi Msane, uMnuz Lwazi Ntombela, uNksz Londiwe Mkhwanazi, uMnuz Nhlanhla Buthelezi, uNksz Sbongile Khawula, uVerusca Fynn, uMnuz Reggie Ngcobo,uNksz Cebisile Shangase, uMnuz Nkosinathi Mthethwa. ",
                "UNksz Moonsamy uthe abaholi babo bazimisele ukufunda okuningi njengoba beya eSishayamthetho. ",
                "UKhoza uthe okubalulekile kubo wukuthi banqobe zonke izihlalo ezingu-80 eziseSishayamthetho. ",
                "Ngaphandle kwamagama abaholi abayishumi abasohlwini , kuphinde kwethulwa nabanye abasohlwini lwesifundazwe abayingxenye yabantu abazoya eSishayamthetho. ",
                "Kulesi sithangami kuphinde kwethulwa uMnuz Vukani Ndlovu ngokusemthethweni obedume ebuholini be-ANC Youth League ngenkathi ehola isigungu sesikhashana sesifundazwe esihlakazwe ngonyaka odlule. ",
                "Abaholi baleli qembu bamile ekutheni bafuna ukusebenzisana nanoma yimuphi umbutho ozovumelana nabo ngemigomo ehambisana nokubuyiswa komhlaba ngaphandle kwesinxephezelo ukuthi uhlomulise bonke abantu bakuleli. ",
                "UNksz Moonsamy uthe bangasebenzisana neFreedom Front Plus inqobo nje uma izimisele ukuvumelana nabo ngodaba lomhlaba, ivume ukuthi kumele ubesezandleni zabantu, hhayi zedlanzana. ",
                "Abaholi baleli qembu abazange bacacise ukuthi banamalungu amangaki eKZN, bakhankasa nini nokuthi yiziphi izindlela abazisebenzisayo ukuxhumana nabantu njengoba bethi sebehlangane nabantu abaningi abampofu, abahola kancane, abasebenza ezindlini, abahlengikazi nabahlala emijondolo. ",
                "Bathi iKZN sebeyiphendule yabomvu njengoba bexhumana nabantu kuzona zonke izingxenye zesifundazwe. ",
                "UKhoza uthe abantu baseKZN basetshenziswa ngamaqembu ezepolitiki njengethuluzi lokuvota ukuze kucebe idlanzana eliphila ntofontofo. ",
                "Uphinde wathi igama lesifundazwe selingcolile  ezweni isidume njengendawo eyisizinda senkohlakalo  ngenxa yomuzi kaMengameli  Jacob Zuma owakhiwe ngoR246 million . ",
            ],
            "images": [
                {
                    "id": "copy-of-no-no-moonsamyy0-1.1667766/3851252182",
                    "credit": "INL SA",
                    "caption": "UNKSZ Magdalene Moonsamy, osuka kwi-ANC Youth League, ujutshwe yisigungu se-EFF kuzwelonke ukuthi eze KwaZulu-Natal",
                    "url": "http://www.iol.co.za/polopoly_fs/copy-of-no-no-moonsamyy0-1.1667766!/image/3851252182.jpg_gen/derivatives/box_300/3851252182.jpg",
                },
                {
                    "id": "copy-of-no-no-khoza0-1.1667767/4259951999",
                    "credit": "INLSA",
                    "caption": "UMNUZ Vusi Khoza, osuka kwi-ANC neNational Freedom Party, i-Economic Freedom Fighters imthwele ngeqoma ukuthi abe ngundunankulu eKZN",
                    "url": "http://www.iol.co.za/polopoly_fs/copy-of-no-no-khoza0-1.1667767!/image/4259951999.jpg_gen/derivatives/box_300/4259951999.jpg",
                },
            ],
        }

        doc = Document()
        doc.url = "http://www.iol.co.za/isolezwe/ezikamalema-zethule-abazo-ekzn-1.1667768#.UzvUR62SxWv"
        self.crawler.extract(doc, None)

        self.assertEqual(doc.title, "EzikaMalema zethule abazo eKZN")
        self.assertIsNone(doc.summary)
        self.assertEqual(doc.published_at.strftime("%d %m %Y"), "28 03 2014")
        self.assertEqual(doc.author.name, "CELANI SIKHAKHANE noSIMPHIWE NGUBANE")
        self.assertEqual(doc.medium.name, "Isolezwe")

        # Expected plain text: the paragraphs above with markup removed and
        # entities decoded, separated by blank lines. Written as adjacent
        # literals (one per paragraph) so no literal spans a physical line
        # break; the compiler concatenates them into a single string.
        expected_text = (
            u"CELANI SIKHAKHANE noSIMPHIWE NGUBANE\n\n"
            u"I-Economic Freedom Fighters (EFF) KwaZulu-Natal ifolosa ngabesifazane nentsha ohlwini lwayo lwamagama okuqala ayishumi abaholi abazoyimela esiShayamthetho ngemva kokhetho lukazwelonke oluzoba ngoMeyi 7.\n\n"
            u"Lokhu kugqame izolo ngenkathi i-EEF yethula abaholi bayo abasohlwini lokhetho esithangamini nabezindaba eThekwini.\n\n"
            u"Inxusa leqembu nokubhekwe ukuthi libe ngundunankulu wase-KZN uma i-EFF inqoba, nguMnuz Vusi Khoza. UKhoza uke waba yikhansela le-ANC eThekwini nonobhala weNFP esifundazweni.\n\n"
            u"UKhoza uthe uhla lwabo lumele lonke uhlobo lwabantu abakhona  e-KZN ngokobulili, ukuxuba intsha nabadala.\n\n"
            u"UNksz Magdalene Moonsamy, obengumholi we-ANC Youth League nojutshwe ubuholi bukazwelonke esifundazweni, uthe iningi labaholi babo abazobamela eSishayamthetho banamakhono adingekayo futhi bayabethemba.\n\n"
            u"\u201cAbanye baholi bethu banomlando wokusebenzela abantu kusuka kwi-ANC nakwezinye izinhlaka njengoKhoza okunguye osimele njengondunankulu wesifundazwe,\u201d kusho uNksz Moonsamy.\n\n"
            u"Okufike kwagqama wukuthi ngesikhathi bebiza uKhoza abaholi bale nhlangano bebembiza ngondunankulu, yize kungakayiwa okhethweni.\n\n"
            u"Phakathi kwamagama asohlwini lokuqala olubizwa nge-Top 10 uKhoza,uNksz Thembi Msane, uMnuz Lwazi Ntombela, uNksz Londiwe Mkhwanazi, uMnuz Nhlanhla Buthelezi, uNksz Sbongile Khawula, uVerusca Fynn, uMnuz Reggie Ngcobo,uNksz Cebisile Shangase, uMnuz Nkosinathi Mthethwa.\n\n"
            u"UNksz Moonsamy uthe abaholi babo bazimisele ukufunda okuningi njengoba beya eSishayamthetho.\n\n"
            u"UKhoza uthe okubalulekile kubo wukuthi banqobe zonke izihlalo ezingu-80 eziseSishayamthetho.\n\n"
            u"Ngaphandle kwamagama abaholi abayishumi abasohlwini , kuphinde kwethulwa nabanye abasohlwini lwesifundazwe abayingxenye yabantu abazoya eSishayamthetho.\n\n"
            u"Kulesi sithangami kuphinde kwethulwa uMnuz Vukani Ndlovu ngokusemthethweni obedume ebuholini be-ANC Youth League ngenkathi ehola isigungu sesikhashana sesifundazwe esihlakazwe ngonyaka odlule.\n\n"
            u"Abaholi baleli qembu bamile ekutheni bafuna ukusebenzisana nanoma yimuphi umbutho ozovumelana nabo ngemigomo ehambisana nokubuyiswa komhlaba ngaphandle kwesinxephezelo ukuthi uhlomulise bonke abantu bakuleli.\n\n"
            u"UNksz Moonsamy uthe bangasebenzisana neFreedom Front Plus inqobo nje uma izimisele ukuvumelana nabo ngodaba lomhlaba, ivume ukuthi kumele ubesezandleni zabantu, hhayi zedlanzana.\n\n"
            u"Abaholi baleli qembu abazange bacacise ukuthi banamalungu amangaki eKZN, bakhankasa nini nokuthi yiziphi izindlela abazisebenzisayo ukuxhumana nabantu njengoba bethi sebehlangane nabantu abaningi abampofu, abahola kancane, abasebenza ezindlini, abahlengikazi nabahlala emijondolo.\n\n"
            u"Bathi iKZN sebeyiphendule yabomvu njengoba bexhumana nabantu kuzona zonke izingxenye zesifundazwe.\n\n"
            u"UKhoza uthe abantu baseKZN basetshenziswa ngamaqembu ezepolitiki njengethuluzi lokuvota ukuze kucebe idlanzana eliphila ntofontofo.\n\n"
            u"Uphinde wathi igama lesifundazwe selingcolile  ezweni isidume njengendawo eyisizinda senkohlakalo  ngenxa yomuzi kaMengameli  Jacob Zuma owakhiwe ngoR246 million ."
        )
        self.assertEqual(doc.text, expected_text)
    def test_extract_sources(self):
        """extract_sources() should report the person behind an utterance
        attached to the document as one of the document's sources."""
        document = Document()
        document.text = 'Fred Astair did something. He also did something else.'

        # an utterance by the Zuma fixture entity, linked to the document
        utterance = Utterance()
        utterance.entity = self.fx.EntityData.zuma
        utterance.document = document

        self.ex.extract_sources(document)

        source_names = [source.person.name for source in document.sources]
        self.assertEqual(['Jacob Zuma'], source_names)
Esempio n. 4
0
    def test_mediums(self):
        """identify_medium() picks the medium from the document's URL.

        Both URLs share the www.iol.co.za host but are expected to map to
        different mediums, so the path has to be taken into account.
        """
        doc = Document()

        # (url, expected medium name)
        cases = [
            ('http://www.iol.co.za/isolezwe/ezikamalema-zethule-abazo-ekzn-1.1667768#.Uzk8La2SxWu',
             'Isolezwe'),
            ('http://www.iol.co.za/news/politics/nkandla-job-not-finished-madonsela-1.1669787#.UzvP7K2SxWs',
             'IOL'),
        ]
        for url, expected_name in cases:
            doc.url = url
            # assertEqual, not the deprecated assertEquals alias
            self.assertEqual(self.crawler.identify_medium(doc).name, expected_name)
Esempio n. 5
0
    def test_mediums(self):
        """identify_medium() picks the medium from the document's URL.

        Both URLs share the www.iol.co.za host but are expected to map to
        different mediums, so the path has to be taken into account.
        """
        doc = Document()

        # (url, expected medium name)
        cases = [
            ('http://www.iol.co.za/isolezwe/ezikamalema-zethule-abazo-ekzn-1.1667768#.Uzk8La2SxWu',
             'Isolezwe'),
            ('http://www.iol.co.za/news/politics/nkandla-job-not-finished-madonsela-1.1669787#.UzvP7K2SxWs',
             'IOL'),
        ]
        for url, expected_name in cases:
            doc.url = url
            # assertEqual, not the deprecated assertEquals alias
            self.assertEqual(self.crawler.identify_medium(doc).name, expected_name)
    def test_guess_genders(self):
        """guess_genders() should mark a gender-less person as Male when the
        document entity carries the offset list '27:2'."""
        document = Document()
        document.text = 'Fred Astair did something. He also did something else.'

        mention = DocumentEntity()
        mention.document = document
        mention.entity = Entity.query.get(self.fx.EntityData.sue_no_gender.id)
        # presumably "offset:length" -- index 27 is the "He" in the text;
        # confirm against DocumentEntity.offset_list
        mention.offset_list = '27:2'

        self.db.session.add(document)

        self.ex.guess_genders(document)

        guessed = document.entities[0].entity.person.gender
        self.assertEqual('Male', guessed.name)
Esempio n. 7
0
    def test_isolezwe(self):
        """Extract an Isolezwe article and check every populated field.

        ``self.doc_info`` holds the article payload (title, byline, HTML
        paragraphs, images).
        NOTE(review): presumably the crawler's fetch is stubbed elsewhere in
        this test class to return ``self.doc_info`` -- confirm in setUp.
        """
        self.doc_info = {
            "id": 1667768,
            "category": "isolezwe",
            "title": "EzikaMalema zethule abazo eKZN",
            "published": "2014-03-28T09:57",
            "byline": "CELANI SIKHAKHANE noSIMPHIWE NGUBANE",
            "paragraphs": [
                "<strong>CELANI SIKHAKHANE noSIMPHIWE NGUBANE</strong> ",
                "<strong>I</strong>-Economic Freedom Fighters (EFF) KwaZulu-Natal ifolosa ngabesifazane nentsha ohlwini lwayo lwamagama okuqala ayishumi abaholi abazoyimela esiShayamthetho ngemva kokhetho lukazwelonke oluzoba ngoMeyi 7. ",
                "Lokhu kugqame izolo ngenkathi i-EEF yethula abaholi bayo abasohlwini lokhetho esithangamini nabezindaba eThekwini. ",
                "Inxusa leqembu nokubhekwe ukuthi libe ngundunankulu wase-KZN uma i-EFF inqoba, nguMnuz Vusi Khoza. UKhoza uke waba yikhansela le-ANC eThekwini nonobhala weNFP esifundazweni. ",
                "UKhoza uthe uhla lwabo lumele lonke uhlobo lwabantu abakhona  e-KZN ngokobulili, ukuxuba intsha nabadala. ",
                "UNksz Magdalene Moonsamy, obengumholi we-ANC Youth League nojutshwe ubuholi bukazwelonke esifundazweni, uthe iningi labaholi babo abazobamela eSishayamthetho banamakhono adingekayo futhi bayabethemba. ",
                "&#x201C;Abanye baholi bethu banomlando wokusebenzela abantu kusuka kwi-ANC nakwezinye izinhlaka njengoKhoza okunguye osimele njengondunankulu wesifundazwe,&#x201D; kusho uNksz Moonsamy. ",
                "Okufike kwagqama wukuthi ngesikhathi bebiza uKhoza abaholi bale nhlangano bebembiza ngondunankulu, yize kungakayiwa okhethweni. ",
                "Phakathi kwamagama asohlwini lokuqala olubizwa nge-<em>Top 10</em> uKhoza,uNksz Thembi Msane, uMnuz Lwazi Ntombela, uNksz Londiwe Mkhwanazi, uMnuz Nhlanhla Buthelezi, uNksz Sbongile Khawula, uVerusca Fynn, uMnuz Reggie Ngcobo,uNksz Cebisile Shangase, uMnuz Nkosinathi Mthethwa. ",
                "UNksz Moonsamy uthe abaholi babo bazimisele ukufunda okuningi njengoba beya eSishayamthetho. ",
                "UKhoza uthe okubalulekile kubo wukuthi banqobe zonke izihlalo ezingu-80 eziseSishayamthetho. ",
                "Ngaphandle kwamagama abaholi abayishumi abasohlwini , kuphinde kwethulwa nabanye abasohlwini lwesifundazwe abayingxenye yabantu abazoya eSishayamthetho. ",
                "Kulesi sithangami kuphinde kwethulwa uMnuz Vukani Ndlovu ngokusemthethweni obedume ebuholini be-ANC Youth League ngenkathi ehola isigungu sesikhashana sesifundazwe esihlakazwe ngonyaka odlule. ",
                "Abaholi baleli qembu bamile ekutheni bafuna ukusebenzisana nanoma yimuphi umbutho ozovumelana nabo ngemigomo ehambisana nokubuyiswa komhlaba ngaphandle kwesinxephezelo ukuthi uhlomulise bonke abantu bakuleli. ",
                "UNksz Moonsamy uthe bangasebenzisana neFreedom Front Plus inqobo nje uma izimisele ukuvumelana nabo ngodaba lomhlaba, ivume ukuthi kumele ubesezandleni zabantu, hhayi zedlanzana. ",
                "Abaholi baleli qembu abazange bacacise ukuthi banamalungu amangaki eKZN, bakhankasa nini nokuthi yiziphi izindlela abazisebenzisayo ukuxhumana nabantu njengoba bethi sebehlangane nabantu abaningi abampofu, abahola kancane, abasebenza ezindlini, abahlengikazi nabahlala emijondolo. ",
                "Bathi iKZN sebeyiphendule yabomvu njengoba bexhumana nabantu kuzona zonke izingxenye zesifundazwe. ",
                "UKhoza uthe abantu baseKZN basetshenziswa ngamaqembu ezepolitiki njengethuluzi lokuvota ukuze kucebe idlanzana eliphila ntofontofo. ",
                "Uphinde wathi igama lesifundazwe selingcolile  ezweni isidume njengendawo eyisizinda senkohlakalo  ngenxa yomuzi kaMengameli  Jacob Zuma owakhiwe ngoR246 million . ",
            ],
            "images": [
                {
                    "id": "copy-of-no-no-moonsamyy0-1.1667766/3851252182",
                    "credit": "INL SA",
                    "caption": "UNKSZ Magdalene Moonsamy, osuka kwi-ANC Youth League, ujutshwe yisigungu se-EFF kuzwelonke ukuthi eze KwaZulu-Natal",
                    "url": "http://www.iol.co.za/polopoly_fs/copy-of-no-no-moonsamyy0-1.1667766!/image/3851252182.jpg_gen/derivatives/box_300/3851252182.jpg",
                },
                {
                    "id": "copy-of-no-no-khoza0-1.1667767/4259951999",
                    "credit": "INLSA",
                    "caption": "UMNUZ Vusi Khoza, osuka kwi-ANC neNational Freedom Party, i-Economic Freedom Fighters imthwele ngeqoma ukuthi abe ngundunankulu eKZN",
                    "url": "http://www.iol.co.za/polopoly_fs/copy-of-no-no-khoza0-1.1667767!/image/4259951999.jpg_gen/derivatives/box_300/4259951999.jpg",
                },
            ],
        }

        doc = Document()
        doc.url = 'http://www.iol.co.za/isolezwe/ezikamalema-zethule-abazo-ekzn-1.1667768#.UzvUR62SxWv'
        self.crawler.extract(doc, None)

        self.assertEqual(doc.title, 'EzikaMalema zethule abazo eKZN')
        self.assertIsNone(doc.summary)
        self.assertEqual(doc.published_at.strftime('%d %m %Y'), '28 03 2014')
        self.assertEqual(doc.author.name, 'CELANI SIKHAKHANE noSIMPHIWE NGUBANE')
        self.assertEqual(doc.medium.name, 'Isolezwe')

        # Expected plain text: the paragraphs above with markup removed and
        # entities decoded, separated by blank lines. Written as adjacent
        # literals (one per paragraph) so no literal spans a physical line
        # break; the compiler concatenates them into a single string.
        expected_text = (
            u"CELANI SIKHAKHANE noSIMPHIWE NGUBANE\n\n"
            u"I-Economic Freedom Fighters (EFF) KwaZulu-Natal ifolosa ngabesifazane nentsha ohlwini lwayo lwamagama okuqala ayishumi abaholi abazoyimela esiShayamthetho ngemva kokhetho lukazwelonke oluzoba ngoMeyi 7.\n\n"
            u"Lokhu kugqame izolo ngenkathi i-EEF yethula abaholi bayo abasohlwini lokhetho esithangamini nabezindaba eThekwini.\n\n"
            u"Inxusa leqembu nokubhekwe ukuthi libe ngundunankulu wase-KZN uma i-EFF inqoba, nguMnuz Vusi Khoza. UKhoza uke waba yikhansela le-ANC eThekwini nonobhala weNFP esifundazweni.\n\n"
            u"UKhoza uthe uhla lwabo lumele lonke uhlobo lwabantu abakhona  e-KZN ngokobulili, ukuxuba intsha nabadala.\n\n"
            u"UNksz Magdalene Moonsamy, obengumholi we-ANC Youth League nojutshwe ubuholi bukazwelonke esifundazweni, uthe iningi labaholi babo abazobamela eSishayamthetho banamakhono adingekayo futhi bayabethemba.\n\n"
            u"\u201cAbanye baholi bethu banomlando wokusebenzela abantu kusuka kwi-ANC nakwezinye izinhlaka njengoKhoza okunguye osimele njengondunankulu wesifundazwe,\u201d kusho uNksz Moonsamy.\n\n"
            u"Okufike kwagqama wukuthi ngesikhathi bebiza uKhoza abaholi bale nhlangano bebembiza ngondunankulu, yize kungakayiwa okhethweni.\n\n"
            u"Phakathi kwamagama asohlwini lokuqala olubizwa nge-Top 10 uKhoza,uNksz Thembi Msane, uMnuz Lwazi Ntombela, uNksz Londiwe Mkhwanazi, uMnuz Nhlanhla Buthelezi, uNksz Sbongile Khawula, uVerusca Fynn, uMnuz Reggie Ngcobo,uNksz Cebisile Shangase, uMnuz Nkosinathi Mthethwa.\n\n"
            u"UNksz Moonsamy uthe abaholi babo bazimisele ukufunda okuningi njengoba beya eSishayamthetho.\n\n"
            u"UKhoza uthe okubalulekile kubo wukuthi banqobe zonke izihlalo ezingu-80 eziseSishayamthetho.\n\n"
            u"Ngaphandle kwamagama abaholi abayishumi abasohlwini , kuphinde kwethulwa nabanye abasohlwini lwesifundazwe abayingxenye yabantu abazoya eSishayamthetho.\n\n"
            u"Kulesi sithangami kuphinde kwethulwa uMnuz Vukani Ndlovu ngokusemthethweni obedume ebuholini be-ANC Youth League ngenkathi ehola isigungu sesikhashana sesifundazwe esihlakazwe ngonyaka odlule.\n\n"
            u"Abaholi baleli qembu bamile ekutheni bafuna ukusebenzisana nanoma yimuphi umbutho ozovumelana nabo ngemigomo ehambisana nokubuyiswa komhlaba ngaphandle kwesinxephezelo ukuthi uhlomulise bonke abantu bakuleli.\n\n"
            u"UNksz Moonsamy uthe bangasebenzisana neFreedom Front Plus inqobo nje uma izimisele ukuvumelana nabo ngodaba lomhlaba, ivume ukuthi kumele ubesezandleni zabantu, hhayi zedlanzana.\n\n"
            u"Abaholi baleli qembu abazange bacacise ukuthi banamalungu amangaki eKZN, bakhankasa nini nokuthi yiziphi izindlela abazisebenzisayo ukuxhumana nabantu njengoba bethi sebehlangane nabantu abaningi abampofu, abahola kancane, abasebenza ezindlini, abahlengikazi nabahlala emijondolo.\n\n"
            u"Bathi iKZN sebeyiphendule yabomvu njengoba bexhumana nabantu kuzona zonke izingxenye zesifundazwe.\n\n"
            u"UKhoza uthe abantu baseKZN basetshenziswa ngamaqembu ezepolitiki njengethuluzi lokuvota ukuze kucebe idlanzana eliphila ntofontofo.\n\n"
            u"Uphinde wathi igama lesifundazwe selingcolile  ezweni isidume njengendawo eyisizinda senkohlakalo  ngenxa yomuzi kaMengameli  Jacob Zuma owakhiwe ngoR246 million ."
        )
        self.assertEqual(doc.text, expected_text)
Esempio n. 8
0
    def test_extract(self):
        """Extract an IOL motoring article and check every populated field.

        ``self.doc_info`` holds the article payload (title, byline, HTML
        paragraphs, images, related stories, description).
        NOTE(review): presumably the crawler's fetch is stubbed elsewhere in
        this test class to return ``self.doc_info`` -- confirm in setUp.
        """
        self.doc_info = {
            "id": 1335083,
            "category": "motoring/industry-news",
            "title": "Cellphone crackdown - first busts",
            "published": "2012-07-05T12:10",
            "byline": "Murray Williams",
            "paragraphs": [
                "The first cellphone confiscation has taken place in Cape Town, as the new law was enforced for the first time. ",
                "As of today, drivers who are caught talking on their phones while driving, without headsets or hands-free kits, will have their handsets confiscated by traffic officers. ",
                "The first driver caught this morning was Jean-Benoit Biyoko, a taxi driver. Traffic officers nabbed him in Long Street in the CBD. ",
                "The traffic service&apos;s Maxine Jordaan reported: &#x201C;He didn&apos;t have a driving licence on him and he was taken to Gallows Hill traffic department. ",
                "&#x201C;His cellphone, a Nokia E63, had a SIM card and memory card in it, which he kept, and he kept his pouch too. ",
                "<strong>&#x201C;The gentleman was very co-operative and said he was sorry.&#x201D;</strong> ",
                "Jordaan said Biyoko had been fined R500 for talking on his cellphone while driving and would be permitted to collect his phone 24 hours after it was confiscated - on Friday morning. ",
                "The new City law was to be enforced across the city this afternoon, with officers from the undercover &apos;Ghost Squad&apos;, and other officers, deployed on major commuter routes. ",
                "Officers&apos; vehicles will carry special boxes, in which confiscated phones will be placed once they have been logged and sealed in protective pouches. They will then be stored in the traffic department&apos;s safe at Gallows Hill. ",
                "<strong>No fee is required to reclaim a confiscated phone.</strong> ",
                "The bylaw was introduced by safety and security Mayco member JP Smith, who has received praise from across the country for the action against drivers who continue to flout the law. ",
                "Smith said camera and video evidence would be used whenever possible to back up officers&apos; observations. ",
                "&#x201C;We&apos;re hoping that everybody will finally get the message, grab those hands-free kits and start using cellphones legally&#x201D;, Smith said. ",
                "&#x201C;We issue a minimum of 8000 fines a month for illegal cellphone use while driving. But it&apos;s not changing behaviour, so we must find a more powerful disincentive. ",
                "&#x201C;Illegal cellphone use is classified as &apos;distracted driving&apos; and is one of the four ost dangerous driving habits, with speeding, drinking and driving and not wearing seatbelts.&#x201D; - Cape Argus ",
                "<a href=\"http://www.iol.co.za/newsletters\" target=\"_blank\">Motoring newsletter - click here to keep up to speed with the best in motoring</a>",
                "<hr>",
            ],
            "images": [
                {
                    "id": "iol-mot-pic-jul5-cell-phone-impoundment-1-1.1335082/2802609312",
                    "credit": "INLSA",
                    "caption": "Taxi driver Jean-Benoit Biyoko was caught talking on his phone while driving in Long Street. Picture: Henk Kruger",
                    "url": "http://www.iol.co.za/polopoly_fs/iol-mot-pic-jul5-cell-phone-impoundment-1-1.1335082!/image/2802609312.jpg_gen/derivatives/box_300/2802609312.jpg",
                },
            ],
            "relatedStories": [
                {
                    "link": "http://www.iol.co.za/motoring/industry-news/cellphone-use-it-you-ll-lose-it-1.1333846",
                    "title": "Cellphone: Use it, you'll lose it!",
                },
                {
                    "link": "http://www.iol.co.za/news/south-africa/kwazulu-natal/kzn-drivers-could-also-lose-phones-1.1333884",
                    "title": "KZN drivers could also lose phones",
                },
            ],
            "description": "Cape Town cops have begun confiscating cellphones from drivers not using hands-free kits.",
        }

        doc = Document()
        doc.url = 'http://www.iol.co.za/motoring/industry-news/cellphone-crackdown-first-busts-1.1335083#.UzvU562SxWs'

        self.crawler.extract(doc, None)

        self.assertEqual(doc.title, 'Cellphone crackdown - first busts')
        self.assertEqual(doc.summary, 'Cape Town cops have begun confiscating cellphones from drivers not using hands-free kits.')
        self.assertEqual(doc.published_at.strftime('%d %m %Y'), '05 07 2012')
        self.assertEqual(doc.author.name, 'Murray Williams')
        self.assertEqual(doc.medium.name, 'IOL')

        # Expected plain text: the paragraphs above with markup removed and
        # entities decoded, separated by blank lines. Note that the expected
        # value ends with a trailing "\n\n". Written as adjacent literals (one
        # per paragraph) so no literal spans a physical line break.
        expected_text = (
            u"The first cellphone confiscation has taken place in Cape Town, as the new law was enforced for the first time.\n\n"
            u"As of today, drivers who are caught talking on their phones while driving, without headsets or hands-free kits, will have their handsets confiscated by traffic officers.\n\n"
            u"The first driver caught this morning was Jean-Benoit Biyoko, a taxi driver. Traffic officers nabbed him in Long Street in the CBD.\n\n"
            u"The traffic service's Maxine Jordaan reported: \u201cHe didn't have a driving licence on him and he was taken to Gallows Hill traffic department.\n\n"
            u"\u201cHis cellphone, a Nokia E63, had a SIM card and memory card in it, which he kept, and he kept his pouch too.\n\n"
            u"\u201cThe gentleman was very co-operative and said he was sorry.\u201d\n\n"
            u"Jordaan said Biyoko had been fined R500 for talking on his cellphone while driving and would be permitted to collect his phone 24 hours after it was confiscated - on Friday morning.\n\n"
            u"The new City law was to be enforced across the city this afternoon, with officers from the undercover 'Ghost Squad', and other officers, deployed on major commuter routes.\n\n"
            u"Officers' vehicles will carry special boxes, in which confiscated phones will be placed once they have been logged and sealed in protective pouches. They will then be stored in the traffic department's safe at Gallows Hill.\n\n"
            u"No fee is required to reclaim a confiscated phone.\n\n"
            u"The bylaw was introduced by safety and security Mayco member JP Smith, who has received praise from across the country for the action against drivers who continue to flout the law.\n\n"
            u"Smith said camera and video evidence would be used whenever possible to back up officers' observations.\n\n"
            u"\u201cWe're hoping that everybody will finally get the message, grab those hands-free kits and start using cellphones legally\u201d, Smith said.\n\n"
            u"\u201cWe issue a minimum of 8000 fines a month for illegal cellphone use while driving. But it's not changing behaviour, so we must find a more powerful disincentive.\n\n"
            u"\u201cIllegal cellphone use is classified as 'distracted driving' and is one of the four ost dangerous driving habits, with speeding, drinking and driving and not wearing seatbelts.\u201d - Cape Argus\n\n"
            u"Motoring newsletter - click here to keep up to speed with the best in motoring\n\n"
        )
        self.assertEqual(doc.text, expected_text)
Esempio n. 9
0
    def test_add_utterance(self):
        """add_utterance() accepts a new utterance once and rejects a second
        attempt to add the very same utterance."""
        document = Document()
        document.text = 'And Fred said "Hello" to everyone.'

        speaker = Entity()
        speaker.group = 'person'
        speaker.name = 'Fred'

        utterance = Utterance()
        utterance.entity = speaker
        utterance.quote = 'Hello'

        # first add succeeds and the utterance shows up on the document
        self.assertTrue(document.add_utterance(utterance))
        self.assertIn(utterance, document.utterances)

        # second add of the same utterance is refused; still only one stored
        self.assertFalse(document.add_utterance(utterance))
        self.assertEqual(1, len(document.utterances))
Esempio n. 10
0
    def test_match_people(self):
        """discover_people() links a near-miss spelling to the existing
        'Jacob Zuma' person but leaves a too-different name unlinked."""
        document = Document()
        document.text = 'Fred Astair did something. He also did something else.'

        close_match = Utterance()
        close_match.entity = Entity()
        close_match.entity.group = 'people'
        close_match.entity.name = 'Jacob Zume'  # expected to resolve to Zuma
        close_match.document = document

        far_match = Utterance()
        far_match.entity = Entity()
        far_match.entity.group = 'people'
        far_match.entity.name = 'Jacob Zooma'  # too different to resolve
        far_match.document = document

        self.ex.discover_people(document)

        self.assertEqual('Jacob Zuma', close_match.entity.person.name)
        self.assertIsNone(far_match.entity.person)
Esempio n. 11
0
    def setUp(self):
        """Rebuild and seed the database, then prepare an AlchemyExtractor
        (with a fake API key) and a small document for the tests."""
        # start every test from a freshly created, seeded schema
        self.db = db
        self.db.drop_all()
        self.db.create_all()
        seed_db(db)

        # the key is set on the class before the extractor is instantiated
        AlchemyExtractor.API_KEY = 'fake'
        self.ex = AlchemyExtractor()

        document = Document()
        document.text = 'foo'
        self.doc = document
Esempio n. 12
0
    def make_docs(self):
        """Create three Documents with urls 'foo-0' .. 'foo-2', add them to
        the session and flush.

        Each document gets the current time as ``published_at`` and the first
        Medium, Author and Country returned by their queries.

        Returns the list of documents.
        """
        # range() instead of xrange(): same result here, and xrange does not
        # exist on Python 3
        docs = [Document(url='foo-%s' % i) for i in range(3)]
        for d in docs:
            d.published_at = datetime.datetime.now()
            d.medium = Medium.query.first()
            d.author = Author.query.first()
            d.country = Country.query.first()

        # flush (without committing) so the documents get their ids
        db.session.add_all(docs)
        db.session.flush()
        return docs
Esempio n. 13
0
    def test_add_utterance_similar(self):
        """add_utterance() rejects a second utterance by the same speaker
        whose quote is only a punctuation variant of one already stored."""
        document = Document()
        document.text = 'And Fred said "Hello there guys," to everyone.'

        original = Utterance()
        original.entity = Entity()
        original.entity.group = 'person'
        original.entity.name = 'Fred'
        original.quote = 'Hello there guys'

        self.assertTrue(document.add_utterance(original))
        self.assertIn(original, document.utterances)

        # same words wrapped in quote marks plus trailing punctuation:
        # similar quotations can't be added twice
        variant = Utterance()
        variant.entity = Entity()
        variant.entity.group = 'person'
        variant.entity.name = 'Fred'
        variant.quote = '\"Hello there guys,\" ...'

        self.assertFalse(document.add_utterance(variant))
        self.assertEqual(1, len(document.utterances))
Esempio n. 14
0
    def test_delete_document(self):
        """A committed document with an utterance and a document entity
        attached can be deleted and the deletion committed without error."""
        document = Document()
        document.text = 'And Fred said "Hello" to everyone.'
        document.published_at = datetime.datetime.utcnow()

        utterance = Utterance()
        utterance.entity = Entity()
        utterance.entity.group = 'person'
        utterance.entity.name = 'Fred'
        utterance.quote = 'Hello'
        self.assertTrue(document.add_utterance(utterance))

        doc_entity = DocumentEntity()
        doc_entity.document = document
        doc_entity.entity = Entity.query.first()
        doc_entity.relevance = 0.5

        session = self.db.session

        # persist the document ...
        session.add(document)
        session.commit()

        # ... then remove it again; the test passes if nothing raises
        session.delete(document)
        session.commit()
Esempio n. 15
0
    def test_add_utterance_update_offset(self):
        """Re-adding an equivalent utterance that carries an offset and length
        copies them onto the stored utterance (and reports True); adding it
        once more changes nothing and reports False."""
        document = Document()
        document.text = 'And Fred said "Hello" to everyone.'

        stored = Utterance()
        stored.entity = Entity()
        stored.entity.group = 'person'
        stored.entity.name = 'Fred'
        stored.quote = 'Hello'
        self.assertTrue(document.add_utterance(stored))

        # same speaker and quote, but this one knows where the quote sits
        positioned = Utterance()
        positioned.entity = Entity()
        positioned.entity.group = 'person'
        positioned.entity.name = 'Fred'
        positioned.quote = 'Hello'
        positioned.offset = 10
        positioned.length = 5

        self.assertTrue(document.add_utterance(positioned))
        self.assertEqual(10, stored.offset)
        self.assertEqual(5, stored.length)

        # nothing left to update the second time round
        self.assertFalse(document.add_utterance(positioned))
Esempio n. 16
0
    def test_word_count(self):
        """word_count counts whitespace-separated words, is 0 for empty text
        and None when the document has no text at all."""
        document = Document()

        # (text, expected word count)
        cases = (
            ("    test \n  \t one", 2),
            ("    test \n  \t", 1),
            ("", 0),
        )
        for text, expected in cases:
            document.text = text
            self.assertEqual(document.word_count, expected)

        document.text = None
        self.assertIsNone(document.word_count)
Esempio n. 17
0
    def test_extract(self):
        html = """

<!DOCTYPE html PUBLIC "-//WAPFORUM//DTD XHTML Mobile 1.0//EN" "http://www.wapforum.org/DTD/xhtml-mobile10.dtd"><html>
	<head>
		<meta http-equiv="expires" content="-1">
		<meta http-equiv="Cache-Control" content="no-cache">
		<meta http-equiv="Pragma" content="no-cache">
		<meta http-equiv="Content-type" content="text/html; charset=UTF-8" />
		<title>Daily Sun Mobi</title>
		<link rel="stylesheet" type="text/css" href="/mobile/cached/640x480/css/reset.css" />		<link rel="stylesheet" type="text/css" href="/mobile/cached/640x480/css/mobile/default.css" />				
	<link rel="stylesheet" type="text/css" href="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/css/70/default.css" />
	<link rel="stylesheet" type="text/css" href="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/css/70/touch.css" />
		
					<style type="text/css">
				body { width: 640px; margin: 0 auto; }
			</style>
		
		
                    <script type="text/javascript"><!--
                var xtkey=false;
                if(document.addEventListener){document.addEventListener('keydown',function(){xtkey=true},false);document.addEventListener('keyup',function(){xtkey=false},false);}
                else if(document.attachEvent){document.attachEvent('onkeydown',function(){xtkey=true});document.attachEvent('onkeyup',function(){xtkey=false});}
                function xt_click(obj,type,section,page,x1,x2,x3,x4,x5){var xtImg=new Image(),xtDate=new Date(),xtScr=window.screen,xtNav=window.navigator,xtObj=null;var xtSrc='http://logw310.ati-host.net/hit.xiti?s=xxxxxx&s2='+section+'&p='+page+((type=='F')?'':(type=='M')?'&a='+x1+'&m1='+x2+'&m2='+x3+'&m3='+x4+'&m4='+x5:'&clic='+x1)+'&hl='+xtDate.getHours()+'x'+xtDate.getMinutes()+'x'+xtDate.getSeconds();if(parseFloat(xtNav.appVersion)>=4){xtSrc+='&r='+xtScr.width+'x'+xtScr.height+'x'+xtScr.pixelDepth+'x'+xtScr.colorDepth;}xtImg.src=xtSrc;xtImg.onload=function(){xtImg.onload=null;};if(obj.nodeName!='A'){var xelp=obj.parentNode;while(xelp){if(xelp.nodeName=='A'){xtObj=xelp;break;}xelp=xelp.parentNode;}}
                    else{xtObj=obj;}if(xtObj){xtObj.target=xtObj.target||'_self';if(x2&&(type=='C')){xtObj.href=x2;if(x3){xtObj.target='_blank';}else{xtObj.target='_self';}}if(!xtkey){if(xtObj.target.toLowerCase()=='_self'){setTimeout('self.location.href="'+xtObj.href+'"',500);return false;}else
                    if(xtObj.target.toLowerCase()=='_top'){setTimeout('top.location.href="'+xtObj.href+'"',500);return false;}else
                    if(xtObj.target.toLowerCase()=='_parent'){setTimeout('parent.location.href="'+xtObj.href+'"',500);return false;}}}else
                    if(x2&&(type=='C')){if(x3){setTimeout('(window.open("'+x2+'","_blank")).focus();',500);}else{setTimeout('self.location.href="'+x2+'"',500);}}xtkey=false;return true;}
                //-->
            </script>
            	</head>
<body>
	
	<script type="text/javascript" src="/js/mobile/gears_init.js"></script>
	<script type="text/javascript" src="/js/mobile/xui.js"></script>
	<script type="text/javascript" src="/js/mobile/plugins.js"></script>
	<script type="text/javascript" src="/js/mobile/custom.js"></script>
	<script type="text/javascript" src="http://code.jquery.com/jquery-2.0.3.min.js"></script>
	<script type="text/javascript" src="http://code.jquery.com/jquery-migrate-1.2.1.min.js"></script>
	<div id="page-wrapper">
		<script type="text/javascript">
  			(function() {
 			   var em = document.createElement('script'); em.type = 'text/javascript'; em.async = true;
   			   em.src = ('https:' == document.location.protocol ? 'https://za-ssl' : 'http://za-cdn') + '.effectivemeasure.net/em.js';
   			   var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(em, s);
			  })();
		</script>
		<noscript>
			<img src="//za.effectivemeasure.net/em_image" alt="" style="position:absolute; left:-5px;" />
		</noscript>
		<img src="http://za.effectivemeasure.net/em_image" alt="" width="1" height="1" />
		<div id="page-header">
							<a href="/"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/uploads/70/headerImage.wide_469x180.jpg" alt="Daily Sun Mobi" style="height:auto;width: 100%;" width="469" height="180" /></a>
				&nbsp; &nbsp; <a href="/news" class="back"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_landbou/img/back_64x64.gif" alt="Go back a page" width="64" height="64" /></a>
					</div>

		<div class="advertisement"><div align="center" style="font-size:small;font-style:normal;font-weight:normal;margin-top:3px;margin-bottom:3px"><a href="http://googleads.g.doubleclick.net/aclk?sa=L&amp;ai=CSxz4D8Y6U_LaEIP38gOYtIGoAseBpvEEAAAQASC3pasdUNaQ9eEGYMfVhI6cJsgBA6kC8nRgMfhSeT7gAgCoAwGqBKoBT9D3fVmaV1p2t5JvS6oqhJ12LqQ8rS6wXeuPuzEI5FuZFPwvI7VtvzhMxgitSAMcCJxDaLBGiIOhYkGU2JUjJCtud4-lHDwsMpUExmxYcQoweIRTi_TIDrN3YQRlbO3yQXnvVZLnaFB8fLhCCoPIR3-OD6n7GMXHVO6thS_NHAuXl9tb0llN51QySyIqYF49uNncouHB0zNd1RahIyC7igrWHbFsrr9ua5XgBAGgBhQ&amp;num=0&amp;sig=AOD64_3g2QH96T0i7tuFVMk6gSzA-vkXkg&amp;client=ca-pub-6255253699155664&amp;adurl=http://cuddlykoala.telesure.yomohost.co.za/campaign.php%3Fid%3D3997" target="_blank"><img src="http://pagead2.googlesyndication.com/simgad/3508678514699156351" alt="" border="0" height="50" width="300"></a><br/></div></div>
		
		<div id="content-wrapper">
			
                                    <p class="align-center">You are not currently logged in, <a href="/login">Login</a> or <a href="/register">Register</a></p>
                                
            
<h1 class="iconized"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/navigation/news_63x61.gif" alt="" width="63" height="61" />News articles</h1><div class="filter"><form action="/news" id="readForm" method="post" accept-charset="utf-8"><div style="display:none;"><input type="hidden" name="_method" value="POST" /></div><table>    <td class="submit"> </td></tr><tr>	<td class="field"><select name="data[category]" id="category">
<option value="">All categories</option>
<option value="2">News</option>
<option value="1440">Entertainment</option>
</select></td>	<td class="submit"><input type="submit" value="GO" /></td>    </tr></table></form></div><div class="article-fullview item-fullview">
	<h2 class="sub-heading">Cop fiddles and wants free sex shows, say magoshas</h2>
		<p class="publish-date"><strong>Published:</strong> Tuesday, 2014/04/01</p>

								<div class="image"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_mobile/remote/www.hermanustimes.co.za/uploads/ArticlePhotos/135330/622b79a3-a1df-48ec-910e-b81b40237bac_the20Magoshas4_330x240_B.jpg" alt="" style="max-width:95%;height:auto;" width="330" height="240" /></div>
			
	<p><strong>THE magoshas say he is a <br />cruel cop who seems to enjoy making their tough lives even tougher.</strong></p>
<p><em>He doesn’t bonk them but he pushes his fingers inside them, he takes their money and he beats them.</em></p>
<p>HE EVEN FORCES THEM TO OPEN THEIR LEGS SO HE CAN TAKE PICTURES OF THEIR PRIVATE PARTS.</p>
<p>The magoshas who work in Tembisa, Ekurhuleni agree that the cop, a section manager at Olifantsfontein cop shop in Midrand, does things to them because he likes to humiliate and hurt them.</p>
<div id="U300447188694JkC" class="text">
<div class="p">An angry magosha told <em class="i">Daily Sun </em>it is not an easy job.</div>
<div class="p">“We don’t lie on a comfortable bed with nice sheets. We do the business in the bushes and we get thorns in our backsides,” she said.</div>
<div class="p">“We want this cop to leave us alone. We want him to be arrested for this.”</div>
<div class="p">A 36-year-old magosha told <em class="i">Daily Sun </em>this same cop was on TV before for dragging a magosha on the ground and beating her. A case was opened but the women got no help from the cops.</div>
<div class="p">“When we go to the station they tell us to go away because we are stinking or they arrest us,” she said.</div>
<div class="p">The women aged between 23 and 58 come from different parts of the country and charge between R15 and R50 a time.</div>
<div class="p">But the perverted cop shows no mercy. He takes the little bit of money they earn for himself and if he is in a bad mood, the women get beaten and spend a night in jail.</div>
<div class="p">A gogo magosha (58) who is supposed to be on pension said they do this work because they need the money. “We don’t do this for fun. We have kids to feed,” she said.</div>
<div class="p">“Its very embarrassing for someone my age to be opening my legs for a pervert to have a look at my private parts.”</div>
<div class="p">The magoshas complain that the cop makes them pay “tax” because they don’t pay it to the government.</div>
<div class="p">On Friday some of the women were arrested and some allegedly beaten by the cop and his friends.</div>
<div class="p">“If you are unlucky enough to get caught, your clothes and your customer’s clothes get burnt and the customer has to pay a R1 500 fine,” said one of the women.</div>
<div class="p">The women said the cop wants to own them so they can work for him and pay him.</div>
<div class="p">“This man is making our lives impossible. We don’t work for him,” said a magosha.</div>
<div class="p"><span id="U300445860050UqF" class="span">)</span> Olifantsfontein Police Station’s Colonel John Mosowe said no law enforcement members are allowed to turn away complainants.</div>
<div class="p">“I have advised the ladies to open a case. This case and the one opened last year will be formally investigated,” he said.</div>
</div>
	<p class="meta">
		<strong>By:</strong> SIBONGILE MABENA	</p>
	
			<div class="actions" style="text-align: center;">
			<a href="/news/comments/5227/cop-fiddles-and-wants-free-sex-shows-say-magoshas"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/view-comments_288x46.gif" alt="View comments" style="max-width:49%;height:auto;" width="288" height="46" /></a>		
		<a href="/news/comments/add/5227/cop-fiddles-and-wants-free-sex-shows-say-magoshas"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/add-comment_288x46.gif" alt="Add comment" style="max-width:49%;height:auto;" width="288" height="46" /></a>	</div>	
	<div class="social-sharing">
<a href="/send-to-friend?u=http%3A%2F%2Fdailysun.mobi%2Fnews%2Fread%2F5227%2Fcop-fiddles-and-wants-free-sex-shows-say-magoshas&amp;n=Cop+fiddles+and+wants+free+sex+shows%2C+say+magoshas&amp;t=news+article&amp;r=%2Fnews%2Fread%2F5227%2Fcop-fiddles-and-wants-free-sex-shows-say-magoshas"><img src="/mobile/cached/640x480/img/social/email_82x34.gif" alt="Email to a friend" style="max-width:35%;height:auto;" onclick="return xt_click(this,&#039;C&#039;,&#039;2&#039;,&#039;E-Mail&#039;,&#039;S&#039;)" width="82" height="34" /></a>&nbsp;<a href="http://m.facebook.com/sharer.php?u=http%3A%2F%2Fdailysun.mobi%2Fnews%2Fread%2F5227%2Fcop-fiddles-and-wants-free-sex-shows-say-magoshas&amp;t=Cop+fiddles+and+wants+free+sex+shows%2C+say+magoshas"><img src="/mobile/cached/640x480/img/social/facebook_35x34.gif" alt="Share on Facebook" style="max-width:15%;height:auto;" onclick="return xt_click(this,&#039;C&#039;,&#039;2&#039;,&#039;Facebook&#039;,&#039;S&#039;)" width="35" height="34" /></a>&nbsp;<a href="http://mobile.twitter.com/home?status=Cop+fiddles+and+wants+free+sex+shows%2C+say+magoshas http%3A%2F%2Fdailysun.mobi%2Fnews%2Fread%2F5227%2Fcop-fiddles-and-wants-free-sex-shows-say-magoshas"><img src="/mobile/cached/640x480/img/social/twitter_35x34.gif" alt="Tweet this!" style="max-width:15%;height:auto;" onclick="return xt_click(this,&#039;C&#039;,&#039;2&#039;,&#039;Twitter&#039;,&#039;S&#039;)" width="35" height="34" /></a>&nbsp;<a href="http://m.delicious.com/save?v=5&amp;noui&amp;title=Cop+fiddles+and+wants+free+sex+shows%2C+say+magoshas&amp;url=http%3A%2F%2Fdailysun.mobi%2Fnews%2Fread%2F5227%2Fcop-fiddles-and-wants-free-sex-shows-say-magoshas"><img src="/mobile/cached/640x480/img/social/delicious_35x34.gif" alt="Share on Delicious" style="max-width:15%;height:auto;" onclick="return xt_click(this,&#039;C&#039;,&#039;2&#039;,&#039;Delicious&#039;,&#039;S&#039;)" width="35" height="34" /></a></div>

    
            <p class="align-right"><a href="/news/read/5225/anc-has-confidence-in-the-president">Read next article</a></p>
    
</div>

	<div class="main-navigation">
					<table>
									<tr>
						<td class="image"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/navigation/home_63x61.gif" alt="" width="63" height="61" /></td>
						<td><a href="/">Home</a></td>
													<td class="image more"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/navigation/more_64x64.gif" alt="" width="64" height="64" /></td>
											</tr>
									<tr>
						<td class="image"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/navigation/news_63x61.gif" alt="" width="63" height="61" /></td>
						<td><a href="/news">News</a></td>
													<td class="image more"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/navigation/more_64x64.gif" alt="" width="64" height="64" /></td>
											</tr>
									<tr>
						<td class="image"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_mobile/remote/netlocal.mobi/uploads/70/custom_pages/custom_page-12-1368010836-usgje0bs03_63x62.png" alt="" width="63" height="62" /></td>
						<td><a href="/custom/custom_pages/index/12">Sport</a></td>
													<td class="image more"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/navigation/more_64x64.gif" alt="" width="64" height="64" /></td>
											</tr>
									<tr>
						<td class="image"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_uploads/navigation/tournaments/logo_64x64.png" alt="" width="64" height="64" /></td>
						<td><a href="/tournaments">PSL &amp; Cups</a></td>
													<td class="image more"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/navigation/more_64x64.gif" alt="" width="64" height="64" /></td>
											</tr>
									<tr>
						<td class="image"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/navigation/weather_63x61.gif" alt="" width="63" height="61" /></td>
						<td><a href="/weather">Weather</a></td>
													<td class="image more"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/navigation/more_64x64.gif" alt="" width="64" height="64" /></td>
											</tr>
									<tr>
						<td class="image"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/navigation/multimedia_63x61.gif" alt="" width="63" height="61" /></td>
						<td><a href="/galleries-and-videos">Galleries &amp; videos</a></td>
													<td class="image more"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/navigation/more_64x64.gif" alt="" width="64" height="64" /></td>
											</tr>
									<tr>
						<td class="image"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_mobile/remote/netlocal.mobi/uploads/70/custom_pages/custom_page-8-1370169748-05evw98ao3_64x42.png" alt="" width="64" height="42" /></td>
						<td><a href="/custom/custom_pages/index/8">SunPower</a></td>
													<td class="image more"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/navigation/more_64x64.gif" alt="" width="64" height="64" /></td>
											</tr>
									<tr>
						<td class="image"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_mobile/remote/netlocal.mobi/uploads/70/custom_pages/custom_page-41-1370282431-pp2cfr7sa_64x64.jpg" alt="" width="64" height="64" /></td>
						<td><a href="/custom/custom_pages/index/41">SunStuff</a></td>
													<td class="image more"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/navigation/more_64x64.gif" alt="" width="64" height="64" /></td>
											</tr>
									<tr>
						<td class="image"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_mobile/remote/netlocal.mobi/uploads/70/custom_pages/custom_page-28-1370168552-m5ww6ix6sa_64x64.jpg" alt="" width="64" height="64" /></td>
						<td><a href="/custom/custom_pages/index/28">Horoscopes</a></td>
													<td class="image more"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/navigation/more_64x64.gif" alt="" width="64" height="64" /></td>
											</tr>
									<tr>
						<td class="image"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_mobile/remote/netlocal.mobi/uploads/70/custom_pages/custom_page-25-1369745616-4u3kbmdybi_63x62.png" alt="" width="63" height="62" /></td>
						<td><a href="/custom/custom_pages/index/25">Contact Daily Sun</a></td>
													<td class="image more"><img src="/mobile/cached/640x480/mnt/webshare/app_webroot_theme/site_dailysun/img/navigation/more_64x64.gif" alt="" width="64" height="64" /></td>
											</tr>
							</table>
			</div>


<div class="secondary-navigation">
 <table>
		<tr class="netlocal-divider">
			<td class="image" style="text-align:left">
						<a href="http://www.facebook.com/sadailysun"><img src="/mobile/cached/640x480/img/social/facebook_35x34.gif" alt="Visit our Facebook Page" onclick="return xt_click(this,&#039;C&#039;,&#039;2&#039;,&#039;Facebook&#039;,&#039;S&#039;)" width="35" height="34" /></a>			<a href="http://www.twitter.com/dailysunsa"><img src="/mobile/cached/640x480/img/social/twitter_35x34.gif" alt="Tweet this!" onclick="return xt_click(this,&#039;C&#039;,&#039;2&#039;,&#039;Twitter&#039;,&#039;S&#039;)" width="35" height="34" /></a>						</td>
		</tr>
</table>
</div>		</div>

		
		<div id="page-footer">
			<div class="footer-navigation">

    <a href="/">Home</a> | <a href="/news">News</a> | <a href="/weather">Weather</a> | <a href="/galleries-and-videos">Galleries &amp; videos</a> | <a href="/contact-us">Contact us</a> | <a href="/terms">Terms</a> | <a href="/register">Register</a> | <a href="/login">Login</a> | <a href="http://netlocal.mobi/">Netlocal</a></div>			<p class="copyright">
								Copyright &copy; 2014 <em>Daily Sun Mobi</em>.			</p>
		</div>

		
					<div><img src="/img/ga.php?utmac=MO-40664933-1&amp;utmn=1037940373&amp;guid=ON&amp;utmr=-&amp;utmp=%2Fnews%2Fread%2F5227%2Fcop-fiddles-and-wants-free-sex-shows-say-magoshas" alt="" width="1" height="1" /></div>
			</div>

            <div id="AT-Tag">
            <script type="text/javascript">
                <!--
                xtpage = "";
                Xt_param = 's=538012&di=0&idclient=&p='+xtpage;
                xtn2 = "2";
                xt_multc = "";
                //all the xi indicators (like "&x1=...&x2=....&x3=...")
                xt_an = ""; //user ID
                xt_ac = ""; //category ID
                //do not modify below
                if(window.xtparam!=null){window.xtparam+="&s2="+xtn2+"&ac="+xt_ac+"&an="+xt_an+xt_multc;}else{window.xtparam="&s2="+xtn2+"&ac="+xt_ac+"&an="+xt_an+xt_multc;};
                if (window.xtparam!=null){Xt_param+=xtparam;}
                try {Xt_r = top.document.referrer;}
                catch(e) {Xt_r = document.referrer; }
                Xt_h = new Date();
                Xt_i = '<img width="1" height="1" src="http://logw310.ati-host.net/hit.xiti?'+Xt_param;
                Xt_i += '&hl='+Xt_h.getHours()+'x'+Xt_h.getMinutes()+'x'+Xt_h.getSeconds();
                if(parseFloat(navigator.appVersion)>=4)
                {Xt_s=screen;Xt_i+='&r='+Xt_s.width+'x'+Xt_s.height+'x'+Xt_s.pixelDepth+'x'+Xt_s.colorDepth;}
                document.write(Xt_i+'&ref='+Xt_r.replace(/[<>"]/g, '').replace(/&/g, '$')+'" >');
                //-->
            </script>
        </div>
        <noscript>
            <div id="xiti-logo-noscript">
                <img width="1" height="1" src="http://logw310.ati-host.net/hit.xiti?s=538012&amp;amp;s2=2&amp;amp;p=&amp;amp;idclient=&amp;amp;di=0&amp;amp;ac=&amp;amp;an=&amp;amp;" alt="WebAnalytics - AT Internet" />
            </div>
        </noscript>
    
    <!-- 152.111.196.43 -->

	</body>
</html>

"""
        
        doc = Document()
        doc.url = 'http://dailysun.mobi/news/read/5227/cop-fiddles-and-wants-free-sex-shows-say-magoshas'
        self.crawler.extract(doc, html)

        self.maxDiff = None

        self.assertEqual(doc.title, 'Cop fiddles and wants free sex shows, say magoshas')
        self.assertEqual(doc.summary, None)
        self.assertEqual(doc.published_at.strftime('%d %m %Y'), '01 04 2014')
        self.assertEqual(doc.author.name, 'SIBONGILE MABENA')
        self.assertEqual(doc.medium.name, 'Daily Sun')

        self.assertEqual(doc.text, u'THE magoshas say he is a cruel cop who seems to enjoy making their tough lives even tougher.\n\nHe doesn\u2019t bonk them but he pushes his fingers inside them, he takes their money and he beats them.\n\nHE EVEN FORCES THEM TO OPEN THEIR LEGS SO HE CAN TAKE PICTURES OF THEIR PRIVATE PARTS.\n\nThe magoshas who work in Tembisa, Ekurhuleni agree that the cop, a section manager at Olifantsfontein cop shop in Midrand, does things to them because he likes to humiliate and hurt them.')
    def test_extract(self):
        html = """

<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
    <head>
        <title>TimesLIVE - Print Article</title>
        <link rel="stylesheet" href="http://www.timeslive.co.za/template/common/css/print.css" type="text/css" media="print" />
        <link type="text/css" rel="stylesheet" href="http://www.timeslive.co.za/template/common/css/uniform.default.css" />
        <script type="text/javascript">
            function printpage()
            {
                window.print();
            }
        </script>
    </head>
    <body onload="printpage()">
        <a href="#" onclick="javascript:print();">
            Print this page
        </a><br />
        <div class="container">
            <div class="clear"></div>
            <div id="content">
                <div class="span-24 column">
                    <div class="articleheader">
                        <h1>IEC's Tlakula must resign: opposition parties</h1>
                        <div> Apr 1, 2014 | Sapa</div>
                        <h3>Several opposition parties met in Pretoria on Tuesday to discuss Public Protector Thuli Madonsela's finding on the Nkandla upgrades and the controversy around IEC chairwoman Pansy Tlakula.</h3>
                    </div>

                    <div class="articlerelated">
                        <div class="image">
        <img width="290px" title="" alt="" src="http://www.timeslive.co.za/migration_catalog/ST/2009/09/10/26869_499542.jpg/RESIZED/Small/26869_499542.jpg">
         IEC chairwoman Pansy Tlakula</div>
    <br/>
<div class="quote">
                                <h3>
                                    <span>"</span>
                                    <p> </p><span class="end">"</span>
                                </h3>
                                <div class="clear"></div>
                            </div>
                        </div>
                    <p>Chairman of the multi-party forum Bantu Holomisa, who also heads the United Democratic Movement (UDM), said the opposition parties resolved to push for Tlakula's resignation.</p><p>"All the political parties present, with the exception of the Democratic Alliance and the Freedom Front Plus, agreed that advocate Tlakula must resign immediately.</p><p>"Should she refuse to resign, the parties who are in agreement will pursue legal action," said Holomisa.</p><p>The forum included the African Christian Democratic Party, AgangSA, Azapo, Economic Freedom Fighters, FF Plus, Inkatha Freedom Party, United Christian Democratic Party, and Holomisa's UDM.</p><p>Regarding Nkandla, the parties resolved to convene another summit after President Jacob Zuma had reacted to Parliament as ordered by Madonsela.</p><br/>
                    <br/>
                    <center>
                        ~ o O o ~
                    </center>
                </div>
            </div>
        </div>
    </body>
</html>
"""
        
        doc = Document()
        doc.url = 'http://www.timeslive.co.za/politics/2014/04/01/iec-s-tlakula-must-resign-opposition-parties'
        self.crawler.extract(doc, html)

        self.assertEqual(doc.title, u"IEC's Tlakula must resign: opposition parties")
        self.assertEqual(doc.summary, u"Several opposition parties met in Pretoria on Tuesday to discuss Public Protector Thuli Madonsela's finding on the Nkandla upgrades and the controversy around IEC chairwoman Pansy Tlakula.")
        self.assertEqual(doc.published_at.strftime('%d %m %Y'), '01 04 2014')
        self.assertEqual(doc.author.name, "Sapa")
        self.assertEqual(doc.medium.name, 'Times')
        
        self.assertEqual(doc.text, u'Chairman of the multi-party forum Bantu Holomisa, who also heads the United Democratic Movement (UDM), said the opposition parties resolved to push for Tlakula\'s resignation.\n\n"All the political parties present, with the exception of the Democratic Alliance and the Freedom Front Plus, agreed that advocate Tlakula must resign immediately.\n\n"Should she refuse to resign, the parties who are in agreement will pursue legal action," said Holomisa.\n\nThe forum included the African Christian Democratic Party, AgangSA, Azapo, Economic Freedom Fighters, FF Plus, Inkatha Freedom Party, United Christian Democratic Party, and Holomisa\'s UDM.\n\nRegarding Nkandla, the parties resolved to convene another summit after President Jacob Zuma had reacted to Parliament as ordered by Madonsela.')
Esempio n. 19
0
    def test_extract(self):
        # Check that the crawler under test turns an IOL article description
        # (title, byline, HTML paragraphs, ...) into the expected document
        # fields.
        #
        # NOTE(review): extract() is called with None instead of raw HTML, so
        # the crawler presumably obtains self.doc_info through a stub set up
        # outside this method -- confirm against the test case's setUp.
        self.doc_info = {
            "id": 1335083,
            "category": "motoring/industry-news",
            "title": "Cellphone crackdown - first busts",
            "published": "2012-07-05T12:10",
            "byline": "Murray Williams",
            "paragraphs": [
                "The first cellphone confiscation has taken place in Cape Town, as the new law was enforced for the first time. ",
                "As of today, drivers who are caught talking on their phones while driving, without headsets or hands-free kits, will have their handsets confiscated by traffic officers. ",
                "The first driver caught this morning was Jean-Benoit Biyoko, a taxi driver. Traffic officers nabbed him in Long Street in the CBD. ",
                "The traffic service&apos;s Maxine Jordaan reported: &#x201C;He didn&apos;t have a driving licence on him and he was taken to Gallows Hill traffic department. ",
                "&#x201C;His cellphone, a Nokia E63, had a SIM card and memory card in it, which he kept, and he kept his pouch too. ",
                "<strong>&#x201C;The gentleman was very co-operative and said he was sorry.&#x201D;</strong> ",
                "Jordaan said Biyoko had been fined R500 for talking on his cellphone while driving and would be permitted to collect his phone 24 hours after it was confiscated - on Friday morning. ",
                "The new City law was to be enforced across the city this afternoon, with officers from the undercover &apos;Ghost Squad&apos;, and other officers, deployed on major commuter routes. ",
                "Officers&apos; vehicles will carry special boxes, in which confiscated phones will be placed once they have been logged and sealed in protective pouches. They will then be stored in the traffic department&apos;s safe at Gallows Hill. ",
                "<strong>No fee is required to reclaim a confiscated phone.</strong> ",
                "The bylaw was introduced by safety and security Mayco member JP Smith, who has received praise from across the country for the action against drivers who continue to flout the law. ",
                "Smith said camera and video evidence would be used whenever possible to back up officers&apos; observations. ",
                "&#x201C;We&apos;re hoping that everybody will finally get the message, grab those hands-free kits and start using cellphones legally&#x201D;, Smith said. ",
                "&#x201C;We issue a minimum of 8000 fines a month for illegal cellphone use while driving. But it&apos;s not changing behaviour, so we must find a more powerful disincentive. ",
                "&#x201C;Illegal cellphone use is classified as &apos;distracted driving&apos; and is one of the four ost dangerous driving habits, with speeding, drinking and driving and not wearing seatbelts.&#x201D; - Cape Argus ",
                '<a href="http://www.iol.co.za/newsletters" target="_blank">Motoring newsletter - click here to keep up to speed with the best in motoring</a>',
                "<hr>",
            ],
            "images": [
                {
                    "id": "iol-mot-pic-jul5-cell-phone-impoundment-1-1.1335082/2802609312",
                    "credit": "INLSA",
                    "caption": "Taxi driver Jean-Benoit Biyoko was caught talking on his phone while driving in Long Street. Picture: Henk Kruger",
                    "url": "http://www.iol.co.za/polopoly_fs/iol-mot-pic-jul5-cell-phone-impoundment-1-1.1335082!/image/2802609312.jpg_gen/derivatives/box_300/2802609312.jpg",
                }
            ],
            "relatedStories": [
                {
                    "link": "http://www.iol.co.za/motoring/industry-news/cellphone-use-it-you-ll-lose-it-1.1333846",
                    "title": "Cellphone: Use it, you'll lose it!",
                },
                {
                    "link": "http://www.iol.co.za/news/south-africa/kwazulu-natal/kzn-drivers-could-also-lose-phones-1.1333884",
                    "title": "KZN drivers could also lose phones",
                },
            ],
            "description": "Cape Town cops have begun confiscating cellphones from drivers not using hands-free kits.",
        }

        doc = Document()
        doc.url = "http://www.iol.co.za/motoring/industry-news/cellphone-crackdown-first-busts-1.1335083#.UzvU562SxWs"

        self.crawler.extract(doc, None)

        # The expected body text is long; without this unittest truncates the
        # diff on failure and the mismatch cannot be read from the output.
        self.maxDiff = None

        self.assertEqual(doc.title, "Cellphone crackdown - first busts")
        self.assertEqual(
            doc.summary, "Cape Town cops have begun confiscating cellphones from drivers not using hands-free kits."
        )
        self.assertEqual(doc.published_at.strftime("%d %m %Y"), "05 07 2012")
        self.assertEqual(doc.author.name, "Murray Williams")
        self.assertEqual(doc.medium.name, "IOL")

        # One literal per entry of doc_info["paragraphs"], with markup removed
        # and entities decoded. Every piece ends with a blank-line separator,
        # including the last one: the final "<hr>" paragraph contributes no
        # text but the expected value still carries its separator.
        expected_text = (
            u"The first cellphone confiscation has taken place in Cape Town, as the new law was enforced for the first time.\n\n"
            u"As of today, drivers who are caught talking on their phones while driving, without headsets or hands-free kits, will have their handsets confiscated by traffic officers.\n\n"
            u"The first driver caught this morning was Jean-Benoit Biyoko, a taxi driver. Traffic officers nabbed him in Long Street in the CBD.\n\n"
            u"The traffic service's Maxine Jordaan reported: \u201cHe didn't have a driving licence on him and he was taken to Gallows Hill traffic department.\n\n"
            u"\u201cHis cellphone, a Nokia E63, had a SIM card and memory card in it, which he kept, and he kept his pouch too.\n\n"
            u"\u201cThe gentleman was very co-operative and said he was sorry.\u201d\n\n"
            u"Jordaan said Biyoko had been fined R500 for talking on his cellphone while driving and would be permitted to collect his phone 24 hours after it was confiscated - on Friday morning.\n\n"
            u"The new City law was to be enforced across the city this afternoon, with officers from the undercover 'Ghost Squad', and other officers, deployed on major commuter routes.\n\n"
            u"Officers' vehicles will carry special boxes, in which confiscated phones will be placed once they have been logged and sealed in protective pouches. They will then be stored in the traffic department's safe at Gallows Hill.\n\n"
            u"No fee is required to reclaim a confiscated phone.\n\n"
            u"The bylaw was introduced by safety and security Mayco member JP Smith, who has received praise from across the country for the action against drivers who continue to flout the law.\n\n"
            u"Smith said camera and video evidence would be used whenever possible to back up officers' observations.\n\n"
            u"\u201cWe're hoping that everybody will finally get the message, grab those hands-free kits and start using cellphones legally\u201d, Smith said.\n\n"
            u"\u201cWe issue a minimum of 8000 fines a month for illegal cellphone use while driving. But it's not changing behaviour, so we must find a more powerful disincentive.\n\n"
            u"\u201cIllegal cellphone use is classified as 'distracted driving' and is one of the four ost dangerous driving habits, with speeding, drinking and driving and not wearing seatbelts.\u201d - Cape Argus\n\n"
            u"Motoring newsletter - click here to keep up to speed with the best in motoring\n\n"
        )
        self.assertEqual(doc.text, expected_text)
    def test_extract(self):
        html = """

<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
    <head>
        <title>TimesLIVE - Print Article</title>
        <link rel="stylesheet" href="http://www.timeslive.co.za/template/common/css/print.css" type="text/css" media="print" />
        <link type="text/css" rel="stylesheet" href="http://www.timeslive.co.za/template/common/css/uniform.default.css" />
        <script type="text/javascript">
            function printpage()
            {
                window.print();
            }
        </script>
    </head>
    <body onload="printpage()">
        <a href="#" onclick="javascript:print();">
            Print this page
        </a><br />
        <div class="container">
            <div class="clear"></div>
            <div id="content">
                <div class="span-24 column">
                    <div class="articleheader">
                        <h1>IEC's Tlakula must resign: opposition parties</h1>
                        <div> Apr 1, 2014 | Sapa</div>
                        <h3>Several opposition parties met in Pretoria on Tuesday to discuss Public Protector Thuli Madonsela's finding on the Nkandla upgrades and the controversy around IEC chairwoman Pansy Tlakula.</h3>
                    </div>

                    <div class="articlerelated">
                        <div class="image">
        <img width="290px" title="" alt="" src="http://www.timeslive.co.za/migration_catalog/ST/2009/09/10/26869_499542.jpg/RESIZED/Small/26869_499542.jpg">
         IEC chairwoman Pansy Tlakula</div>
    <br/>
<div class="quote">
                                <h3>
                                    <span>"</span>
                                    <p> </p><span class="end">"</span>
                                </h3>
                                <div class="clear"></div>
                            </div>
                        </div>
                    <p>Chairman of the multi-party forum Bantu Holomisa, who also heads the United Democratic Movement (UDM), said the opposition parties resolved to push for Tlakula's resignation.</p><p>"All the political parties present, with the exception of the Democratic Alliance and the Freedom Front Plus, agreed that advocate Tlakula must resign immediately.</p><p>"Should she refuse to resign, the parties who are in agreement will pursue legal action," said Holomisa.</p><p>The forum included the African Christian Democratic Party, AgangSA, Azapo, Economic Freedom Fighters, FF Plus, Inkatha Freedom Party, United Christian Democratic Party, and Holomisa's UDM.</p><p>Regarding Nkandla, the parties resolved to convene another summit after President Jacob Zuma had reacted to Parliament as ordered by Madonsela.</p><br/>
                    <br/>
                    <center>
                        ~ o O o ~
                    </center>
                </div>
            </div>
        </div>
    </body>
</html>
"""

        doc = Document()
        doc.url = 'http://www.timeslive.co.za/politics/2014/04/01/iec-s-tlakula-must-resign-opposition-parties'
        self.crawler.extract(doc, html)

        self.assertEqual(doc.title,
                         u"IEC's Tlakula must resign: opposition parties")
        self.assertEqual(
            doc.summary,
            u"Several opposition parties met in Pretoria on Tuesday to discuss Public Protector Thuli Madonsela's finding on the Nkandla upgrades and the controversy around IEC chairwoman Pansy Tlakula."
        )
        self.assertEqual(doc.published_at.strftime('%d %m %Y'), '01 04 2014')
        self.assertEqual(doc.author.name, "Sapa")
        self.assertEqual(doc.medium.name, 'Times')

        self.assertEqual(
            doc.text,
            u'Several opposition parties met in Pretoria on Tuesday to discuss Public Protector Thuli Madonsela\'s finding on the Nkandla upgrades and the controversy around IEC chairwoman Pansy Tlakula.\n\nChairman of the multi-party forum Bantu Holomisa, who also heads the United Democratic Movement (UDM), said the opposition parties resolved to push for Tlakula\'s resignation.\n\n"All the political parties present, with the exception of the Democratic Alliance and the Freedom Front Plus, agreed that advocate Tlakula must resign immediately.\n\n"Should she refuse to resign, the parties who are in agreement will pursue legal action," said Holomisa.\n\nThe forum included the African Christian Democratic Party, AgangSA, Azapo, Economic Freedom Fighters, FF Plus, Inkatha Freedom Party, United Christian Democratic Party, and Holomisa\'s UDM.\n\nRegarding Nkandla, the parties resolved to convene another summit after President Jacob Zuma had reacted to Parliament as ordered by Madonsela.'
        )