Example #1
def fetch_news(rss_entry):
    """Download the article behind rss_entry.url, strip scripts, run the
    source-specific parser and wrap the result in a News record.
    Returns the News object, or None if fetching or parsing fails."""
    import sanitizer
    from dbfrontend import News
    from urllib2 import urlopen
    from newsparse import NewsParserFactory
    sanitizer = sanitizer.Sanitizer()  # rebinds the module name to an instance
    npf = NewsParserFactory()

    try:
        print "fetch_news " + rss_entry.url
        connection = urlopen(rss_entry.url, timeout=60)
        parser = npf.new(rss_entry.url)
        # fall back to UTF-8 if the server sends no charset header
        encoding = connection.headers.getparam('charset') or 'utf-8'
        content = connection.read().decode(encoding)
        content = sanitizer.remove_js(content)
        # content = sanitizer.remove_quotes(content)
        content = parser.parse(content)
        news = News(title=rss_entry.title,
                    body=content,
                    url=rss_entry.url,
                    date=rss_entry.date)
        return news
    except Exception, e:
        # LOG HERE
        print "fetch_news exception"
        print e
        return None
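
fetch_news only touches the url, title and date attributes of its argument, so any object exposing those three fields will do. A minimal calling sketch, assuming a hypothetical RSSEntry container that is not part of the original code:

from collections import namedtuple
from datetime import datetime

# hypothetical stand-in for the real rss_entry objects; only the
# attributes fetch_news actually reads (url, title, date) are provided
RSSEntry = namedtuple("RSSEntry", ["url", "title", "date"])

entry = RSSEntry(url="http://www.rp.pl/artykul/706292,768323.html",
                 title="Example headline",
                 date=datetime.now())
news = fetch_news(entry)  # a News record on success, None if anything failed
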
Example #2
        elif "tvn24.pl" in link:
            return TVN24Parser()
        elif "rp.pl" in link or "rp0Bpl" in link or "ekonomia240Bpl" in link:
            return RPParser()
        if "gazeta" in link:
            return GazetaParser()
        else:
            raise UnknownSourceException(link)

if __name__ == "__main__":
    from urllib2 import urlopen
    import sanitizer
    urls = ["http://gazeta.pl.feedsportal.com/c/32739/f/612804/s/1ada18a6/l/0L0Ssport0Bpl0Csport0C10H650A250H10A7987750HPlywanie0I0IMistrzostwa0IEuropy0Iw0ISzczecinie0Ina0I250Imetrowym0Bhtml/story01.htm",
"http://rss.feedsportal.com/c/32536/f/482351/s/1ada66c2/l/0L0Srp0Bpl0Cartykul0C2324970H7683220Bhtml/story01.htm",
"http://www.rp.pl/artykul/706292,768323.html",
"http://www.rp.pl/artykul/69991,767665.html",
"http://wiadomosci.gazeta.pl/wiadomosci/1,114881,10792683,Rottweilery_zagryzly_w_Czechach_swoja_wlascicielke.html?utm_source=RSS&utm_medium=RSS&utm_campaign=10199882",
"http://www.tvn24.pl/12692,1727370,0,1,polacy-biedniejsi-niz-przed-rokiem,wiadomosc.html",
"http://wiadomosci.onet.pl/kraj/represjonowani-w-stanie-wojennym-spotkali-sie-w-il,1,4963280,wiadomosc.html",
"http://wiadomosci.gazeta.pl/wiadomosci/1,114881,10793499,W_redakcji__Nowej_Gaziety__nie_dzialaja_telefony_i.html?utm_source=RSS&utm_medium=RSS&utm_campaign=10199882",
"http://www.tvn24.pl/12692,1727379,0,1,pakt-fiskalny-przelamie-kryzys-damy-rade,wiadomosc.html"]
    npf = NewsParserFactory()
    sanitizer = sanitizer.Sanitizer()
    for url in urls:
        connection = urlopen(url)
        parser = npf.new(url)
        # fall back to UTF-8 if the server sends no charset header
        encoding = connection.headers.getparam('charset') or 'utf-8'
        content = connection.read().decode(encoding)
        content = sanitizer.remove_js(content)
        print("\n\n" + url + "\n" + parser.parse(content))
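
Example #2 begins in the middle of the factory's dispatch method. A minimal sketch of how that fragment could sit inside a complete NewsParserFactory; the parser classes and UnknownSourceException below are assumed stubs standing in for the real implementations in newsparse, not the original code:

class UnknownSourceException(Exception):
    pass

class TVN24Parser(object):
    def parse(self, content):
        return content  # stub; the real parser extracts the article body

class RPParser(object):
    def parse(self, content):
        return content  # stub

class GazetaParser(object):
    def parse(self, content):
        return content  # stub

class NewsParserFactory(object):
    def new(self, link):
        # dispatch on substrings of the (possibly feedsportal-encoded) URL
        if "tvn24.pl" in link:
            return TVN24Parser()
        elif "rp.pl" in link or "rp0Bpl" in link or "ekonomia240Bpl" in link:
            return RPParser()
        elif "gazeta" in link:
            return GazetaParser()
        else:
            raise UnknownSourceException(link)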