def fetch_news(rss_entry):
    """Download, sanitize and parse the article behind an RSS entry.

    The page at ``rss_entry.url`` is fetched and decoded, JavaScript is
    removed with ``sanitizer.Sanitizer.remove_js`` and the result is handed
    to the parser that ``NewsParserFactory.new`` selects for the URL.  The
    parsed text becomes the ``body`` of a ``dbfrontend.News`` object built
    together with the entry's ``title``, ``url`` and ``date``.

    Any exception raised while fetching, decoding, parsing or building the
    ``News`` object is printed and swallowed.  Import errors of the helper
    modules are not caught and propagate to the caller.

    Args:
        rss_entry: object exposing ``url``, ``title`` and ``date``.

    Returns:
        Always ``None``.
        NOTE(review): the ``News`` object is created but never handed back;
        presumably constructing it is what stores the article -- confirm
        against ``dbfrontend``.
    """
    import sanitizer
    from dbfrontend import News
    from urllib2 import urlopen
    from newsparse import NewsParserFactory

    # Separate name so the ``sanitizer`` module is not shadowed by its instance.
    cleaner = sanitizer.Sanitizer()
    npf = NewsParserFactory()
    try:
        print("fetch_news " + rss_entry.url)
        # Resolve the parser first: the factory only needs the URL, so a
        # failure here costs no network round-trip (up to 60 s on timeout).
        parser = npf.new(rss_entry.url)
        connection = urlopen(rss_entry.url, timeout=60)
        try:
            # getparam() returns None when the Content-Type header carries no
            # charset; decode(None) would raise TypeError and the article
            # would be dropped, so fall back to UTF-8 in that case.
            encoding = connection.headers.getparam('charset') or 'utf-8'
            content = connection.read().decode(encoding)
        finally:
            # Release the socket even when reading or decoding fails.
            connection.close()
        content = cleaner.remove_js(content)
        # NOTE: Sanitizer.remove_quotes() was commented out in the original
        # and is still not applied.
        content = parser.parse(content)
        News(title=rss_entry.title, body=content,
             url=rss_entry.url, date=rss_entry.date)
    except Exception as e:
        # TODO: replace these prints with real logging.
        print("fetch_news exception")
        print(e)
    return None
def fetch_news(rss_entry):
    """Download, sanitize and parse the article behind an RSS entry.

    The page at ``rss_entry.url`` is fetched and decoded, JavaScript is
    removed with ``sanitizer.Sanitizer.remove_js`` and the result is handed
    to the parser that ``NewsParserFactory.new`` selects for the URL.  The
    parsed text becomes the ``body`` of a ``dbfrontend.News`` object built
    together with the entry's ``title``, ``url`` and ``date``.

    Any exception raised while fetching, decoding, parsing or building the
    ``News`` object is printed and swallowed.  Import errors of the helper
    modules are not caught and propagate to the caller.

    Args:
        rss_entry: object exposing ``url``, ``title`` and ``date``.

    Returns:
        Always ``None``.
        NOTE(review): the ``News`` object is created but never handed back;
        presumably constructing it is what stores the article -- confirm
        against ``dbfrontend``.
    """
    import sanitizer
    from dbfrontend import News
    from urllib2 import urlopen
    from newsparse import NewsParserFactory

    # Separate name so the ``sanitizer`` module is not shadowed by its instance.
    cleaner = sanitizer.Sanitizer()
    npf = NewsParserFactory()
    try:
        print("fetch_news " + rss_entry.url)
        # Resolve the parser first: the factory only needs the URL, so a
        # failure here costs no network round-trip (up to 60 s on timeout).
        parser = npf.new(rss_entry.url)
        connection = urlopen(rss_entry.url, timeout=60)
        try:
            # getparam() returns None when the Content-Type header carries no
            # charset; decode(None) would raise TypeError and the article
            # would be dropped, so fall back to UTF-8 in that case.
            encoding = connection.headers.getparam('charset') or 'utf-8'
            content = connection.read().decode(encoding)
        finally:
            # Release the socket even when reading or decoding fails.
            connection.close()
        content = cleaner.remove_js(content)
        # NOTE: Sanitizer.remove_quotes() was commented out in the original
        # and is still not applied.
        content = parser.parse(content)
        News(title=rss_entry.title, body=content,
             url=rss_entry.url, date=rss_entry.date)
    except Exception as e:
        # TODO: replace these prints with real logging.
        print("fetch_news exception")
        print(e)
    return None
elif "tvn24.pl" in link: return TVN24Parser() elif "rp.pl" in link or "rp0Bpl" in link or "ekonomia240Bpl" in link: return RPParser() if "gazeta" in link: return GazetaParser() else: raise UnknownSourceException(link) if __name__=="__main__": from urllib2 import urlopen import sanitizer urls = ["http://gazeta.pl.feedsportal.com/c/32739/f/612804/s/1ada18a6/l/0L0Ssport0Bpl0Csport0C10H650A250H10A7987750HPlywanie0I0IMistrzostwa0IEuropy0Iw0ISzczecinie0Ina0I250Imetrowym0Bhtml/story01.htm", "http://rss.feedsportal.com/c/32536/f/482351/s/1ada66c2/l/0L0Srp0Bpl0Cartykul0C2324970H7683220Bhtml/story01.htm", "http://www.rp.pl/artykul/706292,768323.html", "http://www.rp.pl/artykul/69991,767665.html", "http://wiadomosci.gazeta.pl/wiadomosci/1,114881,10792683,Rottweilery_zagryzly_w_Czechach_swoja_wlascicielke.html?utm_source=RSS&utm_medium=RSS&utm_campaign=10199882", "http://www.tvn24.pl/12692,1727370,0,1,polacy-biedniejsi-niz-przed-rokiem,wiadomosc.html", "http://wiadomosci.onet.pl/kraj/represjonowani-w-stanie-wojennym-spotkali-sie-w-il,1,4963280,wiadomosc.html", "http://wiadomosci.gazeta.pl/wiadomosci/1,114881,10793499,W_redakcji__Nowej_Gaziety__nie_dzialaja_telefony_i.html?utm_source=RSS&utm_medium=RSS&utm_campaign=10199882", "http://www.tvn24.pl/12692,1727379,0,1,pakt-fiskalny-przelamie-kryzys-damy-rade,wiadomosc.html"] npf = NewsParserFactory() sanitizer = sanitizer.Sanitizer() for url in urls: connection = urlopen(url) parser = npf.new(url) encoding = connection.headers.getparam('charset') content = connection.read().decode(encoding) content = sanitizer.remove_js(content) print("\n\n" + url + "\n" + parser.parse(content))