def pipe_fetch(context=None, _INPUT=None, conf=None, **kwargs): """Fetches and parses one or more feeds to yield the feed entries. Keyword arguments: context -- pipeline context _INPUT -- not used conf: URL -- url Yields (_OUTPUT): feed entries """ conf = DotDict(conf) urls = util.listize(conf['URL']) for item in _INPUT: for item_url in urls: url = util.get_value(DotDict(item_url), DotDict(item), **kwargs) url = util.get_abspath(url) if not url: continue if context and context.verbose: print "pipe_fetch loading:", url parsed = feedparser.parse(urlopen(url).read()) for entry in util.gen_entries(parsed): yield entry if item.get('forever'): # _INPUT is pipeforever and not a loop, # so we just yield our item once break
def pipe_fetchsitefeed(context=None, _INPUT=None, conf=None, **kwargs): """A source that fetches and parses the first feed found on one or more sites. Loopable. Parameters ---------- context : pipe2py.Context object _INPUT : pipeforever pipe or an iterable of items or fields conf : URL -- url Yields ------ _OUTPUT : items """ conf = DotDict(conf) urls = utils.listize(conf['URL']) for item in _INPUT: for item_url in urls: url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs) url = utils.get_abspath(url) if context and context.verbose: print "pipe_fetchsitefeed loading:", url for link in autorss.getRSSLink(url.encode('utf-8')): parsed = speedparser.parse(urlopen(link).read()) for entry in utils.gen_entries(parsed): yield entry if item.get('forever'): # _INPUT is pipeforever and not a loop, # so we just yield our item once break
def parse_as_feed(markup):
    """Parse markup with speedparser; return the parse only if it has entries.

    Returns None (explicitly) when the document fails to parse or parses to
    a feed with no entries.
    """
    # TODO: do some fail-fast checks for rss-feedliness to avoid parsing
    # the whole document
    parsed = speedparser.parse(markup)
    # An entry-less parse is treated as "not a feed".
    if parsed and parsed.get('entries'):
        return parsed
    return None
def test_xmlns_space_support(self):
    """The co.atom fixture must parse cleanly (bozo == 0) with 3 entries."""
    from os import path
    # NOTE(review): removed leftover `import ipdb; ipdb.set_trace()` debug
    # breakpoint, which would hang any automated test run.
    with open(path.join(path.dirname(__file__), "test-feeds/co.atom")) as f:
        feed = f.read()
    res = parse(feed)
    # assertEqual yields an "actual vs expected" message on failure,
    # unlike assertTrue(a == b).
    self.assertEqual(res.bozo, 0)
    self.assertEqual(len(res.entries), 3)
def parse_feed_parallel(num, feed_options_item, all_links, queue, t_limit=None):
    """Parallel creation of a RSSItem for each post in the feed.

    :param num: The feed's number in the list. For DEBUG purposes
    :param feed_options_item: The RSS Feed options
    :param all_links: A set of all the links in the database
    :param queue: A Queue to store the resulting RSSPost objects
    :param t_limit: An integer used to limit the number of running threads
    """
    t1 = millis()
    # Read the feed XML and store it as a string
    try:
        xml = urllib.urlopen(feed_options_item.feed_url).read()
    except IOError:
        logger.error("Getting XML for feed %s failed. No posts from this feed will be processed" % feed_options_item.feed_url)
        return

    # SpeedParser is ~10 times faster than FeedParser
    d = speedparser.parse(xml, clean_html=False)
    t2 = millis()
    logger.debug("%d %s with %d posts, SpeedParser done in: %d ms"
                 % (num, feed_options_item.feed_url, len(d.entries), (t2 - t1)))

    # Bail out early on an empty feed; this also prevents the IndexError
    # that d.entries[0] below would raise for an empty feedburner feed.
    if not d.entries:
        return

    if 'feedburner' in feed_options_item.feed_url:
        # Get the host of the first original link
        http = urllib3.connection_from_url(
            d.entries[0].get("id", d.entries[0].link), maxsize=40, block=True)
    else:
        # Got maxsize=40 experimentally as best value
        http = urllib3.connection_from_url(
            feed_options_item.feed_url, maxsize=40, block=True)

    # Create a thread for each entry in the feed which is not present in
    # the database.
    threads = []
    for entry in d.entries:
        if 'feedproxy.google' in entry.link:  # FeedProxy workaround
            link = entry.get("id", entry.link)
        else:
            link = entry.link
        if link not in all_links:
            threads.append(threading.Thread(
                target=get_html3,
                args=(http, entry, feed_options_item, queue)))

    # Run threads depending on thread limit: start/join one batch of at most
    # t_limit threads at a time.
    if t_limit:
        for i in range(0, len(threads), t_limit):
            batch = threads[i:i + t_limit]
            for t in batch:
                t.start()
            for t in batch:
                t.join()
    else:
        # If t_limit is None, run all threads at once
        for t in threads:
            t.start()
        for t in threads:
            t.join()
def _test_speedparse():
    """Ad-hoc smoke test: fetch the live PyPI RSS feed and print the parsed
    result's top-level keys (expected: ['feed', 'bozo', 'version',
    'encoding', 'entries']).

    Requires network access; the leading underscore keeps automated test
    collectors from running it.  (The old docstring's doctest referenced a
    non-existent ``test_speedparse`` and could never have run.)
    """
    data = urlopen("http://pypi.python.org/pypi?%3Aaction=rss").read()
    print(parse(data).keys())
def test_text_heart_parser_error(self):
    """This is a placeholder test.  LXML punts because the title trips an
    unrecoverable parser error, and we have no way of cleaning it.  This
    would be a big issue, but FeedParser apparently cannot figure this out
    either as it breaks SAXParser."""
    import feedparser
    feed = """<?xml version="1.0" encoding="UTF-8"?><rss version="2"><channel><title><3</title><link>http://canoe.org.au</link><description>Latest News</description><language>en</language><ttl>480</ttl><pubDate>Sat, 21 Jan 2012 14:00:02 UTC</pubDate><item><title><3</title><link>http://canoe.org.au/default.asp?Page=23196</link><description>Kayak for Kids is a unique paddling challenge on beautiful Sydney Harbour for everyone from beginner to serious kayaker.</description><enclosure url="http://canoe.org.au/site/canoeing/image/fullsize/35576.jpg" type="image/jpeg" /><pubDate>Thu, 19 Jan 2012 14:00:00 UTC</pubDate><guid>http://canoe.org.au/default.asp?Page=23196</guid></item></channel></rss>"""
    # assertEqual reports actual vs expected on failure, unlike
    # assertTrue(a == b).
    self.assertEqual(parse(feed).bozo, 1)
    self.assertEqual(feedparser.parse(feed).bozo, 0)
def test_xmlns_space_support(self):
    """The co.atom fixture must parse without bozo and yield 3 entries."""
    from os import path
    # NOTE(review): dropped the `import ipdb` / `ipdb.set_trace()` debugger
    # leftover -- it stalls any non-interactive test run.
    with open(path.join(path.dirname(__file__), "test-feeds/co.atom")) as f:
        feed = f.read()
    res = parse(feed)
    self.assertEqual(res.bozo, 0)
    self.assertEqual(len(res.entries), 3)
def test_non_cleaned_title(self):
    """This tests for a bug where titles were not stripped of html despite
    a cleaner being supplied to speedparser."""
    from lxml.html.clean import Cleaner
    feed = '''<?xml version="1.0"?><feed xmlns="http://www.w3.org/2005/Atom"><title>scribble.yuyat.jp</title><link href="http://scribble.yuyat.jp/"/><link type="application/atom+xml" rel="self" href="http://scribble.yuyat.jp/atom.xml"/><updated>2012-01-08T18:34:39-08:00</updated><id>http://scribble.yuyat.jp/</id><author><name>Yuya Takeyama</name></author><entry><id>http://scribble.yuyat.jp/2012/01/07/this-is-just-a-scribble</id><link type="text/html" rel="alternate" href="http://scribble.yuyat.jp/2012/01/07/this-is-just-a-scribble.html"/><title>scribble 始めます <script>alert(1)</script></title><updated>2012-01-07T00:00:00-08:00</updated><author><name>Yuya Takeyama</name></author><content type="html"><p>今まで書いて来た <a href='http://blog.yuyat.jp/'>Born Too Late</a> の住み分けとしては, あっちがいろいろ調べてからまとめる用, こっちはもっと殴り書いていく感じにしたい.</p><div class='highlight'><pre><code class='ruby'><span class='lineno'>1</span> <span class='k'>class</span> <span class='nc'>Foo</span><span class='lineno'>2</span> <span class='k'>def</span> <span class='nf'>bar</span><span class='lineno'>3</span> <span class='ss'>:baz</span><span class='lineno'>4</span> <span class='k'>end</span><span class='lineno'>5</span> <span class='k'>end</span></code></pre></div></content></entry></feed>'''
    cleaner = Cleaner(comments=True, javascript=True, scripts=True,
                      safe_attrs_only=True, page_structure=True, style=True,
                      embedded=False, remove_tags=['body'])
    result = parse(feed, unix_timestamp=True, clean_html=cleaner)
    # assertNotIn/assertFalse give clearer failure output than
    # assertTrue('x' not in y) / assertTrue(not ...).
    self.assertNotIn('bozo_exception', result, str(result))
    for e in result.entries:
        # The <script> payload must have been cleaned out of the title...
        self.assertNotIn('alert(1)', e.title, e.title)
        # ...and the title must not be left wrapped in block markup.
        self.assertFalse(e.title.startswith('<p>'), e.title)
def do_feed(config):
    """Fetch the RSS/Atom feed named by config['url'], run it through the
    configured include/exclude/transform filters, and return the rebuilt
    RSS 2.0 document as a BytesIO (or None if the fetch failed).

    config keys read here: 'url' (feed URL), 'filter' (optional list of
    one-key dicts mapping a filter type to its rules).  'output' appears
    only in a commented-out debug print -- presumably a destination path
    used by the caller; confirm there.
    """
    try:
        if DEBUG:
            print("pulling url: {}".format(config['url']))
        req = requests.get(config['url'], timeout=20)
        if DEBUG:
            print("pulled")
        #print(req.content)
    except requests.exceptions.ReadTimeout:
        # Known-flaky hosts time out routinely; stay quiet for those.
        if 'bitmetv' not in config['url'] and 'portlandtribune' not in config['url']:
            print("URL timeout: " + config['url'])
        return
    except requests.exceptions.ConnectionError:
        # Same idea: baconbits connection failures are expected noise.
        if 'baconbits' not in config['url']:
            print("URL connection fail: " + config['url'])
        return
    except ssl.SSLError:
        print("SSL URL connection fail: " + config['url'])
        return
    feed = speedparser.parse(req.content, clean_html=True, encoding='UTF-8')
    entries = feed['entries']
    #print("entries: " + str(entries)[:100])
    # Apply each configured filter in order; each filterset is a one-key
    # dict {type: rules}.
    # NOTE(review): popitem() destructively empties the filterset dicts in
    # config, so a second do_feed() call with the same config object sees
    # no filters -- confirm whether config is rebuilt per call.
    for filterset in config.get('filter', []):
        filter_type, filter_rules = filterset.popitem()
        if filter_type == 'include':
            entries = filter_include(entries, filter_rules)
        elif filter_type == 'exclude':
            entries = filter_exclude(entries, filter_rules)
        elif filter_type == 'transform':
            #print "transforming, rules: " + str(filter_rules)
            #print("transforming, entries: " + str(entries)[:100])
            entries = transform(entries, filter_rules)
        else:
            raise Exception("can only handle include/exclude filter types. being asked to process %s" % filter_type)
    #pars = HTMLParser()
    items = []
    # convert the entries to RSSItems, build the list we'll stick in the RSS..
    for entry in entries:
        #print(html.unescape(entry.get('title', '').encode('utf-8')))
        # Text-bearing fields are HTML-unescaped; structured fields
        # (categories, enclosure, guid, dates, source) pass through as-is.
        item = PyRSS2Gen.RSSItem(
            title = html.unescape(entry.get('title', '')),
            link = html.unescape(entry.get('link', '')),
            description = html.unescape(entry.get('description', '')),
            author = html.unescape(entry.get('author', '')),
            categories = entry.get('categories'),
            comments = html.unescape(entry.get('comments', '')),
            enclosure = entry.get('enclosure'),
            guid = entry.get('guid'),
            pubDate = entry.get('pubDate'),
            source = entry.get('source'),
        )
        items.append(item)
    #print("xx", html.unescape(feed['feed'].get('title', '')))
    #print(html.unescape(feed['feed'].get('link', '')))
    #print(config['output'])
    # Rebuild the channel-level metadata from the parsed feed header.
    rss = PyRSS2Gen.RSS2(
        title = html.unescape(feed['feed'].get('title', '')),
        link = html.unescape(feed['feed'].get('link', '')),
        description = html.unescape(feed['feed'].get('description', '')),
        pubDate = feed['feed'].get('pubDate'),
        lastBuildDate = feed['feed'].get('lastBuildDate'),
        categories = feed['feed'].get('categories'),
        ttl = feed['feed'].get('ttl'),
        image = feed['feed'].get('image'),
        items = items
    )
    # Serialize to an in-memory file, rewound so the caller can read it.
    rssfile = BytesIO()
    rss.write_xml(rssfile)
    rssfile.seek(0)
    return rssfile
def test_nonetype_no_strip_regression(self):
    """This tests for a bug in 0.1.6 where the strip_outer_tag function
    would be called on None and raise an exception."""
    feed = """<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"><channel><title>Instapaper: Starred</title><link>http://www.instapaper.com/starred</link><description></description><item><title>Toronto News: Flipped Junction homes taken on a wild real estate ride ending in fraud allegations - thestar.com</title><link>http://www.thestar.com/news/article/1111810--flipped-junction-homes-taken-on-a-wild-real-estate-ride-ending-in-fraud-allegations</link><description></description><pubDate>Sat, 07 Jan 2012 18:46:18 EST</pubDate></item></channel></rss>"""
    # assertEqual reports the actual title when the comparison fails.
    self.assertEqual(parse(feed).feed.title, "Instapaper: Starred")
def test_invalid_entity_recovery(self):
    """A raw '&' (invalid entity) in the title must not bozo the parse."""
    feed = """<?xml version="1.0"?><rss xmlns:itunes="http://www.itunes.com/DTDs/Podcast-1.0.dtd" version="2.0"><channel><title>Faith Promise Church Podcast</title><description>Weekly message Podcast from Faith Promise Church. Faith Promise church is an exciting church located in Knoxville, Tennessee. For information about the church, please visit our website at faithpromise.org. We hope you enjoy and are blessed by our podcast.</description><link>http://faithpromise.org</link><language>en-us</language><item><title>T C & B (Taking Care of Busine</title><link>http://faithpromise.org/media/20111112-13.mp3</link><description>T C & B (Taking Care of Busine - Faith Promise Church Podcasts - Dr. Chris Stephens</description><pubDate>Mon, 14 Nov 2011 11:53:23 -0500</pubDate><enclosure url="http://faithpromise.org/media/20111112-13.mp3" length="36383475" type="audio/mpeg"/></item></channel></rss>"""
    # Parse once instead of twice (the original re-parsed the same feed
    # for the second assertion); assertEqual gives better failure output.
    result = parse(feed)
    self.assertEqual(result.bozo, 0)
    self.assertEqual(len(result.entries), 1)
def test_rdf_rss_090_support(self):
    """RDF/RSS 0.9 feeds (items outside <channel>) must parse cleanly."""
    feed = """<?xml version="1.0" encoding="utf-8"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://my.netscape.com/rdf/simple/0.9/"><channel><title>heise online News</title><link>http://www.heise.de/newsticker/</link><description>Nachrichten nicht nur aus der Welt der Computer</description></channel><item><title>Am 6. Juni ist World IPv6 Launch Day</title><link>http://www.heise.de/newsticker/meldung/Am-6-Juni-ist-World-IPv6-Launch-Day-1415071.html/from/rss09</link><description>Am 6. Juni 2012 veranstaltet die Internet Society den IPv6 World Launch Day, an dem teilnehmende Internet Service Provider, Netzwerkhersteller und Service-Anbieter dauerhaft IPv6 schalten werden.</description></item></rdf:RDF>"""
    # Parse once rather than re-parsing for the second assertion.
    result = parse(feed)
    self.assertEqual(result.bozo, 0)
    self.assertEqual(len(result.entries), 1)
def test_support_rss_version_2_no_zero(self):
    """version="2" (no ".0") must be accepted as RSS 2.x without bozo."""
    feed = """<?xml version="1.0" encoding="UTF-8"?><rss version="2"><channel><title>Australian Canoeing</title><link>http://canoe.org.au</link><description>Latest News</description><language>en</language><ttl>480</ttl><pubDate>Sat, 21 Jan 2012 14:00:02 UTC</pubDate><item><title>Lifestart Kayak for Kids 2012</title><link>http://canoe.org.au/default.asp?Page=23196</link><description>Kayak for Kids is a unique paddling challenge on beautiful Sydney Harbour for everyone from beginner to serious kayaker.</description><enclosure url="http://canoe.org.au/site/canoeing/image/fullsize/35576.jpg" type="image/jpeg" /><pubDate>Thu, 19 Jan 2012 14:00:00 UTC</pubDate><guid>http://canoe.org.au/default.asp?Page=23196</guid></item></channel></rss>"""
    # Single parse; assertEqual for informative failures.
    result = parse(feed)
    self.assertEqual(result.bozo, 0)
    self.assertEqual(len(result.entries), 1)
def test_detect_charsets(self):
    """With encoding=True, a feed whose declared encoding doesn't match its
    bytes (note the deliberately mangled characters) must still parse."""
    feed = """<?xml version="1.0" encoding="UTF-8"?> <rss xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0"><channel><title>"ismb2010" - Google Blogs�gning</title><link>http://www.google.com/search?hl=da&lr=&q=%22ismb2010%22&ie=utf-8&tbm=blg</link><description>S�geresultaterne <b>1</b> - <b>10</b> ud af ca. <b>59</b> for <b>&quot;ismb2010&quot;</b>.</description><opensearch:totalResults>59</opensearch:totalResults><opensearch:startIndex>1</opensearch:startIndex><opensearch:itemsPerPage>10</opensearch:itemsPerPage><item><title>Beyond DNA: <b>ISMB2010</b> Boston</title><link>http://xiazheng.blogspot.com/2010/07/ismb2010-boston.html</link><description>ISMB of this year was held at Boston on July 10-14. I&#39;m so happy to meet big guys in Bioinformatics whose papers I have ever read, especially Dr. Ratsch from MPI. One information this conference delivered this year is that <b>...</b></description><dc:publisher>Beyond DNA</dc:publisher><dc:creator>Zheng Xia</dc:creator><dc:date>Sat, 17 Jul 2010 13:56:00 GMT</dc:date></item></channel></rss>"""
    # Parse once (the original ran the charset-detecting parse twice).
    result = parse(feed, encoding=True)
    self.assertEqual(result.bozo, 0)
    self.assertEqual(len(result.entries), 1)
def add(http, publisherid): total = 0 try: request = urllib2.Request(http) print "ok" request.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0") readhttp = urllib2.urlopen(request, timeout=10).read() except: return 0 chard = chardet.detect(readhttp) if chard["encoding"] == u"GB18030": fp = feedparser.parse(readhttp) print "by feedparser" else: fp = speedparser.parse(readhttp, clean_html=False) print "by speedparser" # print ' OK' # print ' OK' text = os.path.split(os.path.realpath(sys.argv[0]))[0] text = text + "/db/reader.db" con = sqlite3.connect(text) con.text_factory = str cur = con.cursor() try: cur.execute( "CREATE TABLE reader(id integer primary key autoincrement,title,link,description,content,time,publisherid)" ) except: {} index = 0 # print fp.entries for entry in fp.entries: try: t = entry.published except: try: t = entry.updated except: t = time.strftime("%Y-%m-%d %X", time.localtime(time.time())) print "error" title = entry.title link = entry.link try: description = entry.description except: description = "" try: content = entry.content[0]["value"] except: content = "" if isextist(link, publisherid): con.execute( "insert into reader(title,link,description,content,time,publisherid) values(?,?,?,?,?,?)", (title, link, description, content, t, publisherid), ) print entry.title total = total + 1 con.commit() index = index + 1 cur.close() con.close() return total
def test_unix_timestamp_failure(self):
    """This tests for a bug where a non-existant timestamp is used to
    create a unix timestamp (from None) and throws an exception."""
    feed = '<?xml version="1.0" encoding="UTF-8"?> \n<rss version="2.0"\n\txmlns:content="http://purl.org/rss/1.0/modules/content/"\n\txmlns:wfw="http://wellformedweb.org/CommentAPI/"\n\txmlns:dc="http://purl.org/dc/elements/1.1/"\n\txmlns:atom="http://www.w3.org/2005/Atom"\n\txmlns:sy="http://purl.org/rss/1.0/modules/syndication/"\n\txmlns:slash="http://purl.org/rss/1.0/modules/slash/"\n\t> \n \n<channel>\n<title>betamax - Svpply</title> \n\t<link>http://svpply.com</link> \n\t<description>Svpply is a retail bookmarking and recommendation service.</description> \n\t<lastBuildDate>1323107774</lastBuildDate> \n\t<language>en</language> \n\t<sy:updatePeriod>hourly</sy:updatePeriod> \n\t<sy:updateFrequency>1</sy:updateFrequency> \n\t</channel> \n</rss>'
    result = parse(feed, unix_timestamp=True)
    # assertNotIn is the idiomatic form of assertTrue(x not in y).
    self.assertNotIn('bozo_exception', result, str(result))
def parse_feed_parallel(num, feed_options_item, all_links, queue, t_limit=None):
    """Parallel creation of a RSSItem for each post in the feed.

    :param num: The feed's number in the list. For DEBUG purposes
    :param feed_options_item: The RSS Feed options
    :param all_links: A set of all the links in the database
    :param queue: A Queue to store the resulting RSSPost objects
    :param t_limit: An integer used to limit the number of running threads
    """
    t1 = millis()
    # Read the feed XML and store it as a string
    try:
        a = urllib.urlopen(feed_options_item.feed_url).read()
    except IOError:
        logger.error(
            "Getting XML for feed %s failed. No posts from this feed will be processed"
            % feed_options_item.feed_url)
        return
    d = speedparser.parse(
        a, clean_html=False)  # SpeedParser is ~10 times faster than FeedParser
    t2 = millis()
    logger.debug("%d %s with %d posts, SpeedParser done in: %d ms" %
                 (num, feed_options_item.feed_url, len(d.entries), (t2 - t1)))

    # Nothing to do for an empty feed; this also prevents the IndexError
    # that d.entries[0] below would raise for an empty feedburner feed.
    if not d.entries:
        return

    # Create a thread for each entry in the feed which is not present in the database
    threads = []
    if 'feedburner' in feed_options_item.feed_url:
        # Get the host of the first original link
        http = urllib3.connection_from_url(
            d.entries[0].get("id", d.entries[0].link), maxsize=40, block=True)
    else:
        # Got maxsize=40 experimentally as best value
        http = urllib3.connection_from_url(
            feed_options_item.feed_url, maxsize=40, block=True)

    # Fill threads list
    for entry in d.entries:
        if 'feedproxy.google' in entry.link:  # FeedProxy workaround
            if entry.get("id", entry.link) not in all_links:
                threads.append(
                    threading.Thread(target=get_html3,
                                     args=(http, entry, feed_options_item,
                                           queue)))
        else:
            if entry.link not in all_links:
                threads.append(
                    threading.Thread(target=get_html3,
                                     args=(http, entry, feed_options_item,
                                           queue)))

    # Run threads depending on thread limit: start then join each batch of
    # at most t_limit threads.
    if t_limit:
        for i in range(0, len(threads), t_limit):
            for j in range(min(t_limit, len(threads) - i)):
                threads[i + j].start()
            for j in range(min(t_limit, len(threads) - i)):
                threads[i + j].join()
    else:
        # If t_limit is None, run all threads at once
        for t in threads:
            t.start()
        for t in threads:
            t.join()