def eval(self, name):
    # read the test case
    try:
        testcase = open(testfiles % name)
        data = testcase.read()
        description, expect = self.desc_re.search(data).groups()
        testcase.close()
    except:
        raise RuntimeError, "can't parse %s" % name

    # parse and reconstitute to a string
    work = StringIO.StringIO()
    results = feedparser.parse(data)
    scrub(testfiles % name, results)
    reconstitute(results, results.entries[0]).writexml(work)

    # verify the results
    results = feedparser.parse(work.getvalue().encode('utf-8'))
    if 'illegal' not in name:
        self.assertFalse(results.bozo, 'xml is well formed')
    if not self.simple_re.match(expect):
        self.assertTrue(eval(expect, results.entries[0]), expect)
    else:
        lhs, rhs = self.simple_re.match(expect).groups()
        self.assertEqual(eval(rhs), eval(lhs, results.entries[0]))

def test_scrub_xmlbase(self):
    base = feedparser.parse(feed)
    self.assertEqual('http://example.com/', base.entries[0].title_detail.base)

    config.parser.readfp(StringIO.StringIO(configData))
    config.parser.set('testfeed', 'xml_base', 'feed_alternate')
    data = deepcopy(base)
    scrub('testfeed', data)
    self.assertEqual('http://example.com/feed/',
        data.entries[0].title_detail.base)

    config.parser.set('testfeed', 'xml_base', 'entry_alternate')
    data = deepcopy(base)
    scrub('testfeed', data)
    self.assertEqual('http://example.com/entry/1/',
        data.entries[0].title_detail.base)

    config.parser.set('testfeed', 'xml_base', 'base/')
    data = deepcopy(base)
    scrub('testfeed', data)
    self.assertEqual('http://example.com/base/',
        data.entries[0].title_detail.base)

    config.parser.set('testfeed', 'xml_base', 'http://example.org/data/')
    data = deepcopy(base)
    scrub('testfeed', data)
    self.assertEqual('http://example.org/data/',
        data.entries[0].title_detail.base)

def test_spiderFeedUpdatedEntries(self):
    config.load(configfile)
    self.spiderFeed(testfeed % '4')
    self.assertEqual(2, len(glob.glob(workdir + "/*")))
    data = feedparser.parse(workdir +
        '/planet.intertwingly.net,2006,testfeed4')
    self.assertEqual(u'three', data.entries[0].content[0].value)

def test_scrub_future(self):
    base = feedparser.parse(feed)
    self.assertEqual(1, len(base.entries))
    self.assertTrue('updated' in base.entries[0].keys())

    config.parser.readfp(StringIO.StringIO(configData))
    config.parser.set('testfeed', 'future_dates', 'ignore_date')
    data = deepcopy(base)
    scrub('testfeed', data)
    self.assertFalse('updated' in data.entries[0].keys())

    config.parser.set('testfeed', 'future_dates', 'ignore_entry')
    data = deepcopy(base)
    scrub('testfeed', data)
    self.assertEqual(0, len(data.entries))

def test_scrub_future(self):
    base = feedparser.parse(feed)
    self.assertEqual(1, len(base.entries))
    self.assertTrue(base.entries[0].has_key("updated"))

    config.parser.readfp(StringIO.StringIO(configData))
    config.parser.set("testfeed", "future_dates", "ignore_date")
    data = deepcopy(base)
    scrub("testfeed", data)
    self.assertFalse(data.entries[0].has_key("updated"))

    config.parser.set("testfeed", "future_dates", "ignore_entry")
    data = deepcopy(base)
    scrub("testfeed", data)
    self.assertEqual(0, len(data.entries))

def fetch(self):
    info = feedparser.parse(self.url)
    if info.has_key("status"):
        self.url_status = str(info.status)
    elif info.has_key("entries") and len(info.entries) > 0:
        self.url_status = str(200)
    elif info.bozo and info.bozo_exception.__class__.__name__ == 'Timeout':
        self.url_status = str(408)
    else:
        self.url_status = str(500)
    self.info = info
    self.last_updated = self.updated
    self.updated = time.gmtime()
    logging.info("Fetch Complete(%s) for %s", self.url_status, self.url)

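
# A minimal sketch (not part of the original module) of the feedparser result
# fields that fetch() above keys off: 'status' is only present when the feed
# was retrieved over HTTP, 'bozo'/'bozo_exception' report parse or transport
# problems, and 'entries' is always a list. The inline feed text is made up.
import feedparser

info = feedparser.parse('<feed xmlns="http://www.w3.org/2005/Atom">'
                        '<entry><title>hi</title></entry></feed>')
print info.has_key('status')    # False: parsed from a string, not over HTTP
print info.bozo                 # 0 when the XML is well formed
print len(info.entries)         # 1
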
def test_scrub_type(self):
    base = feedparser.parse(feed)
    self.assertEqual('Föo', base.feed.author_detail.name)

    config.parser.readfp(StringIO.StringIO(configData))
    data = deepcopy(base)
    scrub('testfeed', data)
    self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
    self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)
    self.assertEqual('F\xc3\xb6o', data.entries[0].source.author_detail.name)
    self.assertEqual('text/html', data.entries[0].title_detail.type)
    self.assertEqual('text/html', data.entries[0].summary_detail.type)
    self.assertEqual('text/html', data.entries[0].content[0].type)

def test_scrub_ignore(self):
    base = feedparser.parse(feed)
    self.assertTrue(base.entries[0].has_key('id'))
    self.assertTrue(base.entries[0].has_key('updated'))
    self.assertTrue(base.entries[0].has_key('updated_parsed'))
    self.assertTrue(base.entries[0].summary_detail.has_key('language'))

    config.parser.readfp(StringIO.StringIO(configData))
    config.parser.set('testfeed', 'ignore_in_feed', 'id updated xml:lang')
    data = deepcopy(base)
    scrub('testfeed', data)
    self.assertFalse(data.entries[0].has_key('id'))
    self.assertFalse(data.entries[0].has_key('updated'))
    self.assertFalse(data.entries[0].has_key('updated_parsed'))
    self.assertFalse(data.entries[0].summary_detail.has_key('language'))

def verify_spiderPlanet(self):
    files = glob.glob(workdir + "/*")

    # verify that exactly thirteen files + 1 source dir were produced
    self.assertEqual(14, len(files))

    # verify that the file names are as expected
    self.assertTrue(os.path.join(workdir,
        'planet.intertwingly.net,2006,testfeed1,1') in files)
    self.assertTrue(os.path.join(workdir,
        'planet.intertwingly.net,2006,testfeed2,1') in files)

    data = feedparser.parse(workdir +
        '/planet.intertwingly.net,2006,testfeed3,1')
    self.assertEqual(['application/rss+xml'], [link.type
        for link in data.entries[0].source.links if link.rel == 'self'])
    self.assertEqual('three', data.entries[0].source.author_detail.name)
    self.assertEqual('three', data.entries[0].source['planet_css-id'])

def verify_spiderFeed(self):
    files = glob.glob(workdir + "/*")
    files.sort()

    # verify that exactly four files + one sources dir were produced
    self.assertEqual(5, len(files))

    # verify that the file names are as expected
    self.assertTrue(os.path.join(workdir,
        'planet.intertwingly.net,2006,testfeed1,1') in files)

    # verify that the file timestamps match atom:updated
    data = feedparser.parse(files[2])
    self.assertEqual(['application/atom+xml'], [link.type
        for link in data.entries[0].source.links if link.rel == 'self'])
    self.assertEqual('one', data.entries[0].source.planet_name)
    self.assertEqual('2006-01-03T00:00:00Z', data.entries[0].updated)
    self.assertEqual(os.stat(files[2]).st_mtime,
        calendar.timegm(data.entries[0].updated_parsed))

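
# A minimal sketch, assuming the cache layout used by the tests above: the
# spider stamps each cache file's mtime from the entry's atom:updated, and
# feedparser's *_parsed fields are UTC time.struct_time values, so
# calendar.timegm() converts them to epoch seconds. The cache path below is
# hypothetical.
import calendar, os
from planet import feedparser

cache_file = 'cache/planet.example.com,2006,entry1'      # hypothetical entry
data = feedparser.parse(cache_file)
when = calendar.timegm(data.entries[0].updated_parsed)   # struct_time -> epoch
os.utime(cache_file, (when, when))                       # stamp mtime from atom:updated
# ...so verify_spiderFeed() can later compare os.stat(cache_file).st_mtime
# against calendar.timegm(entry.updated_parsed).
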
def test_scrub_ignore(self):
    base = feedparser.parse(feed)
    self.assertTrue(base.entries[0].has_key("author"))
    self.assertTrue(base.entries[0].has_key("author_detail"))
    self.assertTrue(base.entries[0].has_key("id"))
    self.assertTrue(base.entries[0].has_key("updated"))
    self.assertTrue(base.entries[0].has_key("updated_parsed"))
    self.assertTrue(base.entries[0].summary_detail.has_key("language"))

    config.parser.readfp(StringIO.StringIO(configData))
    config.parser.set("testfeed", "ignore_in_feed", "author id updated xml:lang")
    data = deepcopy(base)
    scrub("testfeed", data)
    self.assertFalse(data.entries[0].has_key("author"))
    self.assertFalse(data.entries[0].has_key("author_detail"))
    self.assertFalse(data.entries[0].has_key("id"))
    self.assertFalse(data.entries[0].has_key("updated"))
    self.assertFalse(data.entries[0].has_key("updated_parsed"))
    self.assertFalse(data.entries[0].summary_detail.has_key("language"))

def test_scrub_ignore(self):
    base = feedparser.parse(feed)
    self.assertTrue('author' in base.entries[0].keys())
    self.assertTrue('author_detail' in base.entries[0].keys())
    self.assertTrue('id' in base.entries[0].keys())
    self.assertTrue('updated' in base.entries[0].keys())
    self.assertTrue('updated_parsed' in base.entries[0].keys())
    self.assertTrue('language' in base.entries[0].summary_detail.keys())

    config.parser.readfp(StringIO.StringIO(configData))
    config.parser.set('testfeed', 'ignore_in_feed', 'author id updated xml:lang')
    data = deepcopy(base)
    scrub('testfeed', data)
    self.assertFalse('author' in data.entries[0].keys())
    self.assertFalse('author_detail' in data.entries[0].keys())
    self.assertFalse('id' in data.entries[0].keys())
    self.assertFalse('updated' in data.entries[0].keys())
    self.assertFalse('updated_parsed' in data.entries[0].keys())
    self.assertFalse('language' in data.entries[0].summary_detail.keys())

def verify_date(self, id, expected_date=None):
    file = os.path.join(workdir, id)

    # verify that the file exists
    self.assertTrue(os.path.exists(file), msg=file)

    data = feedparser.parse(file)

    # verify published & updated dates are in sync and match expected
    self.assertEqual(data.entries[0].updated, data.entries[0].published)

    # verify mtime is in sync
    self.assertEqual(time.gmtime(os.stat(file).st_mtime),
        data.entries[0].updated_parsed)
    self.assertEqual(time.gmtime(os.stat(file).st_mtime),
        data.entries[0].published_parsed)

    # verify the hardcoded expectation, if one was given
    if expected_date is not None:
        self.assertEqual(expected_date, data.entries[0].updated)

    return data.entries[0].updated

def run(script, doc, output_file=None, options={}):
    """ process a Genshi template """

    context = Context(**options)

    tmpl_fileobj = open(script)
    tmpl = MarkupTemplate(tmpl_fileobj, script)
    tmpl_fileobj.close()

    if not output_file:
        # filter
        context.push({'input': XMLParser(StringIO(doc))})
    else:
        # template
        import time
        from planet import config, feedparser
        from planet.spider import filename

        # gather a list of subscriptions, feeds
        global subscriptions
        feeds = []
        sources = config.cache_sources_directory()
        for sub in config.subscriptions():
            data = feedparser.parse(filename(sources, sub))
            data.feed.config = norm(dict(config.parser.items(sub)))
            if data.feed.has_key('link'):
                feeds.append((data.feed.config.get('name', ''), data.feed))
            subscriptions.append(norm(sub))
        feeds.sort()

        # annotate each entry
        new_date_format = config.new_date_format()
        vars = feedparser.parse(StringIO(doc))
        vars.feeds = [value for name, value in feeds]
        last_feed = None
        last_date = None
        for entry in vars.entries:
            entry.source.config = find_config(config, entry.source)

            # add new_feed and new_date fields
            entry.new_feed = entry.source.id
            entry.new_date = date = None
            if entry.has_key('published_parsed'): date = entry.published_parsed
            if entry.has_key('updated_parsed'): date = entry.updated_parsed
            if date: entry.new_date = time.strftime(new_date_format, date)

            # remove new_feed and new_date fields if not "new"
            if entry.new_date == last_date:
                entry.new_date = None
                if entry.new_feed == last_feed:
                    entry.new_feed = None
                else:
                    last_feed = entry.new_feed
            elif entry.new_date:
                last_date = entry.new_date
                last_feed = None

            # add streams for all text constructs
            for key in entry.keys():
                if key.endswith("_detail") and entry[key].has_key('type') and \
                    entry[key].has_key('value'):
                    streamify(entry[key], entry.source.planet_bozo)
            if entry.has_key('content'):
                for content in entry.content:
                    streamify(content, entry.source.planet_bozo)

        # add cumulative feed information to the Genshi context
        vars.feed.config = dict(config.parser.items('Planet', True))

        context.push(vars)

    # apply template
    output = tmpl.generate(context).render('xml')

    if output_file:
        out_file = open(output_file, 'w')
        out_file.write(output)
        out_file.close()
    else:
        return output

def spiderFeed(self, feed_uri):
    feed_info = feedparser.parse('<feed/>')
    data = feedparser.parse(feed_uri)
    writeCache(feed_uri, feed_info, data)

# load config files (default: config.ini)
for arg in sys.argv[1:]:
    config.load(arg)
if len(sys.argv) == 1:
    config.load('config.ini')

from Queue import Queue
from threading import Thread

# determine which subscriptions have no icon but do have a html page
fetch_queue = Queue()
html = ['text/html', 'application/xhtml+xml']
sources = config.cache_sources_directory()
for sub in config.subscriptions():
    data = feedparser.parse(filename(sources, sub))
    if data.feed.get('icon'): continue
    if not data.feed.get('links'): continue
    for link in data.feed.links:
        if link.rel == 'alternate' and link.type in html:
            fetch_queue.put((sub, link.href))
            break

# find the favicon for a given webpage
def favicon(page):
    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(urlopen(page))
    favicon = urljoin(page, '/favicon.ico')
    for link in doc.getElementsByTagName('link'):
        if link.hasAttribute('rel') and link.hasAttribute('href'):

    parser.set('Planet', 'template_files', 'themes/common/atom.xml.xslt')
else:
    parser.set('Planet', 'template_files', 'tests/data/reconstitute.xslt')

for name, value in zip(sys.argv[2::2], sys.argv[3::2]):
    parser.set(sys.argv[1], name.lstrip('-'), value)

from planet import config
config.parser = parser

from planet import spider
spider.spiderPlanet(only_if_new=False)

from planet import feedparser
for source in glob.glob(os.path.join(work, 'sources/*')):
    feed = feedparser.parse(source).feed
    if feed.has_key('title'):
        config.parser.set('Planet', 'name', feed.title_detail.value)
    if feed.has_key('link'):
        config.parser.set('Planet', 'link', feed.link)
    if feed.has_key('author_detail'):
        if feed.author_detail.has_key('name'):
            config.parser.set('Planet', 'owner_name', feed.author_detail.name)
        if feed.author_detail.has_key('email'):
            config.parser.set('Planet', 'owner_email', feed.author_detail.email)

from planet import splice
doc = splice.splice()

sources = doc.getElementsByTagName('planet:source')
if hide_planet_ns and len(sources) == 1:

def run(script, doc, output_file=None, options={}):
    """ process a Genshi template """

    context = Context(**options)

    tmpl_fileobj = open(script)
    tmpl = MarkupTemplate(tmpl_fileobj, script)
    tmpl_fileobj.close()

    if not output_file:
        # filter
        context.push({'input': XMLParser(StringIO(doc))})
    else:
        # template
        import time
        from planet import config, feedparser
        from planet.spider import filename

        # gather a list of subscriptions, feeds
        global subscriptions
        feeds = []
        sources = config.cache_sources_directory()
        for sub in config.subscriptions():
            data = feedparser.parse(filename(sources, sub))
            data.feed.config = norm(dict(config.parser.items(sub)))
            if data.feed.has_key('link'):
                feeds.append((data.feed.config.get('name', ''), data.feed))
            subscriptions.append(norm(sub))
        feeds.sort()

        # annotate each entry
        new_date_format = config.new_date_format()
        vars = feedparser.parse(StringIO(doc))
        vars.feeds = [value for name, value in feeds]
        last_feed = None
        last_date = None
        for entry in vars.entries:
            entry.source.config = find_config(config, entry.source)

            # add new_feed and new_date fields
            if 'id' in entry.source:
                entry.new_feed = entry.source.id
            else:
                entry.new_feed = None
            entry.new_date = date = None
            if entry.has_key('published_parsed'): date = entry.published_parsed
            if entry.has_key('updated_parsed'): date = entry.updated_parsed
            if date: entry.new_date = time.strftime(new_date_format, date)

            # remove new_feed and new_date fields if not "new"
            if entry.new_date == last_date:
                entry.new_date = None
                if entry.new_feed == last_feed:
                    entry.new_feed = None
                else:
                    last_feed = entry.new_feed
            elif entry.new_date:
                last_date = entry.new_date
                last_feed = None

            # add streams for all text constructs
            for key in entry.keys():
                if key.endswith("_detail") and entry[key].has_key('type') and \
                    entry[key].has_key('value'):
                    streamify(entry[key], entry.source.planet_bozo)
            if entry.has_key('content'):
                for content in entry.content:
                    streamify(content, entry.source.planet_bozo)

        # add cumulative feed information to the Genshi context
        vars.feed.config = dict(config.parser.items('Planet', True))

        context.push(vars)

    # apply template
    output = tmpl.generate(context).render('xml')

    if output_file:
        out_file = open(output_file, 'w')
        out_file.write(output)
        out_file.close()
    else:
        return output

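
# A minimal, self-contained sketch of the Genshi calls run() above relies on:
# build a MarkupTemplate, push variables through a Context, and render the
# resulting stream as XML. The inline template and the 'title' variable are
# made up for illustration and are not part of the planet templates.
from genshi.template import Context, MarkupTemplate

tmpl = MarkupTemplate('<feed xmlns:py="http://genshi.edgewall.org/">'
                      '<title py:content="title"/></feed>')
context = Context(title='Example Planet')       # hypothetical template variable
print tmpl.generate(context).render('xml')      # <feed><title>Example Planet</title></feed>
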
def template_info(source):
    """ get template information from a feedparser output """

    # wire in support for planet:source, call feedparser, unplug planet:source
    mixin = feedparser._FeedParserMixin
    mixin._start_planet_source = mixin._start_source
    mixin._end_planet_source = \
        new.instancemethod(_end_planet_source, None, mixin)
    data = feedparser.parse(source)
    del mixin._start_planet_source
    del mixin._end_planet_source

    # apply rules to convert feed parser output to htmltmpl input
    output = {'Channels': [], 'Items': []}
    output.update(tmpl_mapper(data.feed, Base))
    sources = [(source.get('planet_name', None), source)
        for source in data.feed.get('sources', [])]
    sources.sort()
    for name, feed in sources:
        output['Channels'].append(tmpl_mapper(feed, Base))
    for entry in data.entries:
        output['Items'].append(tmpl_mapper(entry, Items))

    # synthesize isPermaLink attribute
    for item in output['Items']:
        if item.get('id') == item.get('link'):
            item['guid_isPermaLink'] = 'true'
        else:
            item['guid_isPermaLink'] = 'false'

    # feed level information
    output['generator'] = config.generator_uri()
    output['name'] = config.name()
    output['link'] = config.link()
    output['owner_name'] = config.owner_name()
    output['owner_email'] = config.owner_email()
    if config.feed():
        output['feed'] = config.feed()
        output['feedtype'] = config.feed().find('rss') >= 0 and 'rss' or 'atom'

    # date/time information
    date = time.gmtime()
    output['date'] = PlanetDate(date)
    output['date_iso'] = Rfc3399(date)
    output['date_822'] = Rfc822(date)

    # remove new_dates and new_channels that aren't "new"
    date = channel = None
    for item in output['Items']:
        if item.has_key('new_date'):
            if item['new_date'] == date:
                del item['new_date']
            else:
                date = item['new_date']
        if item.has_key('new_channel'):
            if item['new_channel'] == channel and not item.has_key('new_date'):
                del item['new_channel']
            else:
                channel = item['new_channel']

    return output

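
# The isPermaLink synthesis above follows the RSS 2.0 convention: a guid whose
# value is also the entry's URL may be flagged isPermaLink="true". A tiny
# restatement of that rule with made-up values:
item = {'id': 'http://example.com/blog/1', 'link': 'http://example.com/blog/1'}
item['guid_isPermaLink'] = item.get('id') == item.get('link') and 'true' or 'false'
print item['guid_isPermaLink']   # true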