def test_expunge(self):
    config.load(configfile)

    # create test entries in cache with correct timestamp
    for entry in glob.glob(testentries):
        e = minidom.parse(entry)
        e.normalize()
        eid = e.getElementsByTagName('id')
        eupdated = e.getElementsByTagName('updated')
        if not eid or not eupdated: continue
        efile = filename(workdir, eid[0].childNodes[0].nodeValue)
        eupdated = eupdated[0].childNodes[0].nodeValue
        emtime = time.mktime(feedparser._parse_date_w3dtf(eupdated))
        shutil.copyfile(entry, efile)
        os.utime(efile, (emtime, emtime))

    # create test feeds in cache
    sources = config.cache_sources_directory()
    for feed in glob.glob(testfeeds):
        f = minidom.parse(feed)
        f.normalize()
        fid = f.getElementsByTagName('id')
        if not fid: continue
        ffile = filename(sources, fid[0].childNodes[0].nodeValue)
        shutil.copyfile(feed, ffile)

    # verify that exactly nine entries + one source dir were produced
    files = glob.glob(workdir + "/*")
    self.assertEqual(10, len(files))

    # verify that exactly four feeds were produced in source dir
    files = glob.glob(sources + "/*")
    self.assertEqual(4, len(files))

    # expunge...
    expungeCache()

    # verify that five entries and one source dir are left
    files = glob.glob(workdir + "/*")
    self.assertEqual(6, len(files))

    # verify that the right five entries are left
    self.assertTrue(os.path.join(workdir,
        'bzr.mfd-consult.dk,2007,venus-expunge-test1,1') in files)
    self.assertTrue(os.path.join(workdir,
        'bzr.mfd-consult.dk,2007,venus-expunge-test2,1') in files)
    self.assertTrue(os.path.join(workdir,
        'bzr.mfd-consult.dk,2007,venus-expunge-test3,3') in files)
    self.assertTrue(os.path.join(workdir,
        'bzr.mfd-consult.dk,2007,venus-expunge-test4,2') in files)
    self.assertTrue(os.path.join(workdir,
        'bzr.mfd-consult.dk,2007,venus-expunge-test4,3') in files)
from StringIO import StringIO

from genshi.input import XMLParser
from genshi.template import Context, MarkupTemplate

# norm(), find_config(), streamify() and the subscriptions list are helpers
# defined elsewhere in this module.

def run(script, doc, output_file=None, options={}):
    """ process a Genshi template """

    context = Context(**options)

    tmpl_fileobj = open(script)
    tmpl = MarkupTemplate(tmpl_fileobj, script)
    tmpl_fileobj.close()

    if not output_file:
        # filter
        context.push({'input': XMLParser(StringIO(doc))})
    else:
        # template
        import time
        from planet import config, feedparser
        from planet.spider import filename

        # gather a list of subscriptions, feeds
        global subscriptions
        feeds = []
        sources = config.cache_sources_directory()
        for sub in config.subscriptions():
            data = feedparser.parse(filename(sources, sub))
            data.feed.config = norm(dict(config.parser.items(sub)))
            if data.feed.has_key('link'):
                feeds.append((data.feed.config.get('name', ''), data.feed))
            subscriptions.append(norm(sub))
        feeds.sort()

        # annotate each entry
        new_date_format = config.new_date_format()
        vars = feedparser.parse(StringIO(doc))
        vars.feeds = [value for name, value in feeds]
        last_feed = None
        last_date = None
        for entry in vars.entries:
            entry.source.config = find_config(config, entry.source)

            # add new_feed and new_date fields
            if 'id' in entry.source:
                entry.new_feed = entry.source.id
            else:
                entry.new_feed = None
            entry.new_date = date = None
            if entry.has_key('published_parsed'): date = entry.published_parsed
            if entry.has_key('updated_parsed'): date = entry.updated_parsed
            if date: entry.new_date = time.strftime(new_date_format, date)

            # remove new_feed and new_date fields if not "new"
            if entry.new_date == last_date:
                entry.new_date = None
                if entry.new_feed == last_feed:
                    entry.new_feed = None
                else:
                    last_feed = entry.new_feed
            elif entry.new_date:
                last_date = entry.new_date
                last_feed = None

            # add streams for all text constructs
            for key in entry.keys():
                if key.endswith("_detail") and entry[key].has_key('type') and \
                   entry[key].has_key('value'):
                    streamify(entry[key], entry.source.planet_bozo)
            if entry.has_key('content'):
                for content in entry.content:
                    streamify(content, entry.source.planet_bozo)

        # add cumulative feed information to the Genshi context
        vars.feed.config = dict(config.parser.items('Planet', True))

        context.push(vars)

    # apply template
    output = tmpl.generate(context).render('xml')

    if output_file:
        out_file = open(output_file, 'w')
        out_file.write(output)
        out_file.close()
    else:
        return output
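# ---------------------------------------------------------------------------
# Usage sketch (not part of the module above): how run() might be invoked in
# its two modes. The template paths, document path and option names below are
# illustrative assumptions, not files shipped with the project; a populated
# cache and a loaded planet config are assumed for template mode.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from planet import config
    config.load('config.ini')                     # planet config must be loaded first
    doc = open('cache/digest.xml').read()         # hypothetical aggregated feed document

    # template mode: output_file given, the rendered page is written to disk
    run('themes/classic/index.html.genshi', doc,
        output_file='output/index.html', options={'title': 'Example Planet'})

    # filter mode: no output_file, the transformed XML comes back as a string
    print run('filters/passthrough.genshi', doc)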
import sys

# sys, urlopen/urljoin and the planet imports are not in this excerpt as
# flattened; they are added here so every name below resolves.
from urllib import urlopen
from urlparse import urljoin

from planet import config, feedparser
from planet.spider import filename

from html5lib import html5parser, treebuilders
from ConfigParser import ConfigParser

# load config files (default: config.ini)
for arg in sys.argv[1:]:
    config.load(arg)
if len(sys.argv) == 1:
    config.load('config.ini')

from Queue import Queue
from threading import Thread

# determine which subscriptions have no icon but do have an HTML page
fetch_queue = Queue()
html = ['text/html', 'application/xhtml+xml']
sources = config.cache_sources_directory()
for sub in config.subscriptions():
    data = feedparser.parse(filename(sources, sub))
    if data.feed.get('icon'): continue
    if not data.feed.get('links'): continue
    for link in data.feed.links:
        if link.rel == 'alternate' and link.type in html:
            fetch_queue.put((sub, link.href))
            break

# find the favicon for a given webpage
def favicon(page):
    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(urlopen(page))
    favicon = urljoin(page, '/favicon.ico')
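# ---------------------------------------------------------------------------
# The excerpt above stops mid-function: favicon() computes only the
# /favicon.ico fallback, and nothing ever drains fetch_queue. Below is a
# minimal sketch of how the remainder could look, continuing the script above;
# the <link rel="icon"> scan and the worker loop are assumptions, not the
# project's actual continuation.
# ---------------------------------------------------------------------------
from Queue import Empty

def favicon_sketch(page):
    # parse the page and prefer an explicit <link rel="icon"> over the fallback
    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(urlopen(page))
    icon = urljoin(page, '/favicon.ico')          # default fallback
    for link in doc.getElementsByTagName('link'):
        rel = link.getAttribute('rel').lower().split()
        if ('icon' in rel or 'shortcut' in rel) and link.getAttribute('href'):
            icon = urljoin(page, link.getAttribute('href'))
    return icon

def worker():
    # pull (subscription, page) pairs until the queue is empty
    while True:
        try:
            sub, page = fetch_queue.get(block=False)
        except Empty:
            return
        try:
            print sub, favicon_sketch(page)
        except IOError:
            pass                                  # unreachable page; skip it

threads = [Thread(target=worker) for i in range(4)]
for t in threads: t.start()
for t in threads: t.join()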