    def eval(self, name):
        # read the test case
        try:
            testcase = open(testfiles % name)
            data = testcase.read()
            description, expect = self.desc_re.search(data).groups()
            testcase.close()
        except:
            raise RuntimeError("can't parse %s" % name)

        # parse and reconstitute to a string
        work = StringIO.StringIO()
        results = feedparser.parse(data)
        scrub(testfiles % name, results)
        reconstitute(results, results.entries[0]).writexml(work)

        # verify the results
        results = feedparser.parse(work.getvalue().encode('utf-8'))
        if 'illegal' not in name:
            self.assertFalse(results.bozo, 'xml is well formed')
        if not self.simple_re.match(expect):
            self.assertTrue(eval(expect, results.entries[0]), expect)
        else:
            lhs, rhs = self.simple_re.match(expect).groups()
            self.assertEqual(eval(rhs), eval(lhs, results.entries[0]))
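The helper above relies on a testfiles path template and two regular expressions (desc_re, simple_re) on the test class, none of which appear in this excerpt. A minimal sketch of what such fixtures could look like; the class name and values here are purely illustrative, not taken from the project:

import re
import unittest

# hypothetical path template, filled in with the test case name
testfiles = 'tests/data/reconstitute/%s.xml'

class ReconstituteTest(unittest.TestCase):
    # matches "<!-- Description: ... Expect: ... -->" comments in the test data
    desc_re = re.compile(r'Description:\s*(.*?)\s*Expect:\s*(.*?)\s*-->')
    # splits a simple "lhs == rhs" expectation so both sides can be evaluated
    simple_re = re.compile(r'^(.*?) == (.*)$')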
Example #3
    def test_scrub_xmlbase(self):
        base = feedparser.parse(feed)
        self.assertEqual('http://example.com/',
                         base.entries[0].title_detail.base)

        config.parser.readfp(StringIO.StringIO(configData))
        config.parser.set('testfeed', 'xml_base', 'feed_alternate')
        data = deepcopy(base)
        scrub('testfeed', data)
        self.assertEqual('http://example.com/feed/',
                         data.entries[0].title_detail.base)

        config.parser.set('testfeed', 'xml_base', 'entry_alternate')
        data = deepcopy(base)
        scrub('testfeed', data)
        self.assertEqual('http://example.com/entry/1/',
                         data.entries[0].title_detail.base)

        config.parser.set('testfeed', 'xml_base', 'base/')
        data = deepcopy(base)
        scrub('testfeed', data)
        self.assertEqual('http://example.com/base/',
                         data.entries[0].title_detail.base)

        config.parser.set('testfeed', 'xml_base', 'http://example.org/data/')
        data = deepcopy(base)
        scrub('testfeed', data)
        self.assertEqual('http://example.org/data/',
                         data.entries[0].title_detail.base)
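The scrub tests in this collection assume two fixtures that are not shown: a raw feed document and an ini-style configData string with a [testfeed] section that each test then overrides via config.parser.set. A hypothetical minimal shape for them (the real test data is richer; this only shows what the tests rely on):

# hypothetical fixtures for the scrub tests
feed = open('tests/data/scrub-feed.xml').read()   # placeholder file name

configData = """
[testfeed]
name = Test Feed
# per-test options such as xml_base, future_dates and ignore_in_feed are
# injected with config.parser.set(...) before scrub() is called
"""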
Example #5
    def test_spiderFeedUpdatedEntries(self):
        config.load(configfile)
        self.spiderFeed(testfeed % '4')
        self.assertEqual(2, len(glob.glob(workdir+"/*")))
        data = feedparser.parse(workdir +
            '/planet.intertwingly.net,2006,testfeed4')
        self.assertEqual(u'three', data.entries[0].content[0].value)
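The spider tests above and below use module-level constants (configfile, workdir, testfeed) defined elsewhere in the suite. Hypothetical values, for orientation only:

# hypothetical module-level constants assumed by the spider tests
configfile = 'tests/data/spider/config.ini'        # planet configuration used by the tests
workdir    = 'tests/work/spider/cache'             # cache directory the spider writes into
testfeed   = 'tests/data/spider/testfeed%s.atom'   # path template filled with a feed number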
Example #6
    def test_scrub_future(self):
        base = feedparser.parse(feed)
        self.assertEqual(1, len(base.entries))
        self.assertTrue('updated' in base.entries[0].keys())

        config.parser.readfp(StringIO.StringIO(configData))
        config.parser.set('testfeed', 'future_dates', 'ignore_date')
        data = deepcopy(base)
        scrub('testfeed', data)
        self.assertFalse('updated' in data.entries[0].keys())

        config.parser.set('testfeed', 'future_dates', 'ignore_entry')
        data = deepcopy(base)
        scrub('testfeed', data)
        self.assertEqual(0, len(data.entries))
    def test_scrub_future(self):
        base = feedparser.parse(feed)
        self.assertEqual(1, len(base.entries))
        self.assertTrue(base.entries[0].has_key("updated"))

        config.parser.readfp(StringIO.StringIO(configData))
        config.parser.set("testfeed", "future_dates", "ignore_date")
        data = deepcopy(base)
        scrub("testfeed", data)
        self.assertFalse(data.entries[0].has_key("updated"))

        config.parser.set("testfeed", "future_dates", "ignore_entry")
        data = deepcopy(base)
        scrub("testfeed", data)
        self.assertEqual(0, len(data.entries))
    def test_scrub_future(self):
        base = feedparser.parse(feed)
        self.assertEqual(1, len(base.entries))
        self.assertTrue(base.entries[0].has_key('updated'))

        config.parser.readfp(StringIO.StringIO(configData))
        config.parser.set('testfeed', 'future_dates', 'ignore_date')
        data = deepcopy(base)
        scrub('testfeed', data)
        self.assertFalse(data.entries[0].has_key('updated'))

        config.parser.set('testfeed', 'future_dates', 'ignore_entry')
        data = deepcopy(base)
        scrub('testfeed', data)
        self.assertEqual(0, len(data.entries))
  def fetch(self):
    info = feedparser.parse(self.url)
    if info.has_key("status"):
      self.url_status = str(info.status)
    elif info.has_key("entries") and len(info.entries)>0:
      self.url_status = str(200)
    elif info.bozo and info.bozo_exception.__class__.__name__=='Timeout':
      self.url_status = str(408)
    else:
      self.url_status = str(500)

    self.info = info
    self.last_updated = self.updated
    self.updated = time.gmtime()
    logging.info("Fetch Complete(%s) for %s", self.url_status, self.url)
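fetch() above collapses a feedparser result into an HTTP-style status string. For reference, these are the fields it inspects on the parse result; the URL below is only a placeholder:

import feedparser

info = feedparser.parse('http://example.com/feed.atom')   # placeholder URL
status  = info.get('status')    # present only when the feed was fetched over HTTP
broken  = info.bozo             # truthy when the document was ill-formed or the fetch failed
entries = info.entries          # list of parsed entries, possibly empty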
    def test_scrub_type(self):
        base = feedparser.parse(feed)

        self.assertEqual('Föo', base.feed.author_detail.name)

        config.parser.readfp(StringIO.StringIO(configData))
        data = deepcopy(base)
        scrub('testfeed', data)

        self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
        self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)
        self.assertEqual('F\xc3\xb6o', data.entries[0].source.author_detail.name)

        self.assertEqual('text/html', data.entries[0].title_detail.type)
        self.assertEqual('text/html', data.entries[0].summary_detail.type)
        self.assertEqual('text/html', data.entries[0].content[0].type)
Example #12
    def test_scrub_ignore(self):
        base = feedparser.parse(feed)

        self.assertTrue(base.entries[0].has_key('id'))
        self.assertTrue(base.entries[0].has_key('updated'))
        self.assertTrue(base.entries[0].has_key('updated_parsed'))
        self.assertTrue(base.entries[0].summary_detail.has_key('language'))

        config.parser.readfp(StringIO.StringIO(configData))
        config.parser.set('testfeed', 'ignore_in_feed', 'id updated xml:lang')
        data = deepcopy(base)
        scrub('testfeed', data)

        self.assertFalse(data.entries[0].has_key('id'))
        self.assertFalse(data.entries[0].has_key('updated'))
        self.assertFalse(data.entries[0].has_key('updated_parsed'))
        self.assertFalse(data.entries[0].summary_detail.has_key('language'))
Example #13
    def verify_spiderPlanet(self):
        files = glob.glob(workdir+"/*")

        # verify that the expected number of files (plus the sources dir) was produced
        self.assertEqual(14, len(files))

        # verify that the file names are as expected
        self.assertTrue(os.path.join(workdir,
            'planet.intertwingly.net,2006,testfeed1,1') in files)
        self.assertTrue(os.path.join(workdir,
            'planet.intertwingly.net,2006,testfeed2,1') in files)

        data = feedparser.parse(workdir + 
            '/planet.intertwingly.net,2006,testfeed3,1')
        self.assertEqual(['application/rss+xml'], [link.type
            for link in data.entries[0].source.links if link.rel=='self'])
        self.assertEqual('three', data.entries[0].source.author_detail.name)
        self.assertEqual('three', data.entries[0].source['planet_css-id'])
Example #14
    def verify_spiderFeed(self):
        files = glob.glob(workdir+"/*")
        files.sort()

        # verify that exactly four files + one sources dir were produced
        self.assertEqual(5, len(files))

        # verify that the file names are as expected
        self.assertTrue(os.path.join(workdir,
            'planet.intertwingly.net,2006,testfeed1,1') in files)

        # verify that the file timestamps match atom:updated
        data = feedparser.parse(files[2])
        self.assertEqual(['application/atom+xml'], [link.type
            for link in data.entries[0].source.links if link.rel=='self'])
        self.assertEqual('one', data.entries[0].source.planet_name)
        self.assertEqual('2006-01-03T00:00:00Z', data.entries[0].updated)
        self.assertEqual(os.stat(files[2]).st_mtime,
            calendar.timegm(data.entries[0].updated_parsed))
    def test_scrub_ignore(self):
        base = feedparser.parse(feed)

        self.assertTrue(base.entries[0].has_key("author"))
        self.assertTrue(base.entries[0].has_key("author_detail"))
        self.assertTrue(base.entries[0].has_key("id"))
        self.assertTrue(base.entries[0].has_key("updated"))
        self.assertTrue(base.entries[0].has_key("updated_parsed"))
        self.assertTrue(base.entries[0].summary_detail.has_key("language"))

        config.parser.readfp(StringIO.StringIO(configData))
        config.parser.set("testfeed", "ignore_in_feed", "author id updated xml:lang")
        data = deepcopy(base)
        scrub("testfeed", data)

        self.assertFalse(data.entries[0].has_key("author"))
        self.assertFalse(data.entries[0].has_key("author_detail"))
        self.assertFalse(data.entries[0].has_key("id"))
        self.assertFalse(data.entries[0].has_key("updated"))
        self.assertFalse(data.entries[0].has_key("updated_parsed"))
        self.assertFalse(data.entries[0].summary_detail.has_key("language"))
Example #16
    def test_scrub_ignore(self):
        base = feedparser.parse(feed)

        self.assertTrue('author' in base.entries[0].keys())
        self.assertTrue('author_detail' in base.entries[0].keys())
        self.assertTrue('id' in base.entries[0].keys())
        self.assertTrue('updated' in base.entries[0].keys())
        self.assertTrue('updated_parsed' in base.entries[0].keys())
        self.assertTrue('language' in base.entries[0].summary_detail.keys())

        config.parser.readfp(StringIO.StringIO(configData))
        config.parser.set('testfeed', 'ignore_in_feed',
                          'author id updated xml:lang')
        data = deepcopy(base)
        scrub('testfeed', data)

        self.assertFalse('author' in data.entries[0].keys())
        self.assertFalse('author_detail' in data.entries[0].keys())
        self.assertFalse('id' in data.entries[0].keys())
        self.assertFalse('updated' in data.entries[0].keys())
        self.assertFalse('updated_parsed' in data.entries[0].keys())
        self.assertFalse('language' in data.entries[0].summary_detail.keys())
    def verify_date(self, id, expected_date=None):

        file = os.path.join(workdir, id)

        # verify that the file exists
        self.assertTrue(os.path.exists(file), msg=file)

        data = feedparser.parse(file)

        # verify published & updated dates are in sync

        self.assertEqual(data.entries[0].updated, data.entries[0].published)

        # verify mtime is in sync
        self.assertEqual(time.gmtime(os.stat(file).st_mtime), data.entries[0].updated_parsed)
        self.assertEqual(time.gmtime(os.stat(file).st_mtime), data.entries[0].published_parsed)

        # verify the hardcoded expectation, if one was supplied
        if expected_date is not None:
            self.assertEqual(expected_date, data.entries[0].updated)

        return data.entries[0].updated
    def test_scrub_xmlbase(self):
        base = feedparser.parse(feed)
        self.assertEqual("http://example.com/", base.entries[0].title_detail.base)

        config.parser.readfp(StringIO.StringIO(configData))
        config.parser.set("testfeed", "xml_base", "feed_alternate")
        data = deepcopy(base)
        scrub("testfeed", data)
        self.assertEqual("http://example.com/feed/", data.entries[0].title_detail.base)

        config.parser.set("testfeed", "xml_base", "entry_alternate")
        data = deepcopy(base)
        scrub("testfeed", data)
        self.assertEqual("http://example.com/entry/1/", data.entries[0].title_detail.base)

        config.parser.set("testfeed", "xml_base", "base/")
        data = deepcopy(base)
        scrub("testfeed", data)
        self.assertEqual("http://example.com/base/", data.entries[0].title_detail.base)

        config.parser.set("testfeed", "xml_base", "http://example.org/data/")
        data = deepcopy(base)
        scrub("testfeed", data)
        self.assertEqual("http://example.org/data/", data.entries[0].title_detail.base)
Example #19
def run(script, doc, output_file=None, options={}):
    """ process an Genshi template """

    context = Context(**options)

    tmpl_fileobj = open(script)
    tmpl = MarkupTemplate(tmpl_fileobj, script)
    tmpl_fileobj.close()

    if not output_file: 
        # filter
        context.push({'input':XMLParser(StringIO(doc))})
    else:
        # template
        import time
        from planet import config,feedparser
        from planet.spider import filename

        # gather a list of subscriptions, feeds
        global subscriptions
        feeds = []
        sources = config.cache_sources_directory()
        for sub in config.subscriptions():
            data=feedparser.parse(filename(sources,sub))
            data.feed.config = norm(dict(config.parser.items(sub)))
            if data.feed.has_key('link'):
                feeds.append((data.feed.config.get('name',''),data.feed))
            subscriptions.append(norm(sub))
        feeds.sort()

        # annotate each entry
        new_date_format = config.new_date_format()
        vars = feedparser.parse(StringIO(doc))
        vars.feeds = [value for name,value in feeds]
        last_feed = None
        last_date = None
        for entry in vars.entries:
             entry.source.config = find_config(config, entry.source)

             # add new_feed and new_date fields
             entry.new_feed = entry.source.id
             entry.new_date = date = None
             if entry.has_key('published_parsed'): date=entry.published_parsed
             if entry.has_key('updated_parsed'): date=entry.updated_parsed
             if date: entry.new_date = time.strftime(new_date_format, date)

             # remove new_feed and new_date fields if not "new"
             if entry.new_date == last_date:
                 entry.new_date = None
                 if entry.new_feed == last_feed:
                     entry.new_feed = None
                 else:
                     last_feed = entry.new_feed
             elif entry.new_date:
                 last_date = entry.new_date
                 last_feed = None

             # add streams for all text constructs
             for key in entry.keys():
                 if key.endswith("_detail") and entry[key].has_key('type') and \
                     entry[key].has_key('value'):
                     streamify(entry[key],entry.source.planet_bozo)
             if entry.has_key('content'):
                 for content in entry.content:
                     streamify(content,entry.source.planet_bozo)
     
        # add cumulative feed information to the Genshi context
        vars.feed.config = dict(config.parser.items('Planet',True))
        context.push(vars)

    # apply template
    output=tmpl.generate(context).render('xml')

    if output_file:
        out_file = open(output_file,'w')
        out_file.write(output)
        out_file.close()
    else:
        return output
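run() above is used in two modes. A hypothetical invocation of each; the template and file names are placeholders:

xml_doc = open('cache/planet.xml').read()                 # placeholder input document

# filter mode: no output file, the transformed XML is returned as a string
filtered = run('filters/mytransform.genshi', xml_doc)

# template mode: feed data is gathered, annotated and pushed into the Genshi
# context, and the rendered page is written to the output file
run('themes/mytheme/index.html.genshi', xml_doc, 'output/index.html', {})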
Example #20
    def spiderFeed(self, feed_uri):
        feed_info = feedparser.parse('<feed/>')
        data = feedparser.parse(feed_uri)
        writeCache(feed_uri, feed_info, data)
Example #21
# load config files (default: config.ini)
for arg in sys.argv[1:]:
    config.load(arg)
if len(sys.argv) == 1:
    config.load('config.ini')

from Queue import Queue
from threading import Thread

# determine which subscriptions have no icon but do have an HTML page
fetch_queue = Queue()
html = ['text/html', 'application/xhtml+xml']
sources = config.cache_sources_directory()
for sub in config.subscriptions():
    data = feedparser.parse(filename(sources, sub))
    if data.feed.get('icon'): continue
    if not data.feed.get('links'): continue
    for link in data.feed.links:
        if link.rel == 'alternate' and link.type in html:
            fetch_queue.put((sub, link.href))
            break
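The Queue and Thread imports above belong to a consumer of fetch_queue that is not part of this excerpt. A minimal sketch of what such a worker could look like; how the discovered icons are stored is left out, and favicon() is the helper defined just below:

# hypothetical consumer for fetch_queue; threads would only be started after
# favicon() below has been defined
icons = {}

def worker():
    while True:
        sub, page = fetch_queue.get()
        if page is None:
            break                       # sentinel: shut the thread down
        try:
            icons[sub] = favicon(page)
        except Exception:
            pass                        # skip pages that cannot be fetched
        fetch_queue.task_done()

# e.g. started later with:
#   for i in range(4):
#       Thread(target=worker).start()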


# find the favicon for a given webpage
def favicon(page):
    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(urlopen(page))
    favicon = urljoin(page, '/favicon.ico')
    for link in doc.getElementsByTagName('link'):
        if link.hasAttribute('rel') and link.hasAttribute('href'):
            # the excerpt breaks off here; a plausible reconstruction of the
            # rest of the function: any rel value containing "icon" points at
            # the page icon
            if 'icon' in link.getAttribute('rel').lower().split():
                favicon = urljoin(page, link.getAttribute('href'))
    return favicon

# the remainder of this example is missing; the lines below are the tail of a
# different, truncated example (a test harness that fills in a ConfigParser
# named parser and then runs the spider):
        parser.set('Planet','template_files','themes/common/atom.xml.xslt')
    else:
        parser.set('Planet','template_files','tests/data/reconstitute.xslt')

    for name, value in zip(sys.argv[2::2],sys.argv[3::2]):
        parser.set(sys.argv[1], name.lstrip('-'), value)

    from planet import config
    config.parser = parser

    from planet import spider
    spider.spiderPlanet(only_if_new=False)

    from planet import feedparser
    for source in glob.glob(os.path.join(work, 'sources/*')):
        feed = feedparser.parse(source).feed
        if feed.has_key('title'):
            config.parser.set('Planet','name',feed.title_detail.value)
        if feed.has_key('link'):
            config.parser.set('Planet','link',feed.link)
        if feed.has_key('author_detail'):
            if feed.author_detail.has_key('name'):
                config.parser.set('Planet','owner_name',feed.author_detail.name)
            if feed.author_detail.has_key('email'):
                config.parser.set('Planet','owner_email',feed.author_detail.email)

    from planet import splice
    doc = splice.splice()

    sources = doc.getElementsByTagName('planet:source')
    if hide_planet_ns and len(sources) == 1:
Example #23
def run(script, doc, output_file=None, options={}):
    """ process an Genshi template """

    context = Context(**options)

    tmpl_fileobj = open(script)
    tmpl = MarkupTemplate(tmpl_fileobj, script)
    tmpl_fileobj.close()

    if not output_file: 
        # filter
        context.push({'input':XMLParser(StringIO(doc))})
    else:
        # template
        import time
        from planet import config,feedparser
        from planet.spider import filename

        # gather a list of subscriptions, feeds
        global subscriptions
        feeds = []
        sources = config.cache_sources_directory()
        for sub in config.subscriptions():
            data=feedparser.parse(filename(sources,sub))
            data.feed.config = norm(dict(config.parser.items(sub)))
            if data.feed.has_key('link'):
                feeds.append((data.feed.config.get('name',''),data.feed))
            subscriptions.append(norm(sub))
        feeds.sort()

        # annotate each entry
        new_date_format = config.new_date_format()
        vars = feedparser.parse(StringIO(doc))
        vars.feeds = [value for name,value in feeds]
        last_feed = None
        last_date = None
        for entry in vars.entries:
             entry.source.config = find_config(config, entry.source)

             # add new_feed and new_date fields
             if 'id' in entry.source:
                 entry.new_feed = entry.source.id
             else:
                 entry.new_feed = None
             entry.new_date = date = None
             if entry.has_key('published_parsed'): date=entry.published_parsed
             if entry.has_key('updated_parsed'): date=entry.updated_parsed
             if date: entry.new_date = time.strftime(new_date_format, date)

             # remove new_feed and new_date fields if not "new"
             if entry.new_date == last_date:
                 entry.new_date = None
                 if entry.new_feed == last_feed:
                     entry.new_feed = None
                 else:
                     last_feed = entry.new_feed
             elif entry.new_date:
                 last_date = entry.new_date
                 last_feed = None

             # add streams for all text constructs
             for key in entry.keys():
                 if key.endswith("_detail") and entry[key].has_key('type') and \
                     entry[key].has_key('value'):
                     streamify(entry[key],entry.source.planet_bozo)
             if entry.has_key('content'):
                 for content in entry.content:
                     streamify(content,entry.source.planet_bozo)
     
        # add cumulative feed information to the Genshi context
        vars.feed.config = dict(config.parser.items('Planet',True))
        context.push(vars)

    # apply template
    output=tmpl.generate(context).render('xml')

    if output_file:
        out_file = open(output_file,'w')
        out_file.write(output)
        out_file.close()
    else:
        return output
Example #24
def template_info(source):
    """ get template information from a feedparser output """

    # wire in support for planet:source, call feedparser, unplug planet:source
    mixin=feedparser._FeedParserMixin
    mixin._start_planet_source = mixin._start_source
    mixin._end_planet_source = \
        new.instancemethod(_end_planet_source, None, mixin)
    data=feedparser.parse(source)
    del mixin._start_planet_source
    del mixin._end_planet_source

    # apply rules to convert feed parser output to htmltmpl input
    output = {'Channels': [], 'Items': []}
    output.update(tmpl_mapper(data.feed, Base))
    sources = [(source.get('planet_name',None),source)
        for source in data.feed.get('sources',[])]
    sources.sort()
    for name, feed in sources:
        output['Channels'].append(tmpl_mapper(feed, Base))
    for entry in data.entries:
        output['Items'].append(tmpl_mapper(entry, Items))

    # synthesize isPermaLink attribute
    for item in output['Items']:
        if item.get('id') == item.get('link'):
            item['guid_isPermaLink']='true'
        else:
            item['guid_isPermaLink']='false'

    # feed level information
    output['generator'] = config.generator_uri()
    output['name'] = config.name()
    output['link'] = config.link()
    output['owner_name'] = config.owner_name()
    output['owner_email'] = config.owner_email()
    if config.feed():
        output['feed'] = config.feed()
        output['feedtype'] = config.feed().find('rss')>=0 and 'rss' or 'atom'

    # date/time information
    date = time.gmtime()
    output['date'] = PlanetDate(date)
    output['date_iso'] = Rfc3399(date)
    output['date_822'] = Rfc822(date)

    # remove new_dates and new_channels that aren't "new"
    date = channel = None
    for item in output['Items']:
        if item.has_key('new_date'):
            if item['new_date'] == date:
                del item['new_date']
            else:
                date = item['new_date']

        if item.has_key('new_channel'):
            if item['new_channel'] == channel and not item.has_key('new_date'):
                del item['new_channel']
            else:
                channel = item['new_channel']

    return output
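template_info() above flattens feedparser output into the flat dictionary layout that htmltmpl templates consume. A hypothetical way to call it and read the result; the input file name is a placeholder:

info = template_info('cache/spliced-feed.xml')       # placeholder path

channels = info['Channels']                          # one mapped dict per planet:source element
items    = info['Items']                             # per-entry dicts, including guid_isPermaLink
site     = (info['name'], info['link'], info['owner_name'])   # feed-level values from config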