Exemple #1
0
class HTTPScraper(Scraper):
    """Scraper base class that owns an HTTPOpener for fetching pages.

    The opener is created in __init__ and stored as ``self.opener``.
    """

    def __init__(self, *args, **kargs):
        """Forward all arguments to the parent class, then create the opener."""
        super(HTTPScraper, self).__init__(*args, **kargs)
        # TODO: creating the opener belongs in _initialize, but _initialize
        # would first have to become some sort of listener structure, since
        # HTTPScraper is expected to be combined with e.g. DBScraper in a
        # "diamond-shaped" multiple-inheritance hierarchy.
        self.opener = HTTPOpener()

    def getdoc(self, url, encoding=None):
        """Return ``self.opener.getdoc(url, encoding)``.

        If that call raises UnicodeEncodeError, the url is passed through
        iri2uri and the call is made once more with the converted value.
        """
        try:
            document = self.opener.getdoc(url, encoding)
        except UnicodeEncodeError:
            document = self.opener.getdoc(iri2uri(url), encoding)
        return document

    def open(self, url, encoding=None):
        """Log the target, then open it via ``self.opener.opener.open``.

        ``url`` is either a string or an object providing get_full_url().
        Unicode strings are encoded to UTF-8 before use. For strings, a
        UnicodeEncodeError raised by the open call triggers one retry with
        the url converted by iri2uri. Returns whatever the underlying
        ``open`` call returns.
        """
        if not isinstance(url, (str, unicode)):
            # Anything that is not a string is handled as a request object.
            request = url
            log.info('Retrieving "{url}"'.format(url=request.get_full_url()))
            return self.opener.opener.open(request, encoding)

        if isinstance(url, unicode):
            url = url.encode('utf-8')
        log.info('Retrieving "{url}"'.format(url=url))
        try:
            response = self.opener.opener.open(url, encoding)
        except UnicodeEncodeError:
            response = self.opener.opener.open(iri2uri(url), encoding)
        return response
Exemple #2
0
class HTTPScraper(Scraper):
    """Scraper base class that owns an HTTPOpener for fetching pages.

    The opener is created in __init__ and stored as ``self.opener``.
    """

    def __init__(self, *args, **kargs):
        """Forward all arguments to the parent class, then create the opener."""
        super(HTTPScraper, self).__init__(*args, **kargs)
        # TODO: creating the opener belongs in _initialize, but _initialize
        # would first have to become some sort of listener structure, since
        # HTTPScraper is expected to be combined with e.g. DBScraper in a
        # "diamond-shaped" multiple-inheritance hierarchy.
        self.opener = HTTPOpener()

    def getdoc(self, url, encoding=None):
        """Legacy/convenience shortcut for ``self.opener.getdoc``.

        Passes ``url`` and ``encoding`` through unchanged and returns the
        opener's result.
        """
        document = self.opener.getdoc(url, encoding)
        return document
Exemple #3
0
 def __init__(self, *args, **kargs):
     """Forward all arguments to the parent class, then create the opener."""
     super(HTTPScraper, self).__init__(*args, **kargs)
     # TODO: creating the opener belongs in _initialize, but _initialize
     # would first have to become some sort of listener structure, since
     # HTTPScraper is expected to be combined with e.g. DBScraper in a
     # "diamond-shaped" multiple-inheritance hierarchy.
     self.opener = HTTPOpener()