class HTTPScraper(Scraper):
    """Scraper base class for scrapers that need an HTTP opener.

    Every instance gets its own ``HTTPOpener`` stored on ``self.opener``.
    """

    def __init__(self, *args, **kargs):
        """Initialise the parent scraper, then attach a new ``HTTPOpener``."""
        super(HTTPScraper, self).__init__(*args, **kargs)
        # TODO: creating the opener belongs in _initialize, but _initialize
        # would first have to become some sort of listener structure, because
        # HTTPScraper is expected to be combined with e.g. DBScraper in a
        # "diamond-shaped" multiple-inheritance hierarchy.
        self.opener = HTTPOpener()

    def getdoc(self, url, encoding=None):
        """Return ``self.opener.getdoc(url, encoding)``.

        If that call raises ``UnicodeEncodeError``, the url is converted
        with ``iri2uri`` and the call is made once more with the result.
        """
        try:
            return self.opener.getdoc(url, encoding)
        except UnicodeEncodeError:
            return self.opener.getdoc(iri2uri(url), encoding)

    def open(self, url, encoding=None):
        """Open ``url`` through the opener wrapped by ``self.opener``.

        ``url`` is either a string or an object exposing ``get_full_url()``
        (presumably a urllib2-style request object -- confirm against
        callers). The target is logged at info level before it is opened.

        Strings: ``unicode`` values are encoded to UTF-8 first. If opening
        raises ``UnicodeEncodeError``, the url is converted with ``iri2uri``
        and opened once more. Non-string values are opened as-is, without
        that fallback.

        ``encoding`` is forwarded as the second positional argument of the
        underlying ``open()`` call.
        NOTE(review): if the underlying opener is a urllib2 OpenerDirector,
        that argument is the request body rather than an encoding -- confirm.
        """
        if not isinstance(url, (str, unicode)):
            # Not a string: treat it as a ready-made request object.
            request = url
            log.info('Retrieving "{url}"'.format(url=request.get_full_url()))
            return self.opener.opener.open(request, encoding)

        if isinstance(url, unicode):
            url = url.encode('utf-8')
        log.info('Retrieving "{url}"'.format(url=url))
        try:
            return self.opener.opener.open(url, encoding)
        except UnicodeEncodeError:
            return self.opener.opener.open(iri2uri(url), encoding)
class HTTPScraper(Scraper):
    """Scraper base class for scrapers that need an HTTP opener.

    Every instance gets its own ``HTTPOpener`` stored on ``self.opener``.
    """

    def __init__(self, *args, **kargs):
        """Initialise the parent scraper, then attach a new ``HTTPOpener``."""
        super(HTTPScraper, self).__init__(*args, **kargs)
        # TODO: creating the opener belongs in _initialize, but _initialize
        # would first have to become some sort of listener structure, because
        # HTTPScraper is expected to be combined with e.g. DBScraper in a
        # "diamond-shaped" multiple-inheritance hierarchy.
        self.opener = HTTPOpener()

    def getdoc(self, url, encoding=None):
        """Legacy/convenience shortcut for ``self.opener.getdoc``.

        Both arguments are passed through unchanged and the opener's
        result is returned as-is.
        """
        http_opener = self.opener
        return http_opener.getdoc(url, encoding)
def __init__(self, *args, **kargs):
    """Initialise the parent class, then attach a new ``HTTPOpener``.

    All positional and keyword arguments are forwarded unchanged to the
    next ``__init__`` in the method resolution order.
    """
    super(HTTPScraper, self).__init__(*args, **kargs)
    # TODO: creating the opener belongs in _initialize, but _initialize
    # would first have to become some sort of listener structure, because
    # HTTPScraper is expected to be combined with e.g. DBScraper in a
    # "diamond-shaped" multiple-inheritance hierarchy.
    http_opener = HTTPOpener()
    self.opener = http_opener