Beispiel #1
0
 def __init__(self, snapper, config, logger=None):
     """Store the collaborators, create the Qt application and export proxy settings.

     ``snapper`` and ``config`` are kept as given. When ``logger`` is falsy,
     a ``NullHandler`` from the project's ``log`` module is used instead.
     If ``config.http_proxy`` is truthy, the ``http_proxy`` and
     ``https_proxy`` environment variables are set from the ``proxy_*``
     values of the config.
     """
     self.config = config
     self.snapper = snapper
     self.app = QtGui.QApplication([])

     if logger:
         self.logger = logger
     else:
         # Imported lazily so the log module is only needed for the fallback.
         from log import NullHandler
         self.logger = NullHandler()

     self.robotparser = RobotTxtParser()
     self.urls = []

     if not self.config.http_proxy:
         return

     cfg = self.config
     if all((cfg.proxy_user, cfg.proxy_passwd)):
         # Credentials are only embedded when both user and password are set.
         template = '%s:%s@%s:%s'
         values = (
             cfg.proxy_user,
             cfg.proxy_passwd,
             cfg.proxy_host,
             cfg.proxy_port,
         )
     else:
         template = '%s:%s'
         values = (cfg.proxy_host, cfg.proxy_port)
     proxy_url = template % values

     for scheme in ('http', 'https'):
         os.environ[scheme + '_proxy'] = scheme + '://' + proxy_url
Beispiel #2
0
class SnapshotApp(object):
    """Feed a queue of URLs, one at a time, to freshly created snappers.

    URLs are queued via :meth:`setUrls` (or directly through :meth:`start`).
    For each URL a new snapper is created and its ``finished()`` signal is
    connected to :meth:`start`, which picks the next URL or quits the Qt
    application once the queue is empty.
    """

    def __init__(self, snapper, config, logger=None):
        """Store the collaborators, create the Qt application and export proxy settings.

        :param snapper: callable invoked as ``snapper(config, logger=...)``
            for every URL; the result must provide ``connect()`` and
            ``snap()``.
        :param config: settings object; this class reads ``http_proxy``,
            ``proxy_user``, ``proxy_passwd``, ``proxy_host``, ``proxy_port``,
            ``useragent``, ``honor_robots_txt`` and ``urlpart`` from it.
        :param logger: object offering ``debug()`` and ``info()``. When
            falsy, ``log.NullHandler()`` is used instead.
        """
        self.config = config
        self.snapper = snapper
        self.app = QtGui.QApplication([])
        if not logger:
            # Imported lazily so the log module is only needed for the fallback.
            from log import NullHandler
            logger = NullHandler()
        self.logger = logger
        self.robotparser = RobotTxtParser()
        self.urls = []
        if self.config.http_proxy:
            if all((self.config.proxy_user, self.config.proxy_passwd)):
                # Credentials are only embedded when both are set.
                # NOTE(review): user/password are inserted verbatim; values
                # containing ':' or '@' would yield an ambiguous URL.
                proxy_url = '%s:%s@%s:%s' % (
                    self.config.proxy_user,
                    self.config.proxy_passwd,
                    self.config.proxy_host,
                    self.config.proxy_port,
                )
            else:
                proxy_url = '%s:%s' % (
                    self.config.proxy_host,
                    self.config.proxy_port,
                )
            os.environ['http_proxy'] = 'http://' + proxy_url
            os.environ['https_proxy'] = 'https://' + proxy_url

    def exit(self):
        """Stop processing: log the shutdown and ask the Qt application to quit.

        Any URLs still queued in ``self.urls`` are left unprocessed.
        """
        self.logger.info('Exiting now!')
        self.app.quit()

    def checkRobotsTxt(self, url):
        """Return True if ``config.useragent`` may fetch ``url``, else False.

        The robots parser is pointed at ``url`` and re-read on every call;
        the verdict is logged at debug level.
        """
        self.robotparser.set_url(url)
        self.robotparser.read()
        if not self.robotparser.can_fetch(self.config.useragent, url):
            self.logger.debug(
                    'Useragent »%s« is not allowed to fetch %s'
                    %(self.config.useragent, url)
            )
            return False
        self.logger.debug(
                'Useragent »%s« is allowed to fetch %s'
                %(self.config.useragent, url)
        )
        return True

    def setUrls(self, *urls):
        """Append the given URLs to the queue and return the queue.

        URLs without an ``http://`` or ``https://`` prefix get ``http://``
        prepended. Entries rejected by ``HttpUrl`` are logged and skipped.

        :param urls: URL strings to enqueue.
        :return: ``self.urls`` (the live queue, not a copy).
        """
        for url in urls:
            if not url.startswith(('http://', 'https://')):
                url = 'http://' + url
            try:
                self.urls.append(HttpUrl(url))
            except (HttpUrl.HttpUrlError, HttpUrl.UrlError):
                self.logger.info(
                        '%s is not a valid url ... ignoring it.'
                        %(url)
                )
                continue
            else:
                self.logger.debug('Adding %s to urls' %(url))
        return self.urls

    def _snap(self, url):
        """Create a snapper for ``url`` and start it.

        When ``config.honor_robots_txt`` is set and robots.txt denies
        access, the application is shut down via :meth:`exit` and the URL
        is NOT snapped.
        """
        if self.config.honor_robots_txt and not self.checkRobotsTxt(url):
            self.exit()
            # Without this return the disallowed URL would still be fetched.
            return
        if self.config.urlpart:
            self.config.urlpart = url.toplevel
        # Keep a reference on the instance so the snapper outlives this call.
        self._snapper = self.snapper(self.config, logger=self.logger)
        # The snapper's finished() signal pulls the next URL from the queue.
        self._snapper.connect(
                self._snapper,
                QtCore.SIGNAL('finished()'),
                self.start
        )
        self._snapper.snap(url)

    def start(self, *urls):
        """Process the next queued URL, or quit when the queue is empty.

        :param urls: optional URLs to enqueue (via :meth:`setUrls`) before
            the next one is taken. Called without arguments by the
            snapper's ``finished()`` signal.
        """
        if urls:
            self.setUrls(*urls)
        if self.urls:
            url = self.urls.pop(0)
            self._snap(url)
        else:
            self.logger.info('Exiting now!')
            self.app.quit()