def init_phantom(self):
    self.prefixfiles = os.path.join(
        scrapyd_config().get('logs_dir'),
        HYPHE_PROJECT,
        self.name,
        self.crawler.settings['JOBID']
    )
    self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, log.INFO)
    # Build PhantomJS command-line arguments: optional proxy, per-job cookie
    # file, no SSL strictness, no image loading
    phantom_args = []
    if PROXY and not PROXY.startswith(':'):
        phantom_args.append('--proxy=%s' % PROXY)
    phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles)
    phantom_args.append('--ignore-ssl-errors=true')
    phantom_args.append('--load-images=false')
    # Browser capabilities: custom user agent, no screenshots, and forbid
    # JavaScript from opening or closing windows
    self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
    self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent
    self.capabilities['takesScreenshot'] = False
    self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
    self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
    self.phantom = PhantomJS(
        executable_path=PHANTOM['PATH'],
        service_args=phantom_args,
        desired_capabilities=self.capabilities,
        service_log_path="%s-phantomjs.log" % self.prefixfiles
    )
    # Timeouts: implicit element wait, page load, and async script execution
    self.phantom.implicitly_wait(10)
    self.phantom.set_page_load_timeout(60)
    self.phantom.set_script_timeout(self.ph_timeout + 15)
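# Hedged sketch (assumption, not taken from the original source): a driver
# started in init_phantom() should be shut down when the spider finishes, for
# example in Scrapy's closed() callback, to avoid leaking PhantomJS processes.
def closed(self, reason):
    if getattr(self, "phantom", None):
        self.phantom.quit()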
def handle_error(self, failure, response=None):
    if response:
        # The server answered but with an error status: keep the page,
        # record the error name and return it as a crawl result
        p = self._make_raw_page(response, failure.request.url)
        p['error'] = error_name(failure.value)
        return p
    elif "://www" not in failure.request.url:
        # No response at all: retry once with a www. prefix on the host
        return self._request(failure.request.url.replace('://', '://www.'))
    error = failure.getErrorMessage()
    self.log("ERROR : %s" % error, log.ERROR)
    if PROXY and not PROXY.startswith(':') and "OpenSSL.SSL.Error" in error:
        # SSL failure while going through the proxy: retry without it
        return self._request(failure.request.url, noproxy=True)
    self.errors += 1
    return
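# Hedged sketch (assumption): error_name() is not shown here; it is assumed to
# map the exception carried by the failure to a short identifier, roughly like
# the exception class name. A minimal stand-in could look like this:
def error_name(exception):
    # e.g. twisted.internet.error.DNSLookupError -> "DNSLookupError"
    return exception.__class__.__name__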
def use_proxy(request):
    # Proxy only if one is configured, it is not a bare ":port", and the
    # request has not opted out via meta['noproxy']
    return PROXY != "" and not PROXY.startswith(':') and not request.meta['noproxy']
def process_request(self, request, spider):
    # Route outgoing requests through the configured HTTP proxy
    if PROXY != "" and not PROXY.startswith(':'):
        request.meta['proxy'] = "http://%s/" % PROXY
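# Hedged usage sketch: process_request() above is a Scrapy downloader
# middleware hook. Assuming it lives in a class such as ProxyMiddleware (the
# module path below is hypothetical), it would be enabled in the project's
# settings.py through Scrapy's DOWNLOADER_MIDDLEWARES setting:
DOWNLOADER_MIDDLEWARES = {
    'hcicrawler.middlewares.ProxyMiddleware': 100,
}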