Example #1
0
 def init_phantom(self):
     """Create the PhantomJS webdriver used by this crawl and store it on self.

     Sets three attributes:
       - ``self.prefixfiles``: path prefix (logs dir / project / spider name /
         job id) shared by the PhantomJS cookie file and service log.
       - ``self.capabilities``: copy of the default PhantomJS capabilities
         with the user agent set and screenshots / JS window handling off.
       - ``self.phantom``: the PhantomJS driver, with its implicit-wait,
         page-load and script timeouts configured.
     """
     logs_dir = scrapyd_config().get('logs_dir')
     job_id = self.crawler.settings['JOBID']
     self.prefixfiles = os.path.join(logs_dir, HYPHE_PROJECT, self.name, job_id)
     prefix = self.prefixfiles
     self.log("Using path %s for PhantomJS crawl" % prefix, log.INFO)

     # The proxy flag, when present, comes first on the command line.
     # A PROXY value starting with ':' is treated as "no proxy configured".
     if PROXY and not PROXY.startswith(':'):
         service_args = ['--proxy=%s' % PROXY]
     else:
         service_args = []
     service_args += [
         '--cookies-file=%s-phantomjs-cookie.txt' % prefix,
         '--ignore-ssl-errors=true',
         '--load-images=false',
     ]

     # Work on a copy so the shared DesiredCapabilities.PHANTOMJS dict is
     # never mutated.
     caps = self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
     caps.update({
         'phantomjs.page.settings.userAgent': self.user_agent,
         'takesScreenshot': False,
         'phantomjs.page.settings.javascriptCanCloseWindows': False,
         'phantomjs.page.settings.javascriptCanOpenWindows': False,
     })

     driver = PhantomJS(
         executable_path=PHANTOM['PATH'],
         service_args=service_args,
         desired_capabilities=caps,
         service_log_path="%s-phantomjs.log" % prefix
     )
     self.phantom = driver
     driver.implicitly_wait(10)
     driver.set_page_load_timeout(60)
     # Script timeout is derived from the spider's own ph_timeout plus 15.
     driver.set_script_timeout(self.ph_timeout + 15)
Example #2
0
 def init_phantom(self):
     """Launch PhantomJS for this job and keep the driver in ``self.phantom``.

     Also records ``self.prefixfiles`` (the path prefix used for the cookie
     file and the PhantomJS service log) and ``self.capabilities`` (the
     capabilities dict handed to the driver).
     """
     path_parts = (
         scrapyd_config().get('logs_dir'),
         HYPHE_PROJECT,
         self.name,
         self.crawler.settings['JOBID'],
     )
     self.prefixfiles = os.path.join(*path_parts)
     self.log("Using path %s for PhantomJS crawl" % self.prefixfiles,
              log.INFO)

     # Fixed command-line switches; the proxy switch is prepended below
     # only when a usable proxy is configured (PROXY not starting with ':').
     cli_args = [
         '--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles,
         '--ignore-ssl-errors=true',
         '--load-images=false',
     ]
     if PROXY and not PROXY.startswith(':'):
         cli_args.insert(0, '--proxy=%s' % PROXY)

     # Copy the defaults, then override the settings this crawl cares about.
     self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
     page_settings = 'phantomjs.page.settings.'
     self.capabilities[page_settings + 'userAgent'] = self.user_agent
     self.capabilities['takesScreenshot'] = False
     for js_flag in ('javascriptCanCloseWindows', 'javascriptCanOpenWindows'):
         self.capabilities[page_settings + js_flag] = False

     service_log = "%s-phantomjs.log" % self.prefixfiles
     self.phantom = PhantomJS(executable_path=PHANTOM['PATH'],
                              service_args=cli_args,
                              desired_capabilities=self.capabilities,
                              service_log_path=service_log)

     # Driver timeouts: implicit wait, page load, then script execution
     # (the latter based on the spider's ph_timeout plus 15).
     self.phantom.implicitly_wait(10)
     self.phantom.set_page_load_timeout(60)
     self.phantom.set_script_timeout(self.ph_timeout + 15)
Example #3
0
 def handle_error(self, failure, response=None):
     """Handle a failed request, retrying with a fallback when one applies.

     Steps, in order:
       1. If ``response`` is truthy, build a raw page from it, tag it with
          the error name of ``failure.value`` and return that page.
       2. Otherwise, if the failed URL does not contain "://www", return a
          new request for the same URL with "www." inserted right after
          the scheme separator.
       3. Otherwise log the error message. If a proxy is configured and
          the message mentions "OpenSSL.SSL.Error", return a new request
          for the same URL with ``noproxy=True``.
       4. Otherwise increment ``self.errors`` and return None.

     :param failure: failure object exposing ``request.url``, ``value`` and
         ``getErrorMessage()``.
     :param response: optional response received along with the failure.
     :return: a raw page, a retry request, or None.
     """
     url = failure.request.url
     if response:
         page = self._make_raw_page(response, url)
         page['error'] = error_name(failure.value)
         return page
     if "://www" not in url:
         # Limit the replacement to the first "://" (the scheme separator):
         # an unbounded replace would also rewrite any URL embedded in the
         # path or query string (e.g. "?next=http://other.org/").
         return self._request(url.replace('://', '://www.', 1))
     error = failure.getErrorMessage()
     self.log("ERROR : %s" % error, log.ERROR)
     # A PROXY value starting with ':' is treated as "no proxy configured".
     if PROXY and not PROXY.startswith(':') and "OpenSSL.SSL.Error" in error:
         return self._request(url, noproxy=True)
     self.errors += 1
     return None
Example #4
0
 def handle_error(self, failure, response=None):
     """Handle a failed request, retrying with a fallback when one applies.

     Steps, in order:
       1. If ``response`` is truthy, build a raw page from it, tag it with
          the error name of ``failure.value`` and return that page.
       2. Otherwise, if the failed URL does not contain "://www", return a
          new request for the same URL with "www." inserted right after
          the scheme separator.
       3. Otherwise log the error message. If a proxy is configured and
          the message mentions "OpenSSL.SSL.Error", return a new request
          for the same URL with ``noproxy=True``.
       4. Otherwise increment ``self.errors`` and return None.

     :param failure: failure object exposing ``request.url``, ``value`` and
         ``getErrorMessage()``.
     :param response: optional response received along with the failure.
     :return: a raw page, a retry request, or None.
     """
     url = failure.request.url
     if response:
         page = self._make_raw_page(response, url)
         page['error'] = error_name(failure.value)
         return page
     if "://www" not in url:
         # Limit the replacement to the first "://" (the scheme separator):
         # an unbounded replace would also rewrite any URL embedded in the
         # path or query string (e.g. "?next=http://other.org/").
         return self._request(url.replace('://', '://www.', 1))
     error = failure.getErrorMessage()
     self.log("ERROR : %s" % error, logging.ERROR)
     # A PROXY value starting with ':' is treated as "no proxy configured".
     if PROXY and not PROXY.startswith(':') and "OpenSSL.SSL.Error" in error:
         return self._request(url, noproxy=True)
     self.errors += 1
     return None
Example #5
0
def use_proxy(request):
    """Return True when *request* should be sent through the configured proxy.

    The proxy is used only if the module-level ``PROXY`` is set, does not
    start with ':' and the request is not flagged ``noproxy`` in its meta.

    :param request: object exposing a ``meta`` dict.
    :return: bool
    """
    # An empty/unset PROXY, or one starting with ':', means no usable proxy.
    if not PROXY or PROXY.startswith(':'):
        return False
    # Requests that never had the flag set (meta without a 'noproxy' key)
    # are treated as regular proxied requests instead of raising KeyError.
    return not request.meta.get('noproxy', False)
Example #6
0
def use_proxy(request):
    """Decide whether *request* must go through the configured proxy.

    Returns True only when the module-level ``PROXY`` is set, does not
    start with ':' and the request's meta does not carry a truthy
    ``noproxy`` flag.

    :param request: object exposing a ``meta`` dict.
    :return: bool
    """
    proxy_configured = bool(PROXY) and not PROXY.startswith(':')
    if not proxy_configured:
        return False
    # A missing 'noproxy' key means the request never opted out of the
    # proxy; use .get so such requests do not raise KeyError.
    opted_out = request.meta.get('noproxy', False)
    return not opted_out
Example #7
0
 def process_request(self, request, spider):
     """Attach the configured proxy to *request* via ``request.meta['proxy']``.

     Nothing is changed when ``PROXY`` is empty or starts with ':'.
     Always returns None.
     """
     proxy_disabled = PROXY == "" or PROXY.startswith(':')
     if proxy_disabled:
         return
     request.meta['proxy'] = "http://%s/" % PROXY
Example #8
0
 def process_request(self, request, spider):
     """Set ``request.meta['proxy']`` to the configured proxy URL, if any.

     The request is left untouched when ``PROXY`` is the empty string or
     starts with ':'. The method has no return value.
     """
     if PROXY == "" or PROXY.startswith(':'):
         return None
     proxy_url = "http://%s/" % PROXY
     request.meta['proxy'] = proxy_url