def check_startrequest(self, request):
    """Verify that the start request resolves and returns an HTML document; exit on failure."""
    h = HttpGet(request, Shared.options['process_timeout'], 2, Shared.options['useragent'], Shared.options['proxy'])
    try:
        h.get_requests()
    except NotHtmlException:
        print("\nError: Document is not html")
        sys.exit(1)
    except Exception as e:
        print("\nError: unable to open url: %s" % e)
        sys.exit(1)
def _check_request(request):
    """
    Check that the given request resolves and returns a proper HTML file.
    :param request:
    :return:
    """
    h = HttpGet(request, Shared.options['process_timeout'], 2, Shared.options['user_agent'], Shared.options['proxy'])
    try:
        h.get_requests()
    except NotHtmlException:
        print("\nError: Document is not html")
        sys.exit(1)
    except Exception as e:
        print("\nError: unable to open url: %s" % e)
        sys.exit(1)
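# Both start-request checks above follow the same pattern: fetch the start URL
# once and abort early when it is unreachable or not HTML. The sketch below is
# a rough, standalone illustration of that idea using only the standard
# library; it is not HttpGet's implementation, and the helper name
# check_start_url is invented for this example.
import sys
from urllib.request import Request, urlopen


def check_start_url(url, user_agent="Mozilla/5.0", timeout=10):
    """Exit with an error unless url resolves and serves an HTML document."""
    try:
        req = Request(url, headers={"User-Agent": user_agent})
        with urlopen(req, timeout=timeout) as resp:
            content_type = resp.headers.get("Content-Type", "")
    except Exception as e:
        print("\nError: unable to open url: %s" % e)
        sys.exit(1)
    if "text/html" not in content_type:
        print("\nError: Document is not html")
        sys.exit(1)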
def crawl(self):
    """Worker loop: probe each request pulled from the shared queue and hand the result back to the main thread."""
    while True:
        url = None
        cookies = []
        requests = []
        requests_to_crawl = []
        redirects = 0
        errors = []

        try:
            request = self.wait_request()
        except ThreadExitRequestException:
            if os.path.exists(self.cookie_file):
                os.remove(self.cookie_file)
            return
        except Exception as e:
            print("-->" + str(e))
            continue

        url = request.url
        purl = urlsplit(url)

        probe = None
        probe = self.send_probe(request, errors)

        if probe:
            if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO:
                requests = probe.requests
                if probe.html:
                    request.html = probe.html
        else:
            errors.append(ERROR_PROBEFAILURE)
            # get urls with python to continue crawling
            if not Shared.options['use_urllib_onerror']:
                continue
            try:
                hr = HttpGet(request, Shared.options['process_timeout'], self.process_retries, Shared.options['useragent'], Shared.options['proxy'])
                requests = hr.get_requests()
            except Exception as e:
                errors.append(str(e))

        # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
        adjust_requests(requests)

        Shared.main_condition.acquire()
        res = CrawlResult(request, requests, errors)
        Shared.crawl_results.append(res)
        Shared.main_condition.notify()
        Shared.main_condition.release()
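# adjust_requests() is called in every crawl loop but not defined in this file.
# As a hypothetical sketch of its "set out_of_scope" step only: a request could
# be flagged out of scope when its host differs from the start URL's host. The
# attribute names request.url and request.out_of_scope appear in the code
# above/below; the function body itself is an assumption, not the crawler's
# actual filter (which also applies user-supplied rules such as group_qs).
from urllib.parse import urlsplit


def mark_out_of_scope(requests, start_url):
    """Flag requests whose host does not match the start URL's host."""
    start_host = urlsplit(start_url).netloc.lower()
    for req in requests:
        req.out_of_scope = urlsplit(req.url).netloc.lower() != start_host
    return requests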
def crawl(self):
    """Worker loop: probe each queued request, publish a CrawlResult to the main thread, and honour exit/pause signals."""
    while True:
        if self.exit:
            return

        requests = []
        errors = []

        try:
            request = self.wait_request()
        except ThreadExitRequestException:
            if os.path.exists(self.cookie_file):
                os.remove(self.cookie_file)
            return
        except Exception as e:
            print("crawl_thread err -->" + str(e))
            continue

        probe = self.send_probe(request, errors)

        if probe:
            requests = probe.requests
            if probe.html:
                request.html = probe.html
            if probe.page_hash:
                request.page_hash = probe.page_hash
            if len(probe.user_output) > 0:
                request.user_output = probe.user_output
            errors.append(probe.errmessage)
        else:
            errors.append(ERROR_PROBEFAILURE)
            # get urls with python to continue crawling
            if not Shared.options['use_urllib_onerror']:
                continue
            try:
                hr = HttpGet(request, Shared.options['process_timeout'], 1, Shared.options['useragent'], Shared.options['proxy'], Shared.options['extra_headers'])
                requests = hr.get_requests()
            except Exception as e:
                errors.append(str(e))

        # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
        requests = adjust_requests(requests)

        Shared.main_condition.acquire()
        res = CrawlResult(request, requests, errors, probe.page_hash if probe else "")
        Shared.crawl_results.append(res)
        Shared.main_condition.notify()
        Shared.main_condition.release()

        self.wait_pause()
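# Each crawl loop above ends by appending a CrawlResult to Shared.crawl_results
# while holding Shared.main_condition and then calling notify(). For context,
# here is a minimal, self-contained sketch of the consuming side of that
# producer/consumer handoff; the names and structure are illustrative only,
# not the crawler's actual main loop.
import threading

crawl_results = []                     # filled by crawler threads
main_condition = threading.Condition()


def consume_results(expected):
    """Wait for `expected` results published by crawler threads."""
    handled = 0
    while handled < expected:
        with main_condition:
            while not crawl_results:
                main_condition.wait()  # woken by a crawler thread's notify()
            batch = crawl_results[:]
            del crawl_results[:]
        for result in batch:
            handled += 1
            # e.g. persist result.request / result.errors here and schedule
            # any newly discovered requests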
def _crawl(self):
    """Worker loop: probe each queued request, keep track of the final cookie set, and publish results to the main thread."""
    while True:
        requests = []
        errors = []

        try:
            request = self._wait_request()
        except ThreadExitRequestException:
            if os.path.exists(self._cookie_file):
                os.remove(self._cookie_file)
            return
        except Exception as e:
            print("-->" + str(e))
            continue

        probe = self._send_probe(request, errors)

        if probe:
            if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO:
                requests = probe.requests

                if len(probe.user_output) > 0:
                    request.user_output = probe.user_output

                # if the probe returned some cookies, store them as the final cookie set
                if probe.cookies:
                    Shared.end_cookies = probe.cookies
        else:
            errors.append(ERROR_PROBEFAILURE)
            # get urls with python to continue crawling
            if not Shared.options['use_urllib_onerror']:
                continue
            try:
                hr = HttpGet(request, Shared.options['process_timeout'], CrawlerThread._PROCESS_RETRIES, Shared.options['user_agent'], Shared.options['proxy'])
                requests = hr.get_requests()
            except Exception as e:
                errors.append(str(e))

        # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
        adjust_requests(requests)

        Shared.main_condition.acquire()
        res = CrawlResult(request, requests, errors)
        Shared.crawl_results.append(res)
        Shared.main_condition.notify()
        Shared.main_condition.release()
def crawl(self):
    """Worker loop: probe queued requests, enforce depth/redirect limits, queue newly discovered requests, and persist results to the database."""
    while True:
        url = None
        cookies = []
        requests = []
        requests_to_crawl = []
        redirects = 0
        errors = []

        try:
            request = self.wait_request()
        except ThreadExitRequestException:
            if os.path.exists(self.cookie_file):
                os.remove(self.cookie_file)
            return
        except Exception as e:
            print("-->" + str(e))
            continue

        url = request.url
        purl = urlsplit(url)

        if request_depth(request) > Shared.options['max_depth'] or request_post_depth(request) > Shared.options['max_post_depth']:
            Shared.th_lock_db.acquire()
            Shared.database.save_request_response_data(request.db_id, errors=[ERROR_CRAWLDEPTH])
            Shared.th_lock_db.release()
            continue

        if request.out_of_scope:
            continue

        probe = None
        probe = self.send_probe(request, errors)

        if probe:
            if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO:
                if probe.redirect:
                    # @todo: should a redirect of the first url replace Shared.starturl ???
                    redirects = request.redirects + 1

                reqtypes_to_crawl = [REQTYPE_LINK, REQTYPE_REDIRECT]
                if Shared.options['mode'] == CRAWLMODE_AGGRESSIVE and Shared.options['crawl_forms']:
                    reqtypes_to_crawl.append(REQTYPE_FORM)

                requests_to_crawl.extend(probe.get_requests_for_crawler(reqtypes_to_crawl))

                requests = probe.requests
                if probe.html:
                    request.html = probe.html
        else:
            errors.append(ERROR_PROBEFAILURE)
            # get urls with python to continue crawling
            if not Shared.options['use_urllib_onerror']:
                continue
            try:
                hr = HttpGet(request, Shared.options['process_timeout'], self.process_retries, Shared.options['useragent'], Shared.options['proxy'])
                requests = hr.get_requests()
                requests_to_crawl.extend(requests)
            except Exception as e:
                errors.append(str(e))

        # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
        adjust_requests(requests)

        notify = False
        Shared.th_condition.acquire()
        for req in requests_to_crawl:
            if req.redirects > Shared.options['max_redirects']:
                errors.append(ERROR_MAXREDIRECTS)
                # should use BREAK instead... if it's a redirect, len(requests_to_crawl) == 1
                continue
            if req not in Shared.requests:
                Shared.requests.append(req)
                notify = True
        if notify:
            Shared.th_condition.notifyAll()
        Shared.th_condition.release()

        Shared.th_lock_db.acquire()
        Shared.database.save_request_response_data(request.db_id, errors=errors, html=request.html)
        Shared.database.connect()
        for r in requests:
            Shared.database.save_request(r)
        Shared.database.close()
        Shared.th_lock_db.release()
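# The last variant above enforces Shared.options['max_depth'] through
# request_depth(), which is not defined in this file. A plausible, hypothetical
# implementation simply walks a request's parent chain; the `parent` attribute
# is an assumption for illustration, not a confirmed field of the request
# objects used above.
def request_depth(request):
    """Count how many parent requests separate `request` from the start request."""
    depth = 0
    node = getattr(request, "parent", None)
    while node is not None:
        depth += 1
        node = getattr(node, "parent", None)
    return depth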