def get_requests_from_robots(self, request):
    purl = urlsplit(request.url)
    url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)

    getreq = Request(REQTYPE_LINK, "GET", url)
    try:
        # request, timeout, retries=None, useragent=None, proxy=None
        httpget = HttpGet(getreq, 10, 1, "Googlebot", Shared.options['proxy'])
        lines = httpget.get_file().split("\n")
    except urllib2.HTTPError:
        return []
    except:
        raise

    requests = []
    for line in lines:
        directive = ""
        url = None
        try:
            directive, url = re.sub("\#.*", "", line).split(":", 1)
        except:
            continue  # ignore errors

        if re.match("(dis)?allow", directive.strip(), re.I):
            req = Request(REQTYPE_LINK, "GET", url.strip(), parent=request)
            requests.append(req)

    return adjust_requests(requests) if requests else []
def get_requests_from_robots(self, request):
    purl = urlsplit(request.url)
    url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)

    getreq = Request(REQTYPE_LINK, "GET", url, extra_headers=Shared.options['extra_headers'])
    try:
        # request, timeout, retries=None, useragent=None, proxy=None
        httpget = HttpGet(getreq, 10, 1, "Googlebot", Shared.options['proxy'])
        lines = httpget.get_file().split("\n")
    except urllib.error.HTTPError:
        return []
    except:
        return []  # raise

    requests = []
    for line in lines:
        directive = ""
        url = None
        try:
            directive, url = re.sub("\#.*", "", line).split(":", 1)
        except:
            continue  # ignore errors

        if re.match("(dis)?allow", directive.strip(), re.I):
            req = Request(REQTYPE_LINK, "GET", url.strip(), parent=request)
            requests.append(req)

    return adjust_requests(requests) if requests else []
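# Both get_requests_from_robots variants above share the same line-parsing approach:
# strip "#" comments, split each line on the first ":", and keep the value of every
# Allow/Disallow directive. A minimal standalone sketch of just that parsing step;
# robots_paths is an illustrative helper, not part of the codebase above:
import re

def robots_paths(robots_body):
    paths = []
    for line in robots_body.split("\n"):
        # drop inline comments, then split "Directive: value" on the first colon
        try:
            directive, value = re.sub(r"#.*", "", line).split(":", 1)
        except ValueError:
            continue  # blank lines and lines without a colon are skipped
        if re.match("(dis)?allow", directive.strip(), re.I):
            paths.append(value.strip())
    return paths

print(robots_paths("User-agent: *\nDisallow: /admin  # private\nAllow: /public\n"))
# prints: ['/admin', '/public']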
def crawl(self):
    while True:
        url = None
        cookies = []
        requests = []
        requests_to_crawl = []
        redirects = 0
        errors = []

        try:
            request = self.wait_request()
        except ThreadExitRequestException:
            if os.path.exists(self.cookie_file):
                os.remove(self.cookie_file)
            return
        except Exception as e:
            print "-->" + str(e)
            continue

        url = request.url
        purl = urlsplit(url)

        probe = None
        probe = self.send_probe(request, errors)

        if probe:
            if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO:
                requests = probe.requests
                if probe.html:
                    request.html = probe.html
        else:
            errors.append(ERROR_PROBEFAILURE)
            # get urls with python to continue crawling
            if Shared.options['use_urllib_onerror'] == False:
                continue
            try:
                hr = HttpGet(request, Shared.options['process_timeout'], self.process_retries, Shared.options['useragent'], Shared.options['proxy'])
                requests = hr.get_requests()
            except Exception as e:
                errors.append(str(e))

        # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
        adjust_requests(requests)

        Shared.main_condition.acquire()
        res = CrawlResult(request, requests, errors)
        Shared.crawl_results.append(res)
        Shared.main_condition.notify()
        Shared.main_condition.release()
def crawl(self):
    while True:
        if self.exit:
            return

        requests = []
        errors = []

        try:
            request = self.wait_request()
        except ThreadExitRequestException:
            if os.path.exists(self.cookie_file):
                os.remove(self.cookie_file)
            return
        except Exception as e:
            print("crawl_thread err -->" + str(e))
            continue

        probe = self.send_probe(request, errors)

        if probe:
            requests = probe.requests
            if probe.html:
                request.html = probe.html
            if probe.page_hash:
                request.page_hash = probe.page_hash
            if len(probe.user_output) > 0:
                request.user_output = probe.user_output
            errors.append(probe.errmessage)
        else:
            errors.append(ERROR_PROBEFAILURE)
            # get urls with python to continue crawling
            if not Shared.options['use_urllib_onerror']:
                continue
            try:
                hr = HttpGet(request, Shared.options['process_timeout'], 1, Shared.options['useragent'], Shared.options['proxy'], Shared.options['extra_headers'])
                requests = hr.get_requests()
            except Exception as e:
                errors.append(str(e))

        # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
        requests = adjust_requests(requests)

        Shared.main_condition.acquire()
        res = CrawlResult(request, requests, errors, probe.page_hash if probe else "")
        Shared.crawl_results.append(res)
        Shared.main_condition.notify()
        Shared.main_condition.release()

        self.wait_pause()
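# The crawl loop above hands each CrawlResult back to the main thread through
# Shared.main_condition (acquire, append, notify, release), while the main thread
# sleeps on the same condition until results arrive. A minimal sketch of that
# producer/consumer handoff; the names (results, cond, worker, main_loop) are
# illustrative and not taken from the codebase above:
import threading

results = []
cond = threading.Condition()

def worker(n):
    # crawler-thread side: publish each result under the lock, then wake the consumer
    for i in range(n):
        with cond:
            results.append("result-%d" % i)
            cond.notify()

def main_loop(expected):
    # main-thread side: wait on the condition until at least one result is queued
    collected = []
    while len(collected) < expected:
        with cond:
            while not results:
                cond.wait()
            collected.extend(results)
            del results[:]
    return collected

t = threading.Thread(target=worker, args=(3,))
t.start()
print(main_loop(3))  # ['result-0', 'result-1', 'result-2']
t.join()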
def send(self, req_timeout=5, ignore_errors=False):
    http = HttpGet(self.request, req_timeout, proxy=self._scanner.proxy, useragent=self._scanner.user_agent, extra_headers=self._scanner.extra_headers)

    cookies = []
    if self._scanner.cookies:
        cookies.extend(self._scanner.cookies)
    if self.cookie:
        cookies.extend(self.cookie)
    # cookies = self.cookie + (self._scanner.cookies if self._scanner.cookies else [])

    resp = http.send_request(method=self.method, url=self.url, data=self.body, cookies=cookies, ignore_errors=ignore_errors)
    return MutationResponse(self, resp)
def check_startrequest(self, request):
    h = HttpGet(request, Shared.options['process_timeout'], 2, Shared.options['useragent'], Shared.options['proxy'])
    try:
        h.get_requests()
    except NotHtmlException:
        print "\nError: Document is not html"
        sys.exit(1)
    except Exception as e:
        print "\nError: unable to open url: %s" % e
        sys.exit(1)
def _crawl(self):
    while True:
        requests = []
        errors = []

        try:
            request = self._wait_request()
        except ThreadExitRequestException:
            if os.path.exists(self._cookie_file):
                os.remove(self._cookie_file)
            return
        except Exception as e:
            print("-->" + str(e))
            continue

        probe = self._send_probe(request, errors)

        if probe:
            if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO:
                requests = probe.requests
                if len(probe.user_output) > 0:
                    request.user_output = probe.user_output
                # if the probe returned cookies, keep them as the final cookie set
                if probe.cookies:
                    Shared.end_cookies = probe.cookies
        else:
            errors.append(ERROR_PROBEFAILURE)
            # get urls with python to continue crawling
            if not Shared.options['use_urllib_onerror']:
                continue
            try:
                hr = HttpGet(request, Shared.options['process_timeout'], CrawlerThread._PROCESS_RETRIES, Shared.options['user_agent'], Shared.options['proxy'])
                requests = hr.get_requests()
            except Exception as e:
                errors.append(str(e))

        # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
        adjust_requests(requests)

        Shared.main_condition.acquire()
        res = CrawlResult(request, requests, errors)
        Shared.crawl_results.append(res)
        Shared.main_condition.notify()
        Shared.main_condition.release()
def _check_request(request):
    """
    Check that the given request resolves and returns a proper HTML document.

    :param request: the start request to verify
    :return: None (exits the program on failure)
    """
    h = HttpGet(request, Shared.options['process_timeout'], 2, Shared.options['user_agent'], Shared.options['proxy'])
    try:
        h.get_requests()
    except NotHtmlException:
        print("\nError: Document is not html")
        sys.exit(1)
    except Exception as e:
        print("\nError: unable to open url: %s" % e)
        sys.exit(1)
def send_request(self, req, url=None, method=None, data=None, cookies=None, user_agent=None, proxy=None, extra_headers=None, req_timeout=5, ignore_errors=False):
    # per-call settings fall back to the instance defaults
    if not proxy:
        proxy = self.proxy
    if not user_agent:
        user_agent = self.user_agent
    if not extra_headers:
        extra_headers = self.extra_headers

    # merge per-call cookies with the instance-level ones
    allcookies = []
    if cookies:
        allcookies.extend(cookies)
    if self.cookies:
        allcookies.extend(self.cookies)

    http = HttpGet(req, req_timeout, proxy=proxy, useragent=user_agent, extra_headers=extra_headers)
    return http.send_request(method=method, url=url, data=data, cookies=allcookies, ignore_errors=ignore_errors)
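# send_request above follows a consistent pattern: scalar settings (proxy, user agent,
# extra headers) fall back to the instance defaults when the caller does not override
# them, while cookies are merged rather than replaced. A small sketch of that pattern
# under assumed names (Sender and effective_settings are illustrative, not this
# codebase's API; the proxy dict format mirrors the one used above):
class Sender(object):
    def __init__(self, proxy=None, user_agent="htcap", cookies=None):
        self.proxy = proxy
        self.user_agent = user_agent
        self.cookies = cookies or []

    def effective_settings(self, proxy=None, user_agent=None, cookies=None):
        # per-call value wins, otherwise fall back to the instance default
        proxy = proxy or self.proxy
        user_agent = user_agent or self.user_agent
        # cookies from the call and from the instance are concatenated, not overridden
        allcookies = list(cookies or []) + list(self.cookies)
        return proxy, user_agent, allcookies

s = Sender(proxy={"proto": "http", "host": "127.0.0.1", "port": "8080"},
           cookies=[{"name": "sid", "value": "x"}])
print(s.effective_settings(user_agent="Googlebot", cookies=[{"name": "lang", "value": "en"}]))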
def _get_requests_from_robots(start_request):
    """
    Read the robots.txt file (if any) and build a list of requests from its content.

    :return: list of requests
    """
    purl = urlsplit(start_request.url)
    url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)

    getreq = Request(REQTYPE_LINK, "GET", url)
    try:
        # request, timeout, retries=None, user_agent=None, proxy=None
        httpget = HttpGet(getreq, 10, 1, "Googlebot", Shared.options['proxy'])
        lines = httpget.get_file().split("\n")
    except urllib2.HTTPError:
        return []
    except:
        raise

    requests = []
    for line in lines:
        directive = ""
        url = None
        try:
            directive, url = re.sub("\#.*", "", line).split(":", 1)
        except Exception as e:
            print(str(e))
            continue  # ignore errors

        if re.match("(dis)?allow", directive.strip(), re.I):
            req = Request(REQTYPE_LINK, "GET", url.strip(), parent=start_request)
            if request_is_crawlable(req):
                requests.append(req)

    return adjust_requests(requests) if requests else []
def rawsend(self, url, method=None, data=None, cookies=None, user_agent=None, proxy=None, extra_headers=None, req_timeout=5, ignore_errors=False):
    if not method:
        method = METHOD_GET
    req = Request(REQTYPE_LINK, method, url)
    http = HttpGet(req, req_timeout, proxy=proxy, useragent=user_agent, extra_headers=extra_headers)
    return http.send_request(method=method, url=url, data=data, cookies=cookies, ignore_errors=ignore_errors)
def crawl(self):
    while True:
        url = None
        cookies = []
        requests = []
        requests_to_crawl = []
        redirects = 0
        errors = []

        try:
            request = self.wait_request()
        except ThreadExitRequestException:
            if os.path.exists(self.cookie_file):
                os.remove(self.cookie_file)
            return
        except Exception as e:
            print "-->" + str(e)
            continue

        url = request.url
        purl = urlsplit(url)

        if request_depth(request) > Shared.options['max_depth'] or request_post_depth(request) > Shared.options['max_post_depth']:
            Shared.th_lock_db.acquire()
            Shared.database.save_request_response_data(request.db_id, errors=[ERROR_CRAWLDEPTH])
            Shared.th_lock_db.release()
            continue

        if request.out_of_scope:
            continue

        probe = None
        probe = self.send_probe(request, errors)

        if probe:
            if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO:
                if probe.redirect:
                    # @todo: should a redirect of the first url replace Shared.starturl ???
                    redirects = request.redirects + 1

                reqtypes_to_crawl = [REQTYPE_LINK, REQTYPE_REDIRECT]
                if Shared.options['mode'] == CRAWLMODE_AGGRESSIVE and Shared.options['crawl_forms']:
                    reqtypes_to_crawl.append(REQTYPE_FORM)

                requests_to_crawl.extend(probe.get_requests_for_crawler(reqtypes_to_crawl))

                requests = probe.requests
                if probe.html:
                    request.html = probe.html
        else:
            errors.append(ERROR_PROBEFAILURE)
            # get urls with python to continue crawling
            if Shared.options['use_urllib_onerror'] == False:
                continue
            try:
                hr = HttpGet(request, Shared.options['process_timeout'], self.process_retries, Shared.options['useragent'], Shared.options['proxy'])
                requests = hr.get_requests()
                requests_to_crawl.extend(requests)
            except Exception as e:
                errors.append(str(e))

        # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
        adjust_requests(requests)

        notify = False
        Shared.th_condition.acquire()
        for req in requests_to_crawl:
            if req.redirects > Shared.options['max_redirects']:
                errors.append(ERROR_MAXREDIRECTS)
                # should use BREAK instead... if it's a redirect, len(requests_to_crawl) == 1
                continue
            if req not in Shared.requests:
                Shared.requests.append(req)
                notify = True

        if notify:
            Shared.th_condition.notifyAll()
        Shared.th_condition.release()

        Shared.th_lock_db.acquire()
        Shared.database.save_request_response_data(request.db_id, errors=errors, html=request.html)
        Shared.database.connect()
        for r in requests:
            Shared.database.save_request(r)
        Shared.database.close()
        Shared.th_lock_db.release()
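# The crawl variant above stops descending once request_depth(request) or
# request_post_depth(request) exceeds the configured limits. One plausible way such a
# depth metric can be computed is by walking the request's parent chain back to the
# start request; this is an assumption for illustration, not necessarily how the
# project's own request_depth is implemented (Req and depth are stand-in names):
class Req(object):
    def __init__(self, url, parent=None):
        self.url = url
        self.parent = parent

def depth(req):
    # count how many parent links separate this request from the start request
    d = 0
    while req.parent is not None:
        d += 1
        req = req.parent
    return d

start = Req("http://example.com/")
child = Req("http://example.com/section", parent=start)
grandchild = Req("http://example.com/section/page", parent=child)
print(depth(grandchild))  # 2: two hops from the start request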