Example #1
0
	def get_requests_from_robots(self, request):
		"""Build crawl requests from the Allow/Disallow entries of robots.txt.

		The robots.txt location is derived from the scheme and host of
		``request.url``; the file is fetched through HttpGet with the
		"Googlebot" user agent and the configured proxy.

		:param request: request whose URL identifies the target site; it is
			set as the parent of every generated request
		:return: the result of ``adjust_requests`` applied to the generated
			requests, or an empty list when the download raises an HTTP
			error or no Allow/Disallow entry is found
		"""
		purl = urlsplit(request.url)
		robots_url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)

		getreq = Request(REQTYPE_LINK, "GET", robots_url)
		try:
			# HttpGet(request, timeout, retries=None, useragent=None, proxy=None)
			httpget = HttpGet(getreq, 10, 1, "Googlebot", Shared.options['proxy'])
			lines = httpget.get_file().split("\n")
		except urllib2.HTTPError:
			# an HTTP-level failure just means there is nothing to harvest;
			# any other exception propagates to the caller unchanged
			return []

		requests = []
		for line in lines:
			# drop trailing comments, then split "directive: value" only once
			# so that colons inside the value are preserved
			try:
				directive, path = re.sub(r"\#.*", "", line).split(":", 1)
			except ValueError:
				continue  # no colon on this line: not a directive

			if re.match("(dis)?allow", directive.strip(), re.I):
				req = Request(REQTYPE_LINK, "GET", path.strip(), parent=request)
				requests.append(req)

		return adjust_requests(requests) if requests else []
Example #2
0
    def get_requests_from_robots(self, request):
        """Build crawl requests from the Allow/Disallow entries of robots.txt.

        The robots.txt location is derived from the scheme and host of
        ``request.url``; the file is fetched through HttpGet with the
        "Googlebot" user agent, the configured proxy and the configured
        extra headers.

        Fetching is best-effort: any ordinary exception raised while
        downloading yields an empty list. Interpreter-level exceptions such
        as KeyboardInterrupt and SystemExit are deliberately not swallowed.

        :param request: request whose URL identifies the target site; it is
            set as the parent of every generated request
        :return: the result of ``adjust_requests`` applied to the generated
            requests, or an empty list when nothing could be harvested
        """
        purl = urlsplit(request.url)
        robots_url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)

        getreq = Request(REQTYPE_LINK,
                         "GET",
                         robots_url,
                         extra_headers=Shared.options['extra_headers'])
        try:
            # HttpGet(request, timeout, retries=None, useragent=None, proxy=None)
            httpget = HttpGet(getreq, 10, 1, "Googlebot",
                              Shared.options['proxy'])
            lines = httpget.get_file().split("\n")
        except Exception:
            # covers urllib.error.HTTPError as well as connection problems
            return []

        requests = []
        for line in lines:
            # drop trailing comments, then split "directive: value" only once
            # so that colons inside the value are preserved
            try:
                directive, path = re.sub(r"\#.*", "", line).split(":", 1)
            except ValueError:
                continue  # no colon on this line: not a directive

            if re.match("(dis)?allow", directive.strip(), re.I):
                req = Request(REQTYPE_LINK, "GET", path.strip(), parent=request)
                requests.append(req)

        return adjust_requests(requests) if requests else []
Example #3
0
    def crawl(self):
        """Worker loop: take a request, probe it and publish the result.

        Each iteration waits for a request, sends a probe for it and appends
        a CrawlResult (request, discovered requests, errors) to
        ``Shared.crawl_results`` while holding ``Shared.main_condition``.

        The loop ends when ``wait_request`` raises
        ThreadExitRequestException; the thread's cookie file is removed
        first if it exists. Any other exception from ``wait_request`` is
        printed and the loop continues.
        """
        while True:
            requests = []
            errors = []

            try:
                request = self.wait_request()
            except ThreadExitRequestException:
                if os.path.exists(self.cookie_file):
                    os.remove(self.cookie_file)
                return
            except Exception as e:
                # parenthesised form prints the same text on Python 2 and 3
                print("-->" + str(e))
                continue

            probe = self.send_probe(request, errors)

            if probe:
                # results are also kept when the probe reported
                # ERROR_PROBE_TO (presumably a timeout -- confirm)
                if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO:
                    requests = probe.requests

                    if probe.html:
                        request.html = probe.html

            else:
                errors.append(ERROR_PROBEFAILURE)
                # get urls with python to continue crawling
                if Shared.options['use_urllib_onerror'] == False:
                    continue
                try:
                    hr = HttpGet(request, Shared.options['process_timeout'],
                                 self.process_retries,
                                 Shared.options['useragent'],
                                 Shared.options['proxy'])
                    requests = hr.get_requests()
                except Exception as e:
                    errors.append(str(e))

            # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
            adjust_requests(requests)

            Shared.main_condition.acquire()
            try:
                res = CrawlResult(request, requests, errors)
                Shared.crawl_results.append(res)
                Shared.main_condition.notify()
            finally:
                # always release, otherwise an exception above would leave
                # every other user of the condition blocked forever
                Shared.main_condition.release()
Example #4
0
	def crawl(self):
		"""Worker loop: take a request, probe it and publish the result.

		Each iteration waits for a request, sends a probe for it and appends
		a CrawlResult (request, discovered requests, errors) to
		``Shared.crawl_results`` while holding ``Shared.main_condition``.

		The loop ends when ``wait_request`` raises
		ThreadExitRequestException; the thread's cookie file is removed
		first if it exists. Any other exception from ``wait_request`` is
		printed and the loop continues.
		"""
		while True:
			requests = []
			errors = []

			try:
				request = self.wait_request()
			except ThreadExitRequestException:
				if os.path.exists(self.cookie_file):
					os.remove(self.cookie_file)
				return
			except Exception as e:
				# parenthesised form prints the same text on Python 2 and 3
				print("-->" + str(e))
				continue

			probe = self.send_probe(request, errors)

			if probe:
				# results are also kept when the probe reported
				# ERROR_PROBE_TO (presumably a timeout -- confirm)
				if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO:
					requests = probe.requests

					if probe.html:
						request.html = probe.html

			else:
				errors.append(ERROR_PROBEFAILURE)
				# get urls with python to continue crawling
				if Shared.options['use_urllib_onerror'] == False:
					continue
				try:
					hr = HttpGet(request, Shared.options['process_timeout'], self.process_retries, Shared.options['useragent'], Shared.options['proxy'])
					requests = hr.get_requests()
				except Exception as e:
					errors.append(str(e))

			# set out_of_scope, apply user-supplied filters to urls (ie group_qs)
			adjust_requests(requests)

			Shared.main_condition.acquire()
			try:
				res = CrawlResult(request, requests, errors)
				Shared.crawl_results.append(res)
				Shared.main_condition.notify()
			finally:
				# always release, otherwise an exception above would leave
				# every other user of the condition blocked forever
				Shared.main_condition.release()
Example #5
0
    def crawl(self):
        """Worker loop: take a request, probe it and publish the result.

        Each iteration waits for a request, sends a probe for it, copies the
        probe's html, page hash and user output onto the request, and
        appends a CrawlResult to ``Shared.crawl_results`` while holding
        ``Shared.main_condition``. After publishing, ``wait_pause`` is
        called before the next iteration.

        The loop ends when ``self.exit`` is set or when ``wait_request``
        raises ThreadExitRequestException (the cookie file is removed first
        if it exists). Any other exception from ``wait_request`` is printed
        and the loop continues.
        """
        while True:
            if self.exit:
                return

            requests = []
            errors = []

            try:
                request = self.wait_request()
            except ThreadExitRequestException:
                if os.path.exists(self.cookie_file):
                    os.remove(self.cookie_file)
                return
            except Exception as e:
                # str() is required: concatenating the exception object
                # itself raises TypeError and would kill the thread
                print("crawl_thread err -->" + str(e))
                continue

            probe = self.send_probe(request, errors)

            if probe:
                requests = probe.requests
                if probe.html:
                    request.html = probe.html
                if probe.page_hash:
                    request.page_hash = probe.page_hash
                if probe.user_output:
                    request.user_output = probe.user_output
                errors.append(probe.errmessage)

            else:
                errors.append(ERROR_PROBEFAILURE)
                # get urls with python to continue crawling
                if not Shared.options['use_urllib_onerror']:
                    continue
                try:
                    hr = HttpGet(request, Shared.options['process_timeout'], 1,
                                 Shared.options['useragent'],
                                 Shared.options['proxy'],
                                 Shared.options['extra_headers'])
                    requests = hr.get_requests()
                except Exception as e:
                    errors.append(str(e))

            # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
            requests = adjust_requests(requests)

            Shared.main_condition.acquire()
            try:
                res = CrawlResult(request, requests, errors,
                                  probe.page_hash if probe else "")
                Shared.crawl_results.append(res)
                Shared.main_condition.notify()
            finally:
                # always release, otherwise an exception above would leave
                # every other user of the condition blocked forever
                Shared.main_condition.release()

            self.wait_pause()
Example #6
0
	def send(self, req_timeout=5, ignore_errors=False):
		"""Send this object's request and wrap the reply in a MutationResponse.

		The cookie list passed along starts with the scanner's cookies (if
		any), followed by this object's own cookies (if any).

		:param req_timeout: request timeout passed to HttpGet
		:param ignore_errors: forwarded unchanged to ``HttpGet.send_request``
		:return: ``MutationResponse(self, <response>)``
		"""
		scanner = self._scanner
		client = HttpGet(
			self.request,
			req_timeout,
			proxy=scanner.proxy,
			useragent=scanner.user_agent,
			extra_headers=scanner.extra_headers,
		)

		# scanner-wide cookies first, then the ones specific to this object
		merged_cookies = []
		for cookie_source in (scanner.cookies, self.cookie):
			if cookie_source:
				merged_cookies.extend(cookie_source)

		response = client.send_request(
			method=self.method,
			url=self.url,
			data=self.body,
			cookies=merged_cookies,
			ignore_errors=ignore_errors,
		)
		return MutationResponse(self, response)
Example #7
0
	def check_startrequest(self, request):
		"""Verify that the start request can be fetched and parsed.

		Calls ``HttpGet.get_requests`` for the request (2 retries, configured
		timeout, user agent and proxy). On failure an error message is
		printed and the process exits with status 1.

		:param request: the request to check
		"""
		options = Shared.options
		fetcher = HttpGet(request, options['process_timeout'], 2, options['useragent'], options['proxy'])

		try:
			fetcher.get_requests()
		except NotHtmlException:
			failure = "\nError: Document is not html"
		except Exception as e:
			failure = "\nError: unable to open url: %s" % e
		else:
			# the request could be fetched: nothing to report
			return

		print(failure)
		sys.exit(1)
Example #8
0
	def check_startrequest(self, request):
		"""Abort the program when the start request cannot be crawled.

		The request is fetched via ``HttpGet.get_requests`` using the
		configured timeout, user agent and proxy, with 2 retries. If that
		raises, a message is printed and ``sys.exit(1)`` is called.

		:param request: the request to check
		"""
		timeout = Shared.options['process_timeout']
		agent = Shared.options['useragent']
		proxy = Shared.options['proxy']
		http = HttpGet(request, timeout, 2, agent, proxy)

		error_text = None
		try:
			http.get_requests()
		except NotHtmlException:
			error_text = "\nError: Document is not html"
		except Exception as e:
			error_text = "\nError: unable to open url: %s" % e

		if error_text is not None:
			print(error_text)
			sys.exit(1)
Example #9
0
    def _crawl(self):
        """Worker loop: take a request, probe it and publish the result.

        Each iteration waits for a request, sends a probe for it and appends
        a CrawlResult (request, discovered requests, errors) to
        ``Shared.crawl_results`` while holding ``Shared.main_condition``.
        Cookies returned by a successful probe are stored in
        ``Shared.end_cookies``.

        The loop ends when ``_wait_request`` raises
        ThreadExitRequestException; the thread's cookie file is removed
        first if it exists. Any other exception from ``_wait_request`` is
        printed and the loop continues.
        """
        while True:
            requests = []
            errors = []

            try:
                request = self._wait_request()
            except ThreadExitRequestException:
                if os.path.exists(self._cookie_file):
                    os.remove(self._cookie_file)
                return
            except Exception as e:
                print("-->" + str(e))
                continue

            probe = self._send_probe(request, errors)

            if probe:
                # results are also kept when the probe reported
                # ERROR_PROBE_TO (presumably a timeout -- confirm)
                if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO:

                    requests = probe.requests
                    if probe.user_output:
                        request.user_output = probe.user_output

                    # if the probe returned some cookies, keep them as the
                    # most recent ones
                    if probe.cookies:
                        Shared.end_cookies = probe.cookies

            else:
                errors.append(ERROR_PROBEFAILURE)
                # get urls with python to continue crawling
                if not Shared.options['use_urllib_onerror']:
                    continue
                try:
                    hr = HttpGet(request, Shared.options['process_timeout'],
                                 CrawlerThread._PROCESS_RETRIES,
                                 Shared.options['user_agent'],
                                 Shared.options['proxy'])
                    requests = hr.get_requests()
                except Exception as e:
                    errors.append(str(e))

            # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
            adjust_requests(requests)

            Shared.main_condition.acquire()
            try:
                res = CrawlResult(request, requests, errors)
                Shared.crawl_results.append(res)
                Shared.main_condition.notify()
            finally:
                # always release, otherwise an exception above would leave
                # every other user of the condition blocked forever
                Shared.main_condition.release()
Example #10
0
 def _check_request(request):
     """
     Make sure the given request can be fetched and yields an html document.

     On failure an error message is printed and the process exits with
     status 1; on success the function simply returns.

     :param request: the request to check
     :return: None
     """
     options = Shared.options
     fetcher = HttpGet(request, options['process_timeout'], 2,
                       options['user_agent'], options['proxy'])

     try:
         fetcher.get_requests()
     except NotHtmlException:
         failure = "\nError: Document is not html"
     except Exception as e:
         failure = "\nError: unable to open url: %s" % e
     else:
         # the request could be fetched: nothing to report
         return

     print(failure)
     sys.exit(1)
Example #11
0
	def send_request(self, req, url=None, method=None, data=None, cookies=None, user_agent=None, proxy=None, extra_headers=None, req_timeout=5, ignore_errors=False):
		"""Send ``req`` through HttpGet, falling back to this object's defaults.

		:param req: request object handed to HttpGet
		:param url: forwarded to ``HttpGet.send_request``
		:param method: forwarded to ``HttpGet.send_request``
		:param data: forwarded to ``HttpGet.send_request``
		:param cookies: extra cookies; they are placed before ``self.cookies``
			in the list that is sent
		:param user_agent: user agent; ``self.user_agent`` is used when falsy
		:param proxy: proxy; ``self.proxy`` is used when falsy
		:param extra_headers: extra headers; ``self.extra_headers`` is used
			when falsy
		:param req_timeout: request timeout passed to HttpGet
		:param ignore_errors: forwarded to ``HttpGet.send_request``
		:return: whatever ``HttpGet.send_request`` returns
		"""
		proxy = proxy or self.proxy
		user_agent = user_agent or self.user_agent
		extra_headers = extra_headers or self.extra_headers

		allcookies = []
		if cookies:
			allcookies.extend(cookies)
		if self.cookies:
			allcookies.extend(self.cookies)

		# built exactly once; an earlier duplicate instance was discarded unused
		http = HttpGet(req, req_timeout, proxy=proxy, useragent=user_agent, extra_headers=extra_headers)
		return http.send_request(method=method, url=url, data=data, cookies=allcookies, ignore_errors=ignore_errors)
Example #12
0
    def _get_requests_from_robots(start_request):
        """
        Fetch the site's robots.txt and turn its Allow/Disallow entries into requests.

        :param start_request: request whose URL supplies the scheme and host
            of robots.txt; it becomes the parent of every request created here
        :return: ``adjust_requests`` applied to the crawlable requests found,
            or an empty list when the download raises an HTTP error or no
            entry qualifies
        """
        parts = urlsplit(start_request.url)
        robots_location = "%s://%s/robots.txt" % (parts.scheme, parts.netloc)
        robots_request = Request(REQTYPE_LINK, "GET", robots_location)

        try:
            # HttpGet(request, timeout, retries=None, user_agent=None, proxy=None)
            fetcher = HttpGet(robots_request, 10, 1, "Googlebot",
                              Shared.options['proxy'])
            content = fetcher.get_file()
            robot_lines = content.split("\n")
        except urllib2.HTTPError:
            # other exceptions are left to propagate to the caller
            return []

        found = []
        for raw_line in robot_lines:
            # strip the comment part, then split "directive: value" once
            try:
                directive, target = re.sub("\#.*", "", raw_line).split(":", 1)
            except Exception as e:
                # lines without a colon end up here; the message is reported
                print(str(e))
                continue

            if not re.match("(dis)?allow", directive.strip(), re.I):
                continue

            candidate = Request(REQTYPE_LINK,
                                "GET",
                                target.strip(),
                                parent=start_request)
            if request_is_crawlable(candidate):
                found.append(candidate)

        if not found:
            return []
        return adjust_requests(found)
Example #13
0
 def rawsend(self,
             url,
             method=None,
             data=None,
             cookies=None,
             user_agent=None,
             proxy=None,
             extra_headers=None,
             req_timeout=5,
             ignore_errors=False):
     """
     Build a link-type request for ``url`` and send it through HttpGet.

     :param url: target url
     :param method: HTTP method; METHOD_GET is used when falsy
     :param data: forwarded to ``HttpGet.send_request``
     :param cookies: forwarded to ``HttpGet.send_request``
     :param user_agent: passed to HttpGet as ``useragent``
     :param proxy: passed to HttpGet
     :param extra_headers: passed to HttpGet
     :param req_timeout: request timeout passed to HttpGet
     :param ignore_errors: forwarded to ``HttpGet.send_request``
     :return: whatever ``HttpGet.send_request`` returns
     """
     verb = method or METHOD_GET
     link_request = Request(REQTYPE_LINK, verb, url)
     client = HttpGet(link_request, req_timeout, proxy=proxy,
                      useragent=user_agent, extra_headers=extra_headers)
     send_args = dict(method=verb,
                      url=url,
                      data=data,
                      cookies=cookies,
                      ignore_errors=ignore_errors)
     return client.send_request(**send_args)
	def crawl(self):
		"""Worker loop: probe queued requests, enqueue new ones, persist results.

		For every request returned by ``wait_request`` the loop:

		* records ERROR_CRAWLDEPTH in the database and skips the request when
		  its depth or post depth exceeds the configured maximum;
		* skips the request silently when it is out of scope;
		* sends a probe and collects the requests it discovered (falling back
		  to HttpGet when the probe fails and ``use_urllib_onerror`` allows);
		* adds not-yet-known requests to ``Shared.requests`` under
		  ``Shared.th_condition`` and wakes waiting threads;
		* saves the request's errors/html and every discovered request to the
		  database under ``Shared.th_lock_db``.

		The loop ends when ``wait_request`` raises
		ThreadExitRequestException; the thread's cookie file is removed
		first if it exists. Any other exception from ``wait_request`` is
		printed and the loop continues.
		"""
		while True:
			requests = []
			requests_to_crawl = []
			errors = []

			try:
				request = self.wait_request()
			except ThreadExitRequestException:
				if os.path.exists(self.cookie_file):
					os.remove(self.cookie_file)
				return
			except Exception as e:
				# parenthesised form prints the same text on Python 2 and 3
				print("-->" + str(e))
				continue

			if request_depth(request) > Shared.options['max_depth'] or request_post_depth(request) > Shared.options['max_post_depth']:
				Shared.th_lock_db.acquire()
				try:
					Shared.database.save_request_response_data(request.db_id, errors=[ERROR_CRAWLDEPTH])
				finally:
					Shared.th_lock_db.release()
				continue

			if request.out_of_scope:
				continue

			probe = self.send_probe(request, errors)

			if probe:
				# results are also kept when the probe reported
				# ERROR_PROBE_TO (presumably a timeout -- confirm)
				if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO:
					# @todo: should redirect of the first url replace Shared.starturl ???

					reqtypes_to_crawl = [REQTYPE_LINK, REQTYPE_REDIRECT]
					if Shared.options['mode'] == CRAWLMODE_AGGRESSIVE and Shared.options['crawl_forms']:
						reqtypes_to_crawl.append(REQTYPE_FORM)

					requests_to_crawl.extend(probe.get_requests_for_crawler(reqtypes_to_crawl))

					requests = probe.requests

					if probe.html:
						request.html = probe.html

			else:
				errors.append(ERROR_PROBEFAILURE)
				# get urls with python to continue crawling
				if Shared.options['use_urllib_onerror'] == False:
					continue
				try:
					hr = HttpGet(request, Shared.options['process_timeout'], self.process_retries, Shared.options['useragent'], Shared.options['proxy'])
					requests = hr.get_requests()
					requests_to_crawl.extend(requests)
				except Exception as e:
					errors.append(str(e))

			# set out_of_scope, apply user-supplied filters to urls (ie group_qs)
			adjust_requests(requests)

			notify = False
			Shared.th_condition.acquire()
			try:
				for req in requests_to_crawl:
					if req.redirects > Shared.options['max_redirects']:
						errors.append(ERROR_MAXREDIRECTS)
						# should use BREAK instead... if it's a redirect len(requests_to_crawl) = 1
						continue

					if req not in Shared.requests:
						Shared.requests.append(req)
						notify = True

				if notify:
					Shared.th_condition.notifyAll()
			finally:
				# always release, otherwise an exception above would leave
				# every thread waiting on the condition blocked forever
				Shared.th_condition.release()

			Shared.th_lock_db.acquire()
			try:
				Shared.database.save_request_response_data(request.db_id, errors=errors, html=request.html)
				Shared.database.connect()
				try:
					for r in requests:
						Shared.database.save_request(r)
				finally:
					# close the connection even if one of the saves fails
					Shared.database.close()
			finally:
				Shared.th_lock_db.release()