Example #1
	def check_startrequest(self, request):

		h = HttpGet(request, Shared.options['process_timeout'], 2, Shared.options['useragent'], Shared.options['proxy'])
		try:
			h.get_requests()
		except NotHtmlException:
			print "\nError: Document is not html"
			sys.exit(1)
		except Exception as e:
			print "\nError: unable to open url: %s" % e
			sys.exit(1)
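
The snippets in this collection construct HttpGet with positional arguments and expect get_requests() to raise NotHtmlException for non-HTML responses. The stub below is only a sketch of the interface they assume; the argument names and the extra_headers parameter are inferred from the calls, not taken from the project's actual definition.

    # Hypothetical stub of the interface assumed by the examples; the real
    # HttpGet class ships with the project and may differ in detail.
    class NotHtmlException(Exception):
        """Raised when the fetched document is not HTML."""
        pass

    class HttpGet:
        def __init__(self, request, timeout, retries, useragent, proxy, extra_headers=None):
            self.request = request          # request object carrying the URL to fetch
            self.timeout = timeout          # Shared.options['process_timeout']
            self.retries = retries          # how many times to retry the fetch
            self.useragent = useragent
            self.proxy = proxy
            self.extra_headers = extra_headers or []

        def get_requests(self):
            # Fetch self.request.url and return the list of requests found in
            # the response; raise NotHtmlException on non-HTML content.
            raise NotImplementedError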
Example #2
def _check_request(request):
    """
    Check if the given request resolves and returns a proper HTML file.
    :param request:
    :return:
    """
    h = HttpGet(request, Shared.options['process_timeout'], 2,
                Shared.options['user_agent'], Shared.options['proxy'])
    try:
        h.get_requests()
    except NotHtmlException:
        print("\nError: Document is not html")
        sys.exit(1)
    except Exception as e:
        print("\nError: unable to open url: %s" % e)
        sys.exit(1)
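
All of these examples read their configuration from a module-level Shared.options dictionary. The sketch below lists only the keys the snippets in this collection actually touch, with placeholder values; the key names vary between versions (e.g. 'useragent' vs 'user_agent'), and the real defaults come from the crawler's own option parsing, not from this sketch.

    # Placeholder values for illustration only.
    class Shared:
        options = {
            'process_timeout': 300,       # per-request timeout passed to HttpGet
            'useragent': "Mozilla/5.0",   # some versions use 'user_agent' instead
            'proxy': None,                # proxy settings, or None to connect directly
            'extra_headers': [],          # additional HTTP headers for HttpGet
            'use_urllib_onerror': True,   # fall back to HttpGet when the probe fails
            'max_depth': 100,             # used by the depth checks in the last example
            'max_post_depth': 10,
            'max_redirects': 10,
        }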
Example #3
    def crawl(self):

        while True:
            url = None
            cookies = []
            requests = []

            requests_to_crawl = []
            redirects = 0
            errors = []

            try:
                request = self.wait_request()
            except ThreadExitRequestException:
                if os.path.exists(self.cookie_file):
                    os.remove(self.cookie_file)
                return
            except Exception as e:
                print "-->" + str(e)
                continue

            url = request.url

            purl = urlsplit(url)

            probe = None

            probe = self.send_probe(request, errors)

            if probe:
                if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO:

                    requests = probe.requests

                    if probe.html:
                        request.html = probe.html

            else:
                errors.append(ERROR_PROBEFAILURE)
                # get urls with python to continue crawling
                if not Shared.options['use_urllib_onerror']:
                    continue
                try:
                    hr = HttpGet(request, Shared.options['process_timeout'],
                                 self.process_retries,
                                 Shared.options['useragent'],
                                 Shared.options['proxy'])
                    requests = hr.get_requests()
                except Exception as e:
                    errors.append(str(e))

            # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
            adjust_requests(requests)

            Shared.main_condition.acquire()
            res = CrawlResult(request, requests, errors)
            Shared.crawl_results.append(res)
            Shared.main_condition.notify()
            Shared.main_condition.release()
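
adjust_requests() is where the crawler flags out-of-scope URLs and applies user-supplied filters such as query-string grouping (group_qs). The snippet below is only a rough sketch of that contract; the in_scope() helper is an assumption, not part of the project.

    # Rough sketch of what adjust_requests() is expected to do; the real
    # implementation lives in the project's helper module.
    def adjust_requests(requests):
        for request in requests:
            # in_scope() is a hypothetical helper that checks the request URL
            # against the configured crawl scope; group_qs-style filters on
            # query strings would also be applied here.
            if not in_scope(request.url):
                request.out_of_scope = True
        return requests

Some versions reassign the return value (requests = adjust_requests(requests)) while others call it only for its side effects; returning the mutated list keeps both call styles working.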
Example #4
    def crawl(self):

        while True:
            if self.exit:
                return

            requests = []
            errors = []

            try:
                request = self.wait_request()
            except ThreadExitRequestException:
                if os.path.exists(self.cookie_file):
                    os.remove(self.cookie_file)
                return
            except Exception as e:
                print("crawl_thread err -->" + e)
                continue

            probe = self.send_probe(request, errors)

            if probe:
                requests = probe.requests
                if probe.html:
                    request.html = probe.html
                if probe.page_hash:
                    request.page_hash = probe.page_hash
                if len(probe.user_output) > 0:
                    request.user_output = probe.user_output
                errors.append(probe.errmessage)

            else:
                errors.append(ERROR_PROBEFAILURE)
                # get urls with python to continue crawling
                if not Shared.options['use_urllib_onerror']:
                    continue
                try:
                    hr = HttpGet(request, Shared.options['process_timeout'], 1,
                                 Shared.options['useragent'],
                                 Shared.options['proxy'],
                                 Shared.options['extra_headers'])
                    requests = hr.get_requests()
                except Exception as e:
                    errors.append(str(e))

            # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
            requests = adjust_requests(requests)

            Shared.main_condition.acquire()
            res = CrawlResult(request, requests, errors,
                              probe.page_hash if probe else "")
            Shared.crawl_results.append(res)
            Shared.main_condition.notify()
            Shared.main_condition.release()

            self.wait_pause()
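
Every crawl loop hands its CrawlResult back to the main thread through Shared.main_condition and Shared.crawl_results. The sketch below shows the consumer side this producer pattern implies; the handle_result() helper is hypothetical.

    # Minimal consumer sketch for the results produced by the crawl() loops.
    def main_loop():
        while True:
            Shared.main_condition.acquire()
            while not Shared.crawl_results:
                Shared.main_condition.wait()      # woken by notify() in crawl()
            results = Shared.crawl_results
            Shared.crawl_results = []
            Shared.main_condition.release()

            for result in results:
                handle_result(result)             # hypothetical: persist/report the CrawlResult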
Example #5
    def _crawl(self):

        while True:
            requests = []
            errors = []

            try:
                request = self._wait_request()
            except ThreadExitRequestException:
                if os.path.exists(self._cookie_file):
                    os.remove(self._cookie_file)
                return
            except Exception as e:
                print("-->" + str(e))
                continue

            probe = self._send_probe(request, errors)

            if probe:
                if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO:

                    requests = probe.requests
                    if len(probe.user_output) > 0:
                        request.user_output = probe.user_output

                    # if the probe returned some cookies, save them as the most recent ones
                    if probe.cookies:
                        Shared.end_cookies = probe.cookies

            else:
                errors.append(ERROR_PROBEFAILURE)
                # get urls with python to continue crawling
                if not Shared.options['use_urllib_onerror']:
                    continue
                try:
                    hr = HttpGet(request, Shared.options['process_timeout'],
                                 CrawlerThread._PROCESS_RETRIES,
                                 Shared.options['user_agent'],
                                 Shared.options['proxy'])
                    requests = hr.get_requests()
                except Exception as e:
                    errors.append(str(e))

            # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
            adjust_requests(requests)

            Shared.main_condition.acquire()
            res = CrawlResult(request, requests, errors)
            Shared.crawl_results.append(res)
            Shared.main_condition.notify()
            Shared.main_condition.release()
Example #6
	def crawl(self):
		
		while True:
			url = None
			cookies = []							
			requests = []

			requests_to_crawl = []
			redirects = 0
			errors = []

			try:				
				request = self.wait_request()								
			except ThreadExitRequestException:
				if os.path.exists(self.cookie_file):
					os.remove(self.cookie_file)
				return
			except Exception as e:
				print "-->"+str(e)
				continue

			url = request.url

			purl = urlsplit(url)


			if request_depth(request) > Shared.options['max_depth'] or request_post_depth(request) > Shared.options['max_post_depth']:				
				Shared.th_lock_db.acquire()				
				Shared.database.save_request_response_data(request.db_id, errors=[ERROR_CRAWLDEPTH])
				Shared.th_lock_db.release()
				continue

			if request.out_of_scope:
				continue				

			probe = None

			probe = self.send_probe(request, errors)			

			if probe:
				if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO:
				
					if probe.redirect:																
						# @todo: should redirect of the first url replace Shared.starturl ???																		
						redirects = request.redirects + 1

					reqtypes_to_crawl = [REQTYPE_LINK, REQTYPE_REDIRECT]
					if Shared.options['mode'] == CRAWLMODE_AGGRESSIVE and Shared.options['crawl_forms']:
						reqtypes_to_crawl.append(REQTYPE_FORM)

					requests_to_crawl.extend(probe.get_requests_for_crawler(reqtypes_to_crawl))
				
					requests = probe.requests

					if probe.html:
						request.html = probe.html													

			else:
				errors.append(ERROR_PROBEFAILURE)
				# get urls with python to continue crawling								
				if not Shared.options['use_urllib_onerror']:
					continue
				try:		
					hr = HttpGet(request, Shared.options['process_timeout'], self.process_retries, Shared.options['useragent'], Shared.options['proxy'])			
					requests = hr.get_requests()
					requests_to_crawl.extend(requests)					
				except Exception as e:					
					errors.append(str(e))
				

			# set out_of_scope, apply user-supplied filters to urls (ie group_qs)
			adjust_requests(requests)

			notify = False
			Shared.th_condition.acquire()
			for req in requests_to_crawl:								
				if req.redirects > Shared.options['max_redirects']:
					errors.append(ERROR_MAXREDIRECTS)
					# should use BREAK instead... if it's a redirect, len(requests_to_crawl) == 1
					continue
			
				if req not in Shared.requests:
					Shared.requests.append(req)
					notify = True

			if notify:
				Shared.th_condition.notifyAll() 
	
			Shared.th_condition.release()


			Shared.th_lock_db.acquire()
			Shared.database.save_request_response_data(request.db_id, errors=errors, html=request.html)
			Shared.database.connect()
			for r in requests:											
				Shared.database.save_request(r)
			Shared.database.close()
			
			Shared.th_lock_db.release()
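
The last example feeds newly discovered requests into Shared.requests under Shared.th_condition and wakes the other crawler threads. The sketch below shows one way the matching wait_request() side could look; the requests_index counter and the exact exit handling are assumptions, not the project's actual implementation.

    # Hypothetical sketch of the consumer side of Shared.th_condition: a worker
    # blocks until a request it has not yet processed appears in Shared.requests.
    def wait_request(self):
        Shared.th_condition.acquire()
        try:
            while Shared.requests_index >= len(Shared.requests):
                if self.exit:
                    raise ThreadExitRequestException("exit requested")
                Shared.th_condition.wait(1)       # re-check periodically
            request = Shared.requests[Shared.requests_index]
            Shared.requests_index += 1
        finally:
            Shared.th_condition.release()
        return request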