Code Example #1
    def start(self):
        if self.count > 1 and self.url is None:
            print("Cannot provide page count with no URL given")
            exit(1)
        if self.remove_params and self.url is None:
            print(
                "WARNING: Must have Internet connection to remove unneeded parameters"
            )

        # Scan for all APIs
        if self.url:
            os.makedirs(self.har_directory, exist_ok=True)
            self.delete_existing_hars()
            self.browser = Browser(
                "chromedriver/chromedriver",
                "browsermob-proxy-2.1.4/bin/browsermob-proxy",
                self.har_directory,
                cookies=self.cookies)
            if self.search_string is not None:
                print("Searching URL " + self.url + " for string " +
                      self.search_string)
            # Move recursively through the site
            api_calls = self.crawling_scan(self.url)

        # Scan directory of har files
        else:
            print("Parsing existing directory of har files")
            har_parser = HarParser(self.har_directory, self.search_string,
                                   self.remove_params)
            api_calls = har_parser.parse_multiple_hars()

        if self.browser is not None:
            self.browser.close()

        return api_calls
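
The start method above either drives a live crawl (when a URL is given) or falls back to parsing an existing directory of HAR files. The examples only show individual methods, so the class name, import path, and constructor arguments in the following driver sketch are assumptions made for illustration.

# Hypothetical usage sketch; the class name APIFinder and its constructor
# signature are assumed, since only the methods appear in these examples.
from apiFinder import APIFinder

finder = APIFinder(
    url="https://example.com",   # site to crawl; leave as None to parse existing HAR files
    har_directory="hars",        # where HAR files are written and read
    count=5,                     # how many pages to visit before stopping
    search_string=None,          # optional substring to look for in captured calls
    remove_params=False,
    cookies=None,
)
api_calls = finder.start()       # list of API calls discovered during the crawl
for call in api_calls:
    print(call)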
Code Example #2
	def crawlingScan(self, url, apiCalls = [], allFoundURLs = []):
		
		self.count = self.count - 1
		if self.count < 0:
			return

		harParser = HarParser(self.harDirectory, searchString=self.searchString, removeParams=self.removeParams)

		#If uncommented, will return as soon as a matching call is found
		#if self.searchString is not None and len(apiCalls) > 0:
		#	return apiCalls
		try:
			print("Scanning URL: "+url)
			html = self.openURL(url)
			if html is not None:
				bsObj = BeautifulSoup(html, "lxml")

				harObj = harParser.getSingleHarFile()
				apiCalls = harParser.scanHarfile(harObj, apiCalls=apiCalls)

				allFoundURLs, newUrls = self.findInternalURLs(bsObj, url, allFoundURLs)
				shuffle(newUrls)
				
				for newUrl in newUrls:
					self.crawlingScan(newUrl, apiCalls, allFoundURLs)
		
		except (KeyboardInterrupt, SystemExit):
			print("Stopping crawl")
			self.browser.close()
			apiWriter = APIWriter(apiCalls)
			apiWriter.outputAPIs()
			exit(1)
		return apiCalls
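
Note that this version declares apiCalls=[] and allFoundURLs=[] as default arguments. Python evaluates default values once, at definition time, so the same list objects are shared by every call that omits those arguments; Code Example #5 below sidesteps this by defaulting to None and creating fresh lists inside the function. A standalone illustration of the difference, with hypothetical function names:

# Hypothetical illustration of the mutable-default pitfall (not from apiFinder.py).
def collect_bad(item, found=[]):       # one list shared by every call
    found.append(item)
    return found

def collect_good(item, found=None):    # fresh list per call unless one is passed in
    if found is None:
        found = []
    found.append(item)
    return found

print(collect_bad("a"))    # ['a']
print(collect_bad("b"))    # ['a', 'b']  <- state leaked from the previous call
print(collect_good("a"))   # ['a']
print(collect_good("b"))   # ['b']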
Code Example #3
	def start(self):
		if self.count > 1 and self.url is None:
			print("Cannot provide page count with no URL given")
			exit(1)
		if self.removeParams and self.url is None:
			print("WARNING: Must have Internet connection to remove unneeded parameters")

		#Scan for all APIs
		if self.url:
			os.makedirs(self.harDirectory,exist_ok=True)
			self.deleteExistingHars()
			self.browser = Browser("chromedriver/chromedriver", "browsermob-proxy-2.1.4/bin/browsermob-proxy", self.harDirectory, cookies=self.cookies)
			if self.searchString is not None:
				print("Searching URL "+self.url+" for string "+self.searchString)
			#Move recursively through the site
			apiCalls = self.crawlingScan(self.url)
			
		#Scan directory of har files
		else:
			print("Parsing existing directory of har files")
			harParser = HarParser(self.harDirectory, self.searchString, self.removeParams)
			apiCalls = harParser.parseMultipleHars()

		if self.browser is not None:
			self.browser.close()

		return apiCalls
Code Example #4
File: apiFinder.py  Project: Shaw622/Scraping
    def crawlingScan(self, url, apiCalls=[], allFoundURLs=[]):
        self.count = self.count - 1
        if self.count < 0:
            return

        harParser = HarParser(self.harDirectory,
                              searchString=self.searchString,
                              removeParams=self.removeParams)

        #If uncommented, will return as soon as a matching call is found
        #if self.searchString is not None and len(apiCalls) > 0:
        #	return apiCalls
        try:
            print("Scanning URL: " + url)
            html = self.openURL(url)
            if html is not None:
                bsObj = BeautifulSoup(html, "lxml")

                harObj = harParser.getSingleHarFile()
                apiCalls = harParser.scanHarfile(harObj, apiCalls=apiCalls)

                allFoundURLs, newUrls = self.findInternalURLs(
                    bsObj, url, allFoundURLs)
                shuffle(newUrls)

                for newUrl in newUrls:
                    self.crawlingScan(newUrl, apiCalls, allFoundURLs)

        except (KeyboardInterrupt, SystemExit):
            print("Stopping crawl")
            self.browser.close()
            apiWriter = APIWriter(apiCalls)
            apiWriter.outputAPIs()
            exit(1)
        return apiCalls
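
Code Examples #2 and #4 both catch KeyboardInterrupt and SystemExit so that a crawl stopped with Ctrl-C still closes the browser and writes out whatever API calls were collected. Stripped of the project-specific classes, the pattern is roughly the following sketch, where save_results is a hypothetical stand-in for APIWriter(apiCalls).outputAPIs():

# Generic save-on-interrupt pattern; save_results is a placeholder, not part of apiFinder.py.
import sys

def crawl_all(urls, save_results):
    collected = []
    try:
        for url in urls:
            collected.append("scanned " + url)  # placeholder for the real per-page work
    except (KeyboardInterrupt, SystemExit):
        print("Stopping crawl")
        save_results(collected)  # persist partial results before exiting
        sys.exit(1)
    return collected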
Code Example #5
    def crawling_scan(self, url, api_calls=None, all_found_urls=None):
        if api_calls is None:
            api_calls = []
        if all_found_urls is None:
            all_found_urls = []
        self.count = self.count - 1
        if self.count < 0:
            return

        har_parser = HarParser(self.har_directory,
                               search_string=self.search_string,
                               remove_params=self.remove_params)

        # If uncommented, will return as soon as a matching call is found
        # if self.search_string is not None and len(apiCalls) > 0:
        # 	return apiCalls
        try:
            print("Scanning URL: " + url)
            html = self.open_url(url)
            if html is not None:
                soup = BeautifulSoup(html, "lxml")

                har_obj = har_parser.get_single_har_file()
                api_calls = har_parser.scan_har_file(har_obj,
                                                     api_calls=api_calls)

                all_found_urls, new_urls = self.find_internal_urls(
                    soup, url, all_found_urls)
                shuffle(new_urls)

                for new_url in new_urls:
                    self.crawling_scan(new_url, api_calls, all_found_urls)

        except (KeyboardInterrupt, SystemExit):
            print("Stopping crawl")
            self.browser.close()
            api_writer = APIWriter(api_calls)
            api_writer.output_apis()
            sys.exit(1)
        return api_calls
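
Each crawling variant above depends on a findInternalURLs / find_internal_urls helper that is not shown in these examples. A rough sketch of what such a helper could look like with BeautifulSoup and urllib.parse follows; the signature, same-domain filter, and deduplication rules are assumptions for illustration only.

# Hypothetical sketch of the link-extraction step used by crawling_scan.
# The real helper in apiFinder.py is not shown above, so this is only illustrative.
from urllib.parse import urljoin, urlparse

def find_internal_urls(soup, base_url, all_found_urls):
    base_domain = urlparse(base_url).netloc
    new_urls = []
    for anchor in soup.find_all("a", href=True):
        absolute = urljoin(base_url, anchor["href"])
        # Keep only same-domain links that have not been seen before.
        if urlparse(absolute).netloc == base_domain and absolute not in all_found_urls:
            all_found_urls.append(absolute)
            new_urls.append(absolute)
    return all_found_urls, new_urls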