import os
import sys
from random import shuffle

from bs4 import BeautifulSoup

# Browser, HarParser, and APIWriter are assumed to be sibling modules of
# this project; adjust the import paths to match the actual layout.
from browser import Browser
from harparser import HarParser
from apiwriter import APIWriter


# The two methods below are excerpted from the crawler class; the enclosing
# class statement and its __init__ (which is assumed to set url, count,
# har_directory, search_string, remove_params, cookies, and a browser
# attribute defaulting to None) are not shown here.
def start(self):
    if self.count > 1 and self.url is None:
        print("Cannot provide page count with no URL given")
        sys.exit(1)
    if self.remove_params and self.url is None:
        print("WARNING: Must have Internet connection to remove unneeded parameters")

    # Scan for all APIs
    if self.url:
        os.makedirs(self.har_directory, exist_ok=True)
        self.delete_existing_hars()
        self.browser = Browser(
            "chromedriver/chromedriver",
            "browsermob-proxy-2.1.4/bin/browsermob-proxy",
            self.har_directory,
            cookies=self.cookies)
        if self.search_string is not None:
            print("Searching URL " + self.url + " for string " + self.search_string)
        # Move recursively through the site
        api_calls = self.crawling_scan(self.url)
    # Scan directory of har files
    else:
        print("Parsing existing directory of har files")
        har_parser = HarParser(self.har_directory, self.search_string, self.remove_params)
        api_calls = har_parser.parse_multiple_hars()
    if self.browser is not None:
        self.browser.close()
    return api_calls
def crawling_scan(self, url, api_calls=None, all_found_urls=None):
    # Avoid mutable default arguments: fresh lists are created per top-level call
    if api_calls is None:
        api_calls = []
    if all_found_urls is None:
        all_found_urls = []
    self.count = self.count - 1
    if self.count < 0:
        # Page budget exhausted; return what has been collected so far
        return api_calls
    har_parser = HarParser(self.har_directory,
                           search_string=self.search_string,
                           remove_params=self.remove_params)
    # If uncommented, will return as soon as a matching call is found
    # if self.search_string is not None and len(api_calls) > 0:
    #     return api_calls
    try:
        print("Scanning URL: " + url)
        html = self.open_url(url)
        if html is not None:
            soup = BeautifulSoup(html, "lxml")
            har_obj = har_parser.get_single_har_file()
            api_calls = har_parser.scan_har_file(har_obj, api_calls=api_calls)
            all_found_urls, new_urls = self.find_internal_urls(soup, url, all_found_urls)
            shuffle(new_urls)
            for new_url in new_urls:
                self.crawling_scan(new_url, api_calls, all_found_urls)
    except (KeyboardInterrupt, SystemExit):
        print("Stopping crawl")
        self.browser.close()
        api_writer = APIWriter(api_calls)
        api_writer.output_apis()
        sys.exit(1)
    return api_calls
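
# A minimal usage sketch, assuming the methods above belong to a class named
# APIFinder whose constructor accepts the attributes they reference. The
# class name and the keyword arguments below are illustrative assumptions,
# not part of the excerpt above; adjust them to the real constructor.
if __name__ == "__main__":
    finder = APIFinder(
        url="http://example.com",  # site to crawl (assumed parameter name)
        count=5,                   # maximum number of pages to visit (assumed)
        har_directory="hars",      # where HAR files are written (assumed)
        search_string=None,        # optional string to search captured calls for
        remove_params=False,
        cookies=None)
    api_calls = finder.start()
    print("Found %d API calls" % len(api_calls))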