def show_menu(self):
    """
    Ask for any missing session settings, print the menu and read a choice.

    When the username, the user location or the log path is still empty,
    all three are prompted for again, followed by the entity-extraction
    question. The answer to that question is stored on ``self.entitie`` as
    a real boolean (True only for "y" / "Y").

    :return: the selected option as an int between 1 and 4, or None when
        the input is not a valid option number.
    """
    # Compare by value/truthiness: `x is ""` tests object identity and only
    # works by accident of string interning.
    if not (self.username and self.userloc and self.logpath):
        self.username = raw_input("Set username: ")
        self.userloc = raw_input("Set user location: ")
        self.logpath = raw_input("Set full log path:")
        answer = raw_input(
            "Do you want entity extraction instead of text? y/N")
        # Store a boolean, not the raw answer: a leftover "n" string would
        # be truthy and switch entity extraction on by mistake.
        self.entitie = answer in ("y", "Y")

    print("")
    # TODO: limit these options if no image is set
    print("1. Scrape website")
    print("2. Run scraper on gobuster output.")
    print("3. Enter file with hrefs.")
    print("4. exit\n")

    try:
        optionnumber = raw_input("Choose an option number: \n")
        # user input handling: isdigit() guards the int() conversions.
        if optionnumber.isdigit() and 0 < int(optionnumber) < 5:
            return int(optionnumber)
    except ExceptionHandling.WrongOptionNumber as e:
        Logging.error_log("Menu", e.message)
        # ANSI bright yellow around the message, then reset.
        print("\033[93m" + e.message + "\033[0m")
    # Invalid input (or a handled error) yields no selection.
    return None
def scraper(self, site, user, userloc, logpath, hrefCheck, entitie):
    """
    Scrape one URL, store what was retrieved and log the action.

    A URL containing ".pdf" is downloaded as a file. Any other URL is
    fetched with ``requests`` and the text of all its <p> tags is collected
    and handed to ``self.get_filname``. In both cases the resulting file is
    hashed with ``self.get_hashes``, passed to ``Scp.run`` and an entry is
    written through ``Logging.log``.

    :param site: Url from site. A single trailing newline is removed.
    :param user: User who uses the scraper
    :param userloc: Location of the current user
    :param logpath: Path of the log file.
    :param hrefCheck: Truthy when the hrefs found on the page should be
        passed on to ``hrefParser`` (first time in the scraper).
    :param entitie: Truthy when the entities need to be extracted.
    :return: ``(filename, hex_dig)`` for a PDF. None for an HTML page, and
        None when a ``WrongStatusCode`` error was handled.
    """
    # Strip the newline first so it does not end up inside the log texts.
    if site.endswith("\n"):
        site = site[:-1]

    what = str(site + " scrapen.")
    when = time.strftime("%d/%m/%Y %H:%M:%S")
    why = "Extract text from the site for research."
    result = str(
        site + " gescraped. .txt file has been made with the content "
        "of the original site."
    )

    try:
        print(site)
        scp = Scp()
        logger = Logging()
        href_parser = hrefParser()

        # Both branches need the part after "www." (or after the scheme
        # when there is no "www."), so split once up front.
        if "www." in site:
            domain = site.split("www.")
        else:
            domain = site.split("://")

        if ".pdf" in site:
            # Download pdf and push it to the server
            filename = "sites/" + str(domain[1]).replace("/", "-")
            self.download_file(site, filename)
            hex_dig = self.get_hashes(filename)
            scp.run(filename)
            # Write logging to .csv file.
            logger.log(user, userloc, when, what, why, result, hex_dig,
                       logpath)
            return (filename, hex_dig)

        # Download the page
        page = requests.get(site)
        soup = BeautifulSoup(page.content, 'html.parser')

        # Extract all P tags (query the tree once, then join the texts).
        text = "".join(p.get_text() for p in soup.find_all('p'))

        # Extract all href's
        href = [a['href'] for a in soup.find_all('a', href=True)]

        # Parse text to unicode.
        unitext = unidecode(text)

        # Write text to .txt file
        filename = self.get_filname(domain, unitext)
        hex_dig = self.get_hashes(filename)
        if entitie:
            self.get_entities(filename, domain)
        scp.run(filename)
        logger.log(user, userloc, when, what, why, result, hex_dig, logpath)

        # Check if its the first scan. Uses truthiness so the flag is read
        # the same way as in the `if entitie:` test above.
        if hrefCheck and not entitie:
            href_parser.parser(href, str(domain[1]))
        print("SHA 256 : " + hex_dig + "\n")
        return
    except ExceptionHandling.WrongStatusCode as e:
        Logging.error_log("Menu", e.message)
        # ANSI bright yellow around the message, then reset.
        print("\033[93m" + e.message + "\033[0m")