Example #1
    def show_menu(self):
        """
        prints out the menu
        :return: the selected option
        """
        if self.username == "" or self.userloc == "" or self.logpath == "":
            self.username = raw_input("Set username: ")
            self.userloc = raw_input("Set user location: ")
            self.logpath = raw_input("Set full log path: ")
            answer = raw_input(
                "Do you want entity extraction instead of text? y/N ")
            # Any answer other than y/Y keeps plain text extraction.
            self.entitie = answer in ("y", "Y")
        print ""
        # Limit these options if no image is set.

        print "1. Scrape website"
        print "2. Run scraper on gobuster output."
        print "3. Enter file with hrefs."
        print "4. exit\n"

        try:
            optionnumber = raw_input("Choose an option number: \n")
            # User input handling: accept only options 1-4.
            if optionnumber.isdigit() and 0 < int(optionnumber) < 5:
                return int(optionnumber)

        except ExceptionHandling.WrongOptionNumber as e:
            Logging.error_log("Menu", e.message)
            print "\033[93m" + e.message + "\033[0m"
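A minimal driver loop for this menu, as a sketch: the `Menu` class name and the loop structure are assumptions for illustration, not part of the original source.

    # Hypothetical usage; Menu is assumed to be the class defining show_menu().
    menu = Menu()
    option = None
    while option != 4:
        option = menu.show_menu()
    print "Goodbye."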
Example #2
    def scraper(self, site, user, userloc, logpath, hrefCheck, entitie):
        """
        This function scrapes the pages.
        :param site: URL of the site to scrape.
        :param user: User who runs the scraper.
        :param userloc: Location of the current user.
        :param logpath: Path of the log file.
        :param hrefCheck: Whether this is the first pass through the scraper.
        :param entitie: Whether entities should be extracted instead of text.
        :return: The filename and the SHA-256 value.
        """
        text = ""
        href = []
        what = "Scrape " + site + "."
        when = time.strftime("%d/%m/%Y %H:%M:%S")
        why = "Extract text from the site for research."
        result = (site + " scraped. A .txt file has been made with the "
                  "content of the original site.")
        site = site.rstrip("\n")
        try:
            print site
            scp = Scp()
            logging = Logging()
            href_parser = hrefParser()
            if site.__contains__(".pdf"):

                # Download pdf and push it to the server
                if site.__contains__("www."):
                    domain = site.split("www.")
                else:
                    domain = site.split("://")
                tld = str(domain[1])
                tld = tld.replace("/", "-")
                filename = "sites/" + tld
                # scraper = Scraper.scraper()
                self.download_file(site, filename)
                hex_dig = self.get_hashes(filename)
                scp.run(filename)
                # Write the log entry to the .csv file.

                logging.log(user, userloc, when, what, why, result, hex_dig,
                            logpath)
                return (filename, hex_dig)

            else:

                # Download the page
                page = requests.get(site)
                soup = BeautifulSoup(page.content, 'html.parser')

                # Extract the text of all <p> tags.
                for paragraph in soup.find_all('p'):
                    text += paragraph.get_text()
                # Extract all hrefs.
                for a in soup.find_all('a', href=True):
                    href.append(a['href'])

                # Transliterate the Unicode text to plain ASCII.
                unitext = unidecode(text)
                if site.__contains__("www."):
                    domain = site.split("www.")
                else:
                    domain = site.split("://")

                # Write text to .txt file
                filename = self.get_filname(domain, unitext)
                hex_dig = self.get_hashes(filename)
                if entitie:
                    self.get_entities(filename, domain)
                scp.run(filename)
                logging.log(user, userloc, when, what, why, result, hex_dig,
                            logpath)

                # Check if it's the first scan.
                if hrefCheck and not entitie:
                    href_parser.parser(href, str(domain[1]))
                print "SHA-256: " + hex_dig + "\n"

                return (filename, hex_dig)

        except ExceptionHandling.WrongStatusCode as e:
            Logging.error_log("Menu", e.message)
            print "\033[93m" + e.message + "\033[0m"
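For context, a call to this method could look like the following sketch; the `Scraper` class name and every argument value here are assumptions for illustration, not taken from the original source.

    # Hypothetical usage; Scraper is assumed to be the class defining scraper().
    s = Scraper()
    s.scraper(
        site="https://example.com/article",
        user="analyst",
        userloc="office",
        logpath="/var/log/scraper.csv",
        hrefCheck=True,    # first pass: also hand collected hrefs to hrefParser
        entitie=False)     # extract plain text rather than entities

The `get_hashes` helper is not shown in this example. A minimal sketch, assuming it computes a SHA-256 digest of the downloaded file with `hashlib`:

    import hashlib

    def get_hashes(self, filename):
        # Assumed implementation: hash the file contents in chunks.
        sha = hashlib.sha256()
        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(8192), ""):
                sha.update(chunk)
        return sha.hexdigest()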