Code example #1
import unittest

# Import path and test-case wrapper assumed so the excerpt runs standalone.
from spiderfoot import SpiderFootHelpers


class TestSpiderFootHelpers(unittest.TestCase):
    def test_parse_robots_txt_should_return_list(self):
        """
        Test parseRobotsTxt(robotsTxtData)
        """
        invalid_types = [None, "", list(), dict()]
        for invalid_type in invalid_types:
            with self.subTest(invalid_type=invalid_type):
                robots_txt = SpiderFootHelpers.parseRobotsTxt(invalid_type)
                self.assertIsInstance(robots_txt, list)

        robots_txt = SpiderFootHelpers.parseRobotsTxt("disallow:")
        self.assertIsInstance(robots_txt, list)
        self.assertFalse(robots_txt)

        robots_txt = SpiderFootHelpers.parseRobotsTxt(
            "disallow: /disallowed/path\n")
        self.assertIsInstance(robots_txt, list)
        self.assertIn("/disallowed/path", robots_txt)
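The test above pins down a small contract for SpiderFootHelpers.parseRobotsTxt(): non-string input yields an empty list, a bare "disallow:" line contributes nothing, and a "disallow:" line with a path puts that path into the returned list. The following is a minimal sketch of that contract, not SpiderFoot's actual implementation (parse_robots_txt_sketch is a hypothetical name):

    def parse_robots_txt_sketch(robots_txt_data):
        # Anything that isn't a string yields an empty list, matching the
        # invalid-type cases asserted above.
        if not isinstance(robots_txt_data, str):
            return []

        disallowed = []
        for line in robots_txt_data.splitlines():
            key, _, value = line.partition(':')
            # A bare "disallow:" with no path contributes nothing; a path
            # after the colon ends up in the returned list.
            if key.strip().lower() == 'disallow' and value.strip():
                disallowed.append(value.strip())
        return disallowed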
Code example #2
    def spiderFrom(self, startingPoint):
        keepSpidering = True
        totalFetched = 0
        levelsTraversed = 0
        nextLinks = dict()
        targetBase = self.sf.urlBaseUrl(startingPoint)

        # Are we respecting robots.txt?
        if self.opts['robotsonly'] and targetBase not in self.robotsRules:
            robotsTxt = self.sf.fetchUrl(targetBase + '/robots.txt',
                                         timeout=self.opts['_fetchtimeout'],
                                         useragent=self.opts['_useragent'],
                                         verify=False)
            if robotsTxt['content'] is not None:
                self.sf.debug(f"robots.txt contents: {robotsTxt['content']}")
                self.robotsRules[targetBase] = SpiderFootHelpers.parseRobotsTxt(
                    robotsTxt['content'])

        if self.checkForStop():
            return

        # On the first iteration we start with links found on the start
        # page; subsequent iterations use links found on those pages,
        # and so on...
        links = self.processUrl(startingPoint)  # fetch first page

        if links is None:
            self.sf.debug("No links found on the first fetch!")
            return

        while keepSpidering:
            # Gets hit in the second and subsequent iterations when more links
            # are found
            if nextLinks:
                links = dict()

                # Fetch content from the new links
                for link in nextLinks:
                    # Always skip links we've already fetched
                    if link in self.fetchedPages:
                        self.sf.debug(f"Already fetched {link}, skipping.")
                        continue

                    # Check if we've been asked to stop
                    if self.checkForStop():
                        return

                    self.sf.debug(f"Fetching fresh content from: {link}")
                    time.sleep(self.opts['pausesec'])
                    freshLinks = self.processUrl(link)
                    if freshLinks is not None:
                        links.update(freshLinks)

                    totalFetched += 1
                    if totalFetched >= self.opts['maxpages']:
                        self.sf.info(
                            f"Maximum number of pages ({self.opts['maxpages']}) reached.")
                        keepSpidering = False
                        break

            nextLinks = self.cleanLinks(links)
            self.sf.debug(f"Found links: {nextLinks}")

            # We've scanned through another layer of the site
            levelsTraversed += 1
            self.sf.debug(
                f"At level: {levelsTraversed}, Pages: {totalFetched}")
            if levelsTraversed >= self.opts['maxlevels']:
                self.sf.info(
                    f"Maximum number of levels ({self.opts['maxlevels']}) reached."
                )
                keepSpidering = False

            # We've reached the end of our journey...
            if not nextLinks:
                self.sf.debug("No more links found to spider, finishing...")
                keepSpidering = False

            # We've been asked to stop scanning
            if self.checkForStop():
                keepSpidering = False

        return
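The excerpt fetches robots.txt once per base URL and caches the parsed disallow list in self.robotsRules, but the enforcement itself is not shown (it presumably happens in cleanLinks()). Below is a minimal sketch of how such a check could work, assuming robotsRules maps a base URL to the list of disallowed path prefixes produced by parseRobotsTxt(); url_is_disallowed is an illustrative helper, not SpiderFoot's API:

    from urllib.parse import urlparse

    def url_is_disallowed(url, robots_rules):
        # robots_rules maps "scheme://host" to the list of disallowed path
        # prefixes, as cached per base URL in self.robotsRules above.
        parsed = urlparse(url)
        base = f"{parsed.scheme}://{parsed.netloc}"
        return any(parsed.path.startswith(prefix)
                   for prefix in robots_rules.get(base, []))

    rules = {"https://example.com": ["/disallowed/path"]}
    url_is_disallowed("https://example.com/disallowed/path/x", rules)  # True
    url_is_disallowed("https://example.com/other", rules)              # False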