def test_extractUrlsFromRobotsTxt_should_return_list(self):
    invalid_types = [None, "", list(), dict()]
    for invalid_type in invalid_types:
        with self.subTest(invalid_type=invalid_type):
            robots_txt = SpiderFootHelpers.extractUrlsFromRobotsTxt(invalid_type)
            self.assertIsInstance(robots_txt, list)

    robots_txt = SpiderFootHelpers.extractUrlsFromRobotsTxt("disallow:")
    self.assertIsInstance(robots_txt, list)
    self.assertFalse(robots_txt)

    robots_txt = SpiderFootHelpers.extractUrlsFromRobotsTxt("disallow: /disallowed/path\n")
    self.assertIsInstance(robots_txt, list)
    self.assertIn("/disallowed/path", robots_txt)
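For illustration, a minimal usage sketch of the helper under test, assuming (as the assertions above suggest) that each "disallow:" rule contributes one entry to the returned list. The sample robots.txt content and the `from spiderfoot import SpiderFootHelpers` import path are assumptions for this sketch, not taken from the test above.

from spiderfoot import SpiderFootHelpers

# Hypothetical robots.txt content with two disallow rules.
sample_robots_txt = (
    "User-agent: *\n"
    "disallow: /private/\n"
    "disallow: /tmp/\n"
)

# Under the assumption above, one path is returned per disallow rule.
paths = SpiderFootHelpers.extractUrlsFromRobotsTxt(sample_robots_txt)
print(paths)  # expected: ['/private/', '/tmp/']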
def spiderFrom(self, startingPoint):
    keepSpidering = True
    totalFetched = 0
    levelsTraversed = 0
    nextLinks = dict()
    targetBase = SpiderFootHelpers.urlBaseUrl(startingPoint)

    # Are we respecting robots.txt?
    if self.opts['robotsonly'] and targetBase not in self.robotsRules:
        robotsTxt = self.sf.fetchUrl(
            targetBase + '/robots.txt',
            timeout=self.opts['_fetchtimeout'],
            useragent=self.opts['_useragent'],
            verify=False
        )
        if robotsTxt['content'] is not None:
            self.debug('robots.txt contents: ' + robotsTxt['content'])
            self.robotsRules[targetBase] = SpiderFootHelpers.extractUrlsFromRobotsTxt(robotsTxt['content'])

    if self.checkForStop():
        return

    # First iteration we are starting with links found on the start page.
    # Iterations after that are based on links found on those pages,
    # and so on..
    links = self.processUrl(startingPoint)  # fetch first page

    if links is None:
        self.debug("No links found on the first fetch!")
        return

    while keepSpidering:
        # Gets hit in the second and subsequent iterations when more links
        # are found
        if len(nextLinks) > 0:
            links = dict()

            # Fetch content from the new links
            for link in nextLinks:
                # Always skip links we've already fetched
                if link in self.fetchedPages:
                    self.debug("Already fetched " + link + ", skipping.")
                    continue

                # Check if we've been asked to stop
                if self.checkForStop():
                    return

                self.debug("Fetching fresh content from: " + link)
                time.sleep(self.opts['pausesec'])

                freshLinks = self.processUrl(link)
                if freshLinks is not None:
                    links.update(freshLinks)

                totalFetched += 1
                if totalFetched >= self.opts['maxpages']:
                    self.info("Maximum number of pages (" + str(self.opts['maxpages']) + ") reached.")
                    keepSpidering = False
                    break

        nextLinks = self.cleanLinks(links)
        self.debug(f"Found links: {nextLinks}")

        # We've scanned through another layer of the site
        levelsTraversed += 1
        self.debug(f"At level: {levelsTraversed}, Pages: {totalFetched}")
        if levelsTraversed >= self.opts['maxlevels']:
            self.info(f"Maximum number of levels ({self.opts['maxlevels']}) reached.")
            keepSpidering = False

        # We've reached the end of our journey..
        if len(nextLinks) == 0:
            self.debug("No more links found to spider, finishing..")
            keepSpidering = False

        # We've been asked to stop scanning
        if self.checkForStop():
            keepSpidering = False

    return
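The rules gathered into self.robotsRules above are consulted when candidate links are filtered elsewhere in the module. As a rough, hypothetical sketch of that idea only (not the module's actual filtering code, which happens when links are cleaned), a URL could be checked against the collected disallow prefixes like this:

def _isDisallowedByRobots(self, url: str) -> bool:
    """Hypothetical helper, illustrative only: return True if url falls
    under a disallow prefix collected for its base URL."""
    base = SpiderFootHelpers.urlBaseUrl(url)
    path = url[len(base):] or "/"
    for rule in self.robotsRules.get(base, []):
        # Each rule is a path prefix extracted from a "disallow:" line.
        if rule and path.lower().startswith(rule.lower()):
            return True
    return False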