def test_extractUrlsFromRobotsTxt_should_return_list(self):
    invalid_types = [None, "", list(), dict()]
    for invalid_type in invalid_types:
        with self.subTest(invalid_type=invalid_type):
            robots_txt = SpiderFootHelpers.extractUrlsFromRobotsTxt(invalid_type)
            self.assertIsInstance(robots_txt, list)

    robots_txt = SpiderFootHelpers.extractUrlsFromRobotsTxt("disallow:")
    self.assertIsInstance(robots_txt, list)
    self.assertFalse(robots_txt)

    robots_txt = SpiderFootHelpers.extractUrlsFromRobotsTxt("disallow: /disallowed/path\n")
    self.assertIsInstance(robots_txt, list)
    self.assertIn("/disallowed/path", robots_txt)
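For illustration, a minimal usage sketch of the helper under test, assuming (as the assertions above suggest) that each "disallow:" rule contributes one entry to the returned list. The sample robots.txt content and the `from spiderfoot import SpiderFootHelpers` import path are assumptions for this sketch, not taken from the test above.

from spiderfoot import SpiderFootHelpers

# Hypothetical robots.txt content with two disallow rules.
sample_robots_txt = (
    "User-agent: *\n"
    "disallow: /private/\n"
    "disallow: /tmp/\n"
)

# Under the assumption above, one path is returned per disallow rule.
paths = SpiderFootHelpers.extractUrlsFromRobotsTxt(sample_robots_txt)
print(paths)  # expected: ['/private/', '/tmp/']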
def spiderFrom(self, startingPoint):
    keepSpidering = True
    totalFetched = 0
    levelsTraversed = 0
    nextLinks = dict()
    targetBase = SpiderFootHelpers.urlBaseUrl(startingPoint)

    # Are we respecting robots.txt?
    if self.opts['robotsonly'] and targetBase not in self.robotsRules:
        robotsTxt = self.sf.fetchUrl(
            targetBase + '/robots.txt',
            timeout=self.opts['_fetchtimeout'],
            useragent=self.opts['_useragent'],
            verify=False
        )
        if robotsTxt['content'] is not None:
            self.debug('robots.txt contents: ' + robotsTxt['content'])
            self.robotsRules[targetBase] = SpiderFootHelpers.extractUrlsFromRobotsTxt(robotsTxt['content'])

    if self.checkForStop():
        return

    # First iteration we are starting with links found on the start page.
    # Iterations after that are based on links found on those pages,
    # and so on..
    links = self.processUrl(startingPoint)  # fetch first page

    if links is None:
        self.debug("No links found on the first fetch!")
        return

    while keepSpidering:
        # Gets hit in the second and subsequent iterations when more links
        # are found
        if len(nextLinks) > 0:
            links = dict()

            # Fetch content from the new links
            for link in nextLinks:
                # Always skip links we've already fetched
                if link in self.fetchedPages:
                    self.debug("Already fetched " + link + ", skipping.")
                    continue

                # Check if we've been asked to stop
                if self.checkForStop():
                    return

                self.debug("Fetching fresh content from: " + link)
                time.sleep(self.opts['pausesec'])

                freshLinks = self.processUrl(link)
                if freshLinks is not None:
                    links.update(freshLinks)

                totalFetched += 1
                if totalFetched >= self.opts['maxpages']:
                    self.info("Maximum number of pages (" + str(self.opts['maxpages']) + ") reached.")
                    keepSpidering = False
                    break

        nextLinks = self.cleanLinks(links)
        self.debug(f"Found links: {nextLinks}")

        # We've scanned through another layer of the site
        levelsTraversed += 1
        self.debug(f"At level: {levelsTraversed}, Pages: {totalFetched}")
        if levelsTraversed >= self.opts['maxlevels']:
            self.info(f"Maximum number of levels ({self.opts['maxlevels']}) reached.")
            keepSpidering = False

        # We've reached the end of our journey..
        if len(nextLinks) == 0:
            self.debug("No more links found to spider, finishing..")
            keepSpidering = False

        # We've been asked to stop scanning
        if self.checkForStop():
            keepSpidering = False

    return
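The rules gathered into self.robotsRules above are consulted when candidate links are filtered elsewhere in the module. As a rough, hypothetical sketch of that idea only (not the module's actual filtering code, which happens when links are cleaned), a URL could be checked against the collected disallow prefixes like this:

def _isDisallowedByRobots(self, url: str) -> bool:
    """Hypothetical helper, illustrative only: return True if url falls
    under a disallow prefix collected for its base URL."""
    base = SpiderFootHelpers.urlBaseUrl(url)
    path = url[len(base):] or "/"
    for rule in self.robotsRules.get(base, []):
        # Each rule is a path prefix extracted from a "disallow:" line.
        if rule and path.lower().startswith(rule.lower()):
            return True
    return False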