def testGetLinksOnPage():
    """Check that getLinksOnPage returns the expected number of links.

    NOTE: this test is fragile -- it hits a live page and assumes the
    number of links on it never changes. With more time, point it at a
    mock website with a known, fixed number of links instead.
    """
    page_url = 'https://gocardless.com'
    expected_link_count = 53  # count observed on the live page when the test was written

    found = getLinksOnPage(page_url)

    assert len(found) == expected_link_count
def crawl(self, domain, limit=100):
    """Crawl outward from ``domain`` and return a trie of the visited URLs.

    Links are processed in first-in, first-out order: each accepted page
    is marked as seen, the links found on it are appended to the back of
    the queue, and its formatted URL is inserted into the trie.

    Args:
        domain: Starting point of the crawl; it is passed through
            ``getURL`` to obtain the first link and is also the reference
            for the ``self.inDomain`` check.
        limit: Maximum number of distinct links to visit (default 100).

    Returns:
        The ``Trie`` holding ``self.formatURL(link)`` for every visited link.
    """
    seen = set()
    pending = deque([getURL(domain)])
    hierarchy = Trie()

    while pending and len(seen) < limit:
        current = pending.popleft()

        # Skip anything already processed before asking about the domain.
        if current in seen:
            continue
        if not self.inDomain(current, domain):
            continue

        seen.add(current)
        # Newly discovered links go to the back of the queue for later.
        pending.extend(getLinksOnPage(current))
        hierarchy.insert(self.formatURL(current))

    return hierarchy