Exemple #1
0
def testGetLinksOnPage():
    """Check the number of links getLinksOnPage finds on a live page.

    NOTE: this test is fragile - it assumes the number of links on the
    page never changes. With more time, point it at a mock website with a
    known, fixed number of links instead.
    """
    target_url = 'https://gocardless.com'
    found_links = getLinksOnPage(target_url)
    expected_count = 53
    assert len(found_links) == expected_count
Exemple #2
0
 def crawl(self, domain, limit=100):
     """Crawl pages breadth-first from ``domain`` and return a Trie of them.

     The work queue is seeded with ``getURL(domain)``. Each dequeued link
     is skipped if it has already been visited or if ``self.inDomain``
     rejects it; otherwise it is recorded, the links found on it are
     queued, and ``self.formatURL(link)`` is inserted into the trie.

     Crawling stops when the queue is empty or ``limit`` distinct links
     have been visited.
     """
     seen = set()
     pending = deque([getURL(domain)])
     hierarchy = Trie()

     while pending and len(seen) < limit:
         current = pending.popleft()

         # Guard clause: ignore repeats and anything outside the domain.
         if current in seen or not self.inDomain(current, domain):
             continue

         seen.add(current)
         # Links discovered here are processed after those already queued.
         pending.extend(getLinksOnPage(current))
         hierarchy.insert(self.formatURL(current))

     return hierarchy