Ejemplo n.º 1
0
def crawler():
    """Collect the navigation links found on the Amazon site-directory page.

    Downloads the category (site-directory) page, parses it and gathers the
    target of every ``<a>`` element that has an ``href`` and carries the
    ``nav_a`` CSS class. Only this single page is fetched; the returned
    list is the frontier a caller would crawl next.

    Returns:
        list[str]: The start URL followed by the absolute URL of each
        matching link, in document order and without duplicates.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the page
        cannot be fetched.
    """
    # Local import so the block does not depend on a file-level import of
    # urllib.parse being present.
    from urllib.parse import urljoin

    # amazon category page
    start_url = 'http://www.amazon.in/gp/site-directory/ref=nav_shopall_btn'
    to_crawl = [start_url]
    # URLs already fetched or queued; a set keeps the membership test O(1)
    # and stops the same link from being queued more than once.
    seen = {start_url}

    # Close the HTTP response deterministically once the document is parsed.
    with urllib.request.urlopen(start_url) as response:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response, 'html.parser')

    for anchor in soup.find_all('a', href=True):
        # BeautifulSoup exposes ``class`` as a list of names (it is a
        # multi-valued attribute); anchors without one yield the default.
        if 'nav_a' not in anchor.get('class', []):
            continue
        # Resolve against the page's own URL so relative hrefs stay on the
        # host that served them and absolute hrefs pass through unchanged.
        link = urljoin(start_url, anchor['href'])
        if link not in seen:
            seen.add(link)
            to_crawl.append(link)
    return to_crawl