Example #1
import urllib2

# CrawlDB and settings are project modules; these import paths are assumed.
import settings
from crawldb import CrawlDB


def crawl(start_url):
    # Open the crawl database and seed the work queue with the start URL.
    cdb = CrawlDB(settings.DB_FILE)
    cdb.connect()
    cdb.enqueue([start_url])

    while True:
        # dequeue() returns False once the queue is empty.
        url = cdb.dequeue()
        if url is False:
            break
        # Skip anything we have already fetched.
        if cdb.hasCrawled(url):
            continue
        print url

        status = 0
        req = urllib2.Request(str(url))
        req.add_header("User-Agent", "couchmap 0.1")

        request = None

        try:
            request = urllib2.urlopen(req)
        except urllib2.HTTPError, e:
            # HTTPError is a subclass of URLError, so it must be caught
            # first; record the HTTP status code and carry on.
            status = e.code
        except urllib2.URLError, e:
            # Network-level failure (DNS, connection refused, ...): skip it.
            continue
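
CrawlDB itself does not appear in this excerpt, so the following is only a sketch of the interface the loop above relies on: the class name and the connect/enqueue/dequeue/hasCrawled methods come straight from the calls in crawl(), while the sqlite3 backing and the two-table layout are assumptions.

import sqlite3


class CrawlDB(object):
    """Sketch of the queue/visited-set interface that crawl() relies on."""

    def __init__(self, db_file):
        self.db_file = db_file
        self.conn = None

    def connect(self):
        self.conn = sqlite3.connect(self.db_file)
        # One table for the pending queue, one for the visited set.
        self.conn.execute("CREATE TABLE IF NOT EXISTS queue (url TEXT)")
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS crawled (url TEXT PRIMARY KEY)")
        self.conn.commit()

    def enqueue(self, urls):
        self.conn.executemany("INSERT INTO queue (url) VALUES (?)",
                              [(u,) for u in urls])
        self.conn.commit()

    def dequeue(self):
        # Pop the oldest queued URL; return False when the queue is empty,
        # which is the sentinel the crawl loop checks for.
        row = self.conn.execute(
            "SELECT rowid, url FROM queue ORDER BY rowid LIMIT 1").fetchone()
        if row is None:
            return False
        self.conn.execute("DELETE FROM queue WHERE rowid = ?", (row[0],))
        self.conn.commit()
        return row[1]

    def hasCrawled(self, url):
        return self.conn.execute(
            "SELECT 1 FROM crawled WHERE url = ?", (url,)).fetchone() is not None

Note that nothing in the excerpt ever marks a URL as crawled; the full class would presumably also expose a method for that, called later in crawl() once a page has been fetched.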