# Example #1
# --- Crawl configuration --------------------------------------------------
# Stop the main loop once this many pages have been crawled.
MIN_CRAWL = 35000

# Output path prefix; a file "page<k>" is opened here for every batch of
# 100 crawled pages (see the periodic open in the main loop).
FILE = "/Users/Sun/Documents/IR/Data/HW3/pages/page"
# On-disk snapshot of the frontier, rewritten periodically so an
# interrupted crawl can be resumed.
FRONTIER_BACKUP = "/Users/Sun/Documents/IR/Data/HW3/pages/frontier"
# To resume from an earlier snapshot, uncomment:
# frontier.restore(open(FRONTIER_BACKUP))

# --- Crawl state ----------------------------------------------------------
crawled = 0              # number of pages fetched so far
purl = None              # previous URL, presumably used later in the file — TODO confirm
DOMAIN_TIMESTAMP = {}    # domain -> datetime of the last fetch (per-domain politeness)

# Main crawl loop: pull URLs from the frontier until it is exhausted or
# MIN_CRAWL pages have been fetched. (Loop body continues past this view;
# `crawled` is presumably incremented in the unseen part — TODO confirm.)
while not frontier.empty() and crawled < MIN_CRAWL:
    # Every 100 pages: report progress, roll over to a new output file,
    # and snapshot the frontier so the crawl can be resumed after a crash.
    if crawled % 100 == 0:
        # Python 2 integer division — progress prints in whole percent.
        print str(crawled) + " pages crawled   " + str(
            crawled * 100 / MIN_CRAWL) + '%'
        # NOTE(review): the previous ofile is never closed before being
        # rebound here, and the backup file object below is never closed
        # either — file-handle leak; wrap both in try/finally or `with`.
        ofile = open(FILE + str(crawled / 100), 'a')
        frontier.backup(open(FRONTIER_BACKUP, 'w'))

    url, inlinks = frontier.next_url()
    domain = Page.domain(url)
    now = datetime.datetime.now()
    print "Fetching " + url
    # Politeness: enforce at least 1 second between requests to the same
    # domain by sleeping off the remainder of the interval.
    if domain in DOMAIN_TIMESTAMP:
        elasp = now - DOMAIN_TIMESTAMP[domain]
        ELASP_IN_SEC = elasp.total_seconds()
        if ELASP_IN_SEC < 1:
            sleep(1 - ELASP_IN_SEC)

    page = crawler.fetch(url, inlinks)
    # NOTE(review): this records `now`, which was captured BEFORE the
    # politeness sleep and the fetch itself — the stored timestamp lags
    # the actual request time, so the effective per-domain delay can be
    # shorter than 1s. Consider re-reading the clock here; verify intent.
    DOMAIN_TIMESTAMP[domain] = now
    # Successful fetch: collect outlinks (handling continues past this view).
    if page.fetched():
        outlinks = page.links()