Example #1
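    # Fragment: the per-page callback from crawl_pages (Example #2 below).
    # spider, pages_d, defer_fired, encoding, timeout and on_err are closure
    # variables defined in the enclosing crawl_pages function.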
    def page_recvd((url, text)):
        try:
            spider.add_page(url, text)
        except SpiderIsFull:
            if not defer_fired:
                defer_fired.append(None)
                pages_d.callback(spider.get_crawled())
            return

        urls = spider.give_all_jobs()

        # print a progress dot
        sys.stdout.write('.')
        sys.stdout.flush()

        for url in urls:
            next_d = get_page(str(url), enc=encoding, timeout=timeout)
            next_d.addCallback(page_recvd)
            next_d.addErrback(on_err)
Example #2
import sys

from twisted.internet import defer

# PageSpider, SpiderIsFull and get_page are project-local helpers not shown
# in these examples; the defer import assumes Twisted's Deferred is used.

def crawl_pages(start_page, url_matcher, encoding, timeout, max_num, parser):
    pages_d = defer.Deferred()
    spider = PageSpider(url_matcher, max_num)
    spider.parser = parser

    def on_err(errobj):
        # errback for a failed fetch: when the failure value is a
        # (url, reason) pair, record the failed URL with the spider
        evalue = errobj.value
        if isinstance(evalue, tuple) and len(evalue) == 2:
            url, reason = evalue
            spider.fail_page(url)

    # mutable one-shot flag (closures cannot rebind outer names in Python 2),
    # used to make sure pages_d is fired only once
    defer_fired = []

    # callback for each successfully fetched page; the (url, text) tuple
    # parameter relies on Python 2 tuple unpacking
    def page_recvd((url, text)):
        try:
            spider.add_page(url, text)
        except SpiderIsFull:
            # the spider has collected its max_num pages: deliver the crawl
            # result, making sure pages_d is fired only once
            if not defer_fired:
                defer_fired.append(None)
                pages_d.callback(spider.get_crawled())
            return

        urls = spider.give_all_jobs()

        # print a progress dot
        sys.stdout.write('.')
        sys.stdout.flush()

        # queue a fetch for every URL the spider handed out, reusing this
        # same callback/errback pair
        for url in urls:
            next_d = get_page(str(url), enc=encoding, timeout=timeout)
            next_d.addCallback(page_recvd)
            next_d.addErrback(on_err)

    # initial crawl: fetch start_page; this first request must succeed
    d = get_page(start_page, enc=encoding, timeout=timeout, must_succ=True)
    d.addCallback(page_recvd)
    d.addErrback(on_err)

    return pages_d  # Deferred that will fire with the crawled pages
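
For context, here is a minimal sketch of how crawl_pages might be driven under a Twisted reactor. The URL, url_matcher, parser, timeout, max_num and encoding values below are placeholders: the real arguments depend on PageSpider and get_page, which are not shown in these examples, so treat this as an assumption rather than the project's actual entry point.

from twisted.internet import reactor

def start_crawl():
    # hypothetical arguments: whatever matcher/parser objects PageSpider expects
    d = crawl_pages('http://example.com/', url_matcher=None,
                    encoding='utf-8', timeout=10, max_num=50, parser=None)

    def done(pages):
        # get_crawled() is assumed to return a sized collection of pages
        print 'crawled %d pages' % len(pages)
        reactor.stop()

    def failed(err):
        err.printTraceback()
        reactor.stop()

    d.addCallbacks(done, failed)

reactor.callWhenRunning(start_crawl)
reactor.run()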