def threaded_crawler(seed_url,
                     delay=5,
                     cache=None,
                     scrape_callback=None,
                     user_agent='wu_being',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60):
    """Crawl from ``seed_url`` using up to ``max_threads`` worker threads.

    Args:
        seed_url: starting URL; relative links are resolved against it.
        delay: seconds to wait between downloads to the same domain.
        cache: optional cache backend passed to the Downloader.
        scrape_callback: callable(url, html) returning extra links to crawl.
        user_agent: User-Agent header sent with each request.
        proxies: optional proxy list for the Downloader.
        num_retries: download retry count on failure.
        max_threads: maximum number of concurrent worker threads.
        timeout: per-request timeout in seconds.
    """
    # The shared queue of URLs that still need to be crawled
    # (MongoDB-backed, so it is visible across workers).
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache,
                   delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   num_retries=num_retries,
                   timeout=timeout)

    def process_queue():
        """Worker loop: pop URLs, download, scrape, enqueue new links."""
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                # Currently no URLs to process; let this worker exit.
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        # BUGFIX: was a Python 2 print statement, which is a
                        # syntax error under Python 3.
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # Resolve the link against the seed URL and add
                            # it to the shared queue.
                            crawl_queue.push(normalize(seed_url, link))
                # Mark this URL as processed so it is not retried.
                crawl_queue.complete(url)

    # Supervise workers until the queue is drained and all threads exit.
    threads = []
    while threads or crawl_queue:
        # Reap finished threads.  BUGFIX: the original removed items from
        # the list while iterating it, which skips entries.
        threads = [thread for thread in threads if thread.is_alive()]
        while len(threads) < max_threads and crawl_queue.peek():
            # Room for more workers and URLs are pending: start another.
            thread = threading.Thread(target=process_queue)
            # Daemon thread so the main thread can exit on Ctrl-C
            # (thread.daemon replaces the deprecated setDaemon()).
            thread.daemon = True
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
Ejemplo n.º 2
0
def threaded_crawler(delay=5,
                     cache=None,
                     scrape_callback=None,
                     user_agent='wswp',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60):
    """Crawl using multiple threads.

    URLs are taken from a MongoDB-backed queue that is assumed to be
    populated elsewhere (records with state 0); each worker pops a URL
    and delegates the download to ``Download().Downloader(url)``.

    Args:
        delay, cache, scrape_callback, user_agent, proxies, num_retries,
        timeout: accepted for interface compatibility with the sibling
            crawlers; the download itself is handled by Download().
        max_threads: maximum number of concurrent worker threads.
    """
    # Queue of URLs still to be crawled (state-0 records in MongoDB).
    urllist = MongoQueue()

    def process_queue():
        """Worker loop: pop URLs and download them until the queue is empty."""
        while True:
            try:
                url = urllist.pop()
                print('url', url)
                D = Download()
                D.Downloader(url)
            except KeyError:
                # Currently no URLs to process; let this worker exit.
                break

    # Supervise workers until the queue is drained and all threads exit.
    threads = []
    while threads or urllist:
        # Reap finished threads.  BUGFIX: the original removed items from
        # the list while iterating it, which skips entries.
        threads = [thread for thread in threads if thread.is_alive()]
        # BUGFIX: the original broke out of the supervisor loop as soon as
        # peek() was falsy, abandoning workers that were still alive (and
        # left a debug `print(urllist.peek() is True)` in place).
        while len(threads) < max_threads and urllist.peek():
            # Room for more workers and URLs are pending: start another.
            thread = threading.Thread(target=process_queue)
            # Daemon thread so the main thread can exit on Ctrl-C
            # (thread.daemon replaces the deprecated setDaemon()).
            thread.daemon = True
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
Ejemplo n.º 3
0
def thread_crawl(seed_url,
                 max_threads=10,
                 delay=5,
                 user_agent='Aurora-Twinkle',
                 proxies=None,
                 max_retries=1,
                 scrape_callback=None,
                 cache=None):
    """Crawl from ``seed_url`` with worker threads, honouring robots.txt.

    Args:
        seed_url: starting URL; links are rewritten via format_link().
        max_threads: maximum number of concurrent worker threads.
        delay: seconds to wait between downloads to the same domain.
        user_agent: User-Agent header, also checked against robots.txt.
        proxies: optional proxy list for the Downloader.
        max_retries: download retry count on failure.
        scrape_callback: callable(url, html) returning extra links to crawl.
        cache: optional cache backend passed to the Downloader.
    """
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   max_retries=max_retries,
                   cache=cache)
    # Robots parser for the seed site, consulted before every fetch.
    rp = get_robots(seed_url)

    def process_queue():
        """Worker loop: pop, download, scrape, and requeue discovered links."""
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # Currently no URLs to process; let this worker exit.
                break
            if not rp.can_fetch(user_agent, url):
                # Disallowed by robots.txt; note: deliberately not marked
                # complete, matching the original behaviour.
                print(
                    'user_agent: "' + user_agent +
                    '" Blocked by robots.txt:', url)
                continue
            html = D(url)
            if scrape_callback:
                try:
                    links = scrape_callback(url, html) or []
                except Exception as e:
                    print("Error in callback for :{}:{}".format(url, e))
                else:
                    for link in links:
                        # Normalize the link and add it to the shared queue.
                        crawl_queue.push(format_link(seed_url, link))
            crawl_queue.complete(url)

    # Supervise workers until the queue is drained and all threads exit.
    threads = []
    while threads or crawl_queue:
        # Reap finished threads.  BUGFIX: the original removed items from
        # the list while iterating it, which skips entries.
        threads = [thread for thread in threads if thread.is_alive()]

        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            # Daemon thread so the main thread can exit on Ctrl-C
            # (thread.daemon replaces the deprecated setDaemon()).
            thread.daemon = True
            thread.start()
            threads.append(thread)

        time.sleep(SLEEP_TIME)
Ejemplo n.º 4
0
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl using multiple threads, retrying server errors.

    Args:
        seed_url: starting URL; relative links are resolved against it.
        delay: seconds to wait between downloads to the same domain.
        cache: optional cache backend; defaults to a shared MongoCache.
        scrape_callback: callable(url, html) returning extra links to crawl.
        user_agent: User-Agent header sent with each request.
        proxies: optional proxy list for the Downloader.
        num_retries: download retry count on failure.
        max_threads: maximum number of concurrent worker threads.
        timeout: per-request timeout in seconds.
    """
    # The queue of URLs that still need to be crawled.
    crawl_queue = MongoQueue()
    # Cache of downloaded pages, also consulted for the response code below.
    webpage_cache = MongoCache()
    # crawl_queue.clear()
    crawl_queue.push(seed_url)
    # BUGFIX: the original ignored every argument and built the Downloader
    # entirely from the DEFAULT_* module constants, and created a second,
    # redundant MongoCache() instance for the cache.  Honour the parameters
    # and reuse the single cache instance.
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   cookies=DEFAULT_COOKIE, num_retries=num_retries,
                   timeout=timeout, opener=None,
                   cache=cache if cache is not None else webpage_cache)

    def process_queue():
        """Worker loop: pop, download, scrape, and mark complete or retry."""
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                # Currently no URLs to process; let this worker exit.
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # Resolve and enqueue newly discovered links.
                            crawl_queue.push(normalize(seed_url, link))
            # Server error (5xx) or sentinel -999: requeue for a later retry;
            # otherwise mark as done.  BUGFIX: use a single cache lookup and
            # logical `or` instead of bitwise `|` on booleans.
            code = webpage_cache[url]['code']
            if 500 <= code < 600 or code == -999:
                crawl_queue.reset(url)
            else:
                crawl_queue.complete(url)

    # Supervise workers until the queue is drained and all threads exit.
    threads = []
    while threads or crawl_queue:
        # Reap finished threads.  BUGFIX: the original removed items from
        # the list while iterating it, which skips entries.
        threads = [thread for thread in threads if thread.is_alive()]
        while len(threads) < max_threads and crawl_queue.peek():
            # Room for more workers and URLs are pending: start another.
            thread = threading.Thread(target=process_queue)
            # Daemon thread so the main thread can exit on Ctrl-C
            # (thread.daemon replaces the deprecated setDaemon()).
            thread.daemon = True
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
Ejemplo n.º 5
0
def threaded_crawler(seed_url,
                     delay=5,
                     cache=None,
                     scrape_callback=None,
                     user_agent='wswp',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60):
    """Crawl from ``seed_url`` using up to ``max_threads`` worker threads.

    Args:
        seed_url: starting URL; relative links are resolved against it.
        delay: seconds to wait between downloads to the same domain.
        cache: optional cache backend passed to Download.
        scrape_callback: callable(url, html) returning extra links to crawl.
        user_agent: User-Agent header sent with each request.
        proxies: optional proxy list for the downloader.
        num_retries: download retry count on failure.
        max_threads: maximum number of concurrent worker threads.
        timeout: per-request timeout in seconds.
    """
    # The shared queue of URLs that still need to be crawled.
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Download(cache=cache,
                 delay=delay,
                 user_agent=user_agent,
                 proxies=proxies,
                 num_retries=num_retries,
                 timeout=timeout)

    def process_queue():
        """Worker loop: pop URLs, download, scrape, enqueue new links."""
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # Currently no URLs to process; let this worker exit.
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print(f'Error in callback for:{url}:{e}')
                    else:
                        for link in links:
                            # Resolve and enqueue newly discovered links.
                            crawl_queue.push(normalize(seed_url, link))
                # Mark this URL as processed so it is not retried.
                crawl_queue.complete(url)

    # Supervise workers until the queue is drained and all threads exit.
    threads = []
    while threads or crawl_queue:
        # Reap finished threads.  BUGFIX: the original removed items from
        # the list while iterating it, which skips entries.
        threads = [thread for thread in threads if thread.is_alive()]

        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            # Daemon thread so the main thread can exit on Ctrl-C; the
            # original did not daemonize, unlike the sibling crawlers.
            thread.daemon = True
            thread.start()
            threads.append(thread)

        # BUGFIX: the original sleep was indented inside the spawn loop, so
        # the supervisor busy-spun whenever no new thread was started (and
        # paused between each spawn).  Sleep once per supervisor iteration.
        time.sleep(SLEEP_TIME)
Ejemplo n.º 6
0
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl from ``seed_url`` using up to ``max_threads`` worker threads.

    Args:
        seed_url: starting URL; relative links are resolved against it.
        delay: seconds to wait between downloads to the same domain.
        cache: optional cache backend passed to the Downloader.
        scrape_callback: callable(url, html) returning extra links to crawl.
        user_agent: User-Agent header sent with each request.
        proxies: optional proxy list for the Downloader.
        num_retries: download retry count on failure.
        max_threads: maximum number of concurrent worker threads.
        timeout: per-request timeout in seconds.
    """
    # The shared queue of URLs that still need to be crawled.
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        """Worker loop: pop URLs, download, scrape, enqueue new links."""
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                # Currently no URLs to process; let this worker exit.
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        # BUGFIX: was a Python 2 print statement, which is a
                        # syntax error under Python 3.
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # Resolve and enqueue newly discovered links.
                            crawl_queue.push(normalize(seed_url, link))
                # Mark this URL as processed so it is not retried.
                crawl_queue.complete(url)

    # Supervise workers until the queue is drained and all threads exit.
    threads = []
    while threads or crawl_queue:
        # Reap finished threads.  BUGFIX: the original removed items from
        # the list while iterating it, which skips entries.
        threads = [thread for thread in threads if thread.is_alive()]
        while len(threads) < max_threads and crawl_queue.peek():
            # Room for more workers and URLs are pending: start another.
            thread = threading.Thread(target=process_queue)
            # Daemon thread so the main thread can exit on Ctrl-C
            # (thread.daemon replaces the deprecated setDaemon()).
            thread.daemon = True
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)