def threaded_crawler(seed_url,
                     delay=5,
                     cache=None,
                     scrape_callback=None,
                     user_agent='wu_being',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache,
                   delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   num_retries=num_retries,
                   timeout=timeout)

    def process_queue():
        while True:
            # claim the next URL so other threads know it is being processed
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to the queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            # set daemon so the main thread can exit when it receives Ctrl-C
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
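
Every example on this page leans on a MongoQueue class that is not shown. The class below is a minimal sketch of the interface actually exercised above (push, pop, peek, complete, clear, plus truthiness for the `while threads or crawl_queue:` loop), assuming pymongo; the collection name, the status field, and pop() raising KeyError (caught by Examples #1 and #4, while #2 and #3 expect IndexError) are assumptions, not the original implementation.

from datetime import datetime
from pymongo import MongoClient, errors


class MongoQueue:
    """Sketch of the queue interface used above; names here are assumptions."""

    # states a queued URL can be in
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None):
        self.client = client or MongoClient()
        self.db = self.client.cache

    def __bool__(self):
        """Truthy while any URL is still waiting or being processed."""
        record = self.db.crawl_queue.find_one({'status': {'$ne': self.COMPLETE}})
        return record is not None

    def push(self, url):
        """Add a URL, ignoring it if it was queued before."""
        try:
            self.db.crawl_queue.insert_one({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass  # already queued at some point

    def pop(self):
        """Atomically claim an outstanding URL; raise KeyError when none are left."""
        record = self.db.crawl_queue.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}})
        if record is None:
            raise KeyError()
        return record['_id']

    def peek(self):
        record = self.db.crawl_queue.find_one({'status': self.OUTSTANDING})
        return record['_id'] if record else None

    def complete(self, url):
        self.db.crawl_queue.update_one({'_id': url},
                                       {'$set': {'status': self.COMPLETE}})

    def clear(self):
        self.db.crawl_queue.drop()
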
Example #2
def thread_crawl(seed_url,
                 max_threads=10,
                 delay=5,
                 user_agent='Aurora-Twinkle',
                 proxies=None,
                 max_retries=1,
                 scrape_callback=None,
                 cache=None):
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   max_retries=max_retries,
                   cache=cache)
    rp = get_robots(seed_url)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                if rp.can_fetch(user_agent, url):
                    html = D(url)
                    if scrape_callback:
                        try:
                            links = scrape_callback(url, html) or []
                        except Exception as e:
                            print("Error in callback for :{}:{}".format(
                                url, e))
                        else:
                            for link in links:
                                link = format_link(seed_url, link)
                                crawl_queue.push(link)
                    crawl_queue.complete(url)
                else:
                    print(
                        'user_agent: "' + user_agent +
                        '" Blocked by robots.txt:', url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)

        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)

        time.sleep(SLEEP_TIME)
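
Example #2 also consults robots.txt through a get_robots helper before downloading. A plausible sketch built on the standard library's urllib.robotparser follows; only the rp.can_fetch(user_agent, url) call above comes from the example, the body here is an assumption.

from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser


def get_robots(seed_url):
    """Download and parse the site's robots.txt for later can_fetch() checks."""
    rp = RobotFileParser()
    rp.set_url(urljoin(seed_url, '/robots.txt'))
    rp.read()
    return rp
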
Example #3
def threaded_crawler(seed_url,
                     delay=5,
                     cache=None,
                     scrape_callback=None,
                     user_agent='wswp',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60):
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Download(cache=cache,
                 delay=delay,
                 user_agent=user_agent,
                 proxies=proxies,
                 num_retries=num_retries,
                 timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print(f'Error in callback for {url}: {e}')
                    else:
                        for link in links:
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)

        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.start()
            threads.append(thread)

        time.sleep(SLEEP_TIME)
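
Examples #1, #3, and #4 push normalize(seed_url, link) onto the queue so that relative links become absolute and fragments are dropped. A minimal sketch of that helper, assuming the usual urldefrag plus urljoin combination:

from urllib.parse import urldefrag, urljoin


def normalize(seed_url, link):
    """Remove the fragment and resolve the link relative to the seed URL."""
    link, _fragment = urldefrag(link)
    return urljoin(seed_url, link)
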
Example #4
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            # claim the next URL so other threads know it is being processed
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)


    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so the main thread can exit when it receives Ctrl-C
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
def main(max_threads=5):
    catlog_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()


    client = MongoClient('localhost', 27017, connect=False)
    # create a collection to store cached webpages,
    # which is the equivalent of a table in a relational database
    db = client.cache
    cursor = db.books.find()

    urls = []
    while cursor.alive:
        temp = cursor.next()
        temp = temp['link']

        if urlparse.urlparse(catlog_callback.seed_url).netloc == 'www.junzige.la':
            temp = '/novel' + temp[5:-4] + '/'
            temp = normalize(catlog_callback.seed_url, temp)
        elif urlparse.urlparse(catlog_callback.seed_url).netloc == 'www.boluoxs.com':
            temp = 'http://www.boluoxs.com/biquge/0/' + temp[temp.rfind('/') + 1 :temp.rfind('.')] + '/'

        print(temp)
        urls.append(temp)

    print(urls[0])

    while True:
        now = datetime.now()

        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(urls,
                            scrape_callback=catlog_callback,
                            cache=cache,
                            max_threads=max_threads,
                            timeout=30,
                            host=urlparse.urlparse(catlog_callback.seed_url).netloc,
                            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36')
            # every time finished, clear the job queue
            queue.clear()
        else:
            print('pass:' + str(now))
        time.sleep(3600)
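
Each crawler calls scrape_callback(url, html) and expects either a list of further links to enqueue or a falsy value. The class below is a hypothetical callback following that convention; it is not the AlexaCallback used in main() above, whose implementation is not shown on this page.

import re


class LinkCallback:
    """Hypothetical scrape_callback: record page titles, return links to crawl."""

    TITLE_RE = re.compile(r'<title[^>]*>(.*?)</title>', re.IGNORECASE | re.DOTALL)
    LINK_RE = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)

    def __init__(self):
        self.titles = {}

    def __call__(self, url, html):
        if not html:
            return []
        match = self.TITLE_RE.search(html)
        if match:
            self.titles[url] = match.group(1).strip()
        # whatever is returned here gets pushed back onto the crawl queue
        return self.LINK_RE.findall(html)
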
Example #6
def threaded_crawler(seed_url,
                     link_regex=None,
                     delay=1,
                     cache=None,
                     scrape_callback=None,
                     user_agent='Safari',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60):
    """
    多线程爬虫
    多个线程处理一个队列
    使用mongo作为队列
    """
    # crawl_queue = [seed_url]
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)

    # seen = set([seed_url])

    # blacklist of blocked sites
    block_filename = os.path.join(BASEDIR, 'blocked_urls.txt')
    blocked_urls = [i.strip() for i in open(block_filename) if i.strip()] \
        if os.path.isfile(block_filename) else []
    # save_cache=False is only needed for testing
    D = Downloader(delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   num_retries=num_retries,
                   timeout=timeout,
                   cache=cache,
                   save_cache=False,
                   blocked_urls=blocked_urls)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except (IndexError, KeyError):
                # stop when the queue is empty
                break
            else:
                html = D(url) if url else None
                if html and scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                        if link_regex:
                            links.extend(link for link in get_links(html)
                                         if re.match(link_regex, link))

                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            crawl_queue.push(link)  # enqueue the new link
                            # if link not in seen:
                            #     seen.add(link)
                # print html
                # if html:
                #     # mark as complete
                #     crawl_queue.complete(url)
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
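
Example #6 additionally harvests candidate links straight from the page with get_links(html) and filters them against link_regex. A likely shape for get_links is a simple anchor-tag regular expression, sketched here as an assumption:

import re


def get_links(html):
    """Return a list of href values from every anchor tag in the page."""
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)
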