def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wu_being', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl using multiple threads"""
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                # atomically claim the next URL so no two threads process it
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to the queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            # set daemon so main thread can exit when it receives ctrl-c
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
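# Every crawler variant in this section relies on a MongoQueue exposing
# push/pop/peek/complete/clear and truthiness.  The class below is only a
# minimal sketch of that interface, assuming one document per URL with a
# three-state status field (OUTSTANDING, PROCESSING, COMPLETE); the actual
# project implementations may differ (e.g. timeout-based repair of stale
# PROCESSING jobs is omitted here).
from datetime import datetime
from pymongo import MongoClient, errors


class MongoQueue:
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None):
        self.client = client or MongoClient()
        self.db = self.client.cache

    def __bool__(self):
        # truthy while any URL is not yet COMPLETE, so `while crawl_queue:` works
        return self.db.crawl_queue.find_one(
            {'status': {'$ne': self.COMPLETE}}) is not None

    def push(self, url):
        # only insert URLs that have never been seen before
        try:
            self.db.crawl_queue.insert_one(
                {'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass

    def pop(self):
        # atomically claim an OUTSTANDING URL for the calling thread
        record = self.db.crawl_queue.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING,
                      'timestamp': datetime.now()}})
        if record is None:
            # some variants below expect IndexError here instead of KeyError
            raise KeyError()
        return record['_id']

    def peek(self):
        record = self.db.crawl_queue.find_one({'status': self.OUTSTANDING})
        return record['_id'] if record else None

    def complete(self, url):
        self.db.crawl_queue.update_one(
            {'_id': url}, {'$set': {'status': self.COMPLETE}})

    def clear(self):
        self.db.crawl_queue.drop()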
def thread_crawl(seed_url, max_threads=10, delay=5, user_agent='Aurora-Twinkle',
                 proxies=None, max_retries=1, scrape_callback=None, cache=None):
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   max_retries=max_retries, cache=cache)
    rp = get_robots(seed_url)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                if rp.can_fetch(user_agent, url):
                    html = D(url)
                    if scrape_callback:
                        try:
                            links = scrape_callback(url, html) or []
                        except Exception as e:
                            print("Error in callback for: {}: {}".format(url, e))
                        else:
                            for link in links:
                                link = format_link(seed_url, link)
                                crawl_queue.push(link)
                    crawl_queue.complete(url)
                else:
                    print('user_agent: "' + user_agent +
                          '" Blocked by robots.txt:', url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
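# The variant above checks robots.txt through get_robots() before fetching.
# The helper below is only a minimal sketch, assuming get_robots() returns a
# parsed urllib.robotparser.RobotFileParser for the site's /robots.txt; the
# project's actual helper may behave differently (e.g. on fetch errors).
from urllib import robotparser
from urllib.parse import urljoin


def get_robots(seed_url):
    """Fetch and parse robots.txt for the seed URL's site (sketch)."""
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(seed_url, '/robots.txt'))
    rp.read()
    return rp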
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Download(cache=cache, delay=delay, user_agent=user_agent,
                 proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print(f'Error in callback for: {url}: {e}')
                    else:
                        for link in links:
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
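# A hypothetical usage sketch for the crawler above: scrape_callback receives
# (url, html) and returns the links that should be queued next.  The callback
# name, link pattern, and seed URL below are illustrative only, not part of
# the original code.
import re


def print_title_callback(url, html):
    """Example callback: log the page title and follow /view/ links."""
    match = re.search(r'<title>(.*?)</title>', html or '',
                      re.IGNORECASE | re.DOTALL)
    if match:
        print(url, '->', match.group(1).strip())
    # return relative links for the crawler to normalize and enqueue
    return re.findall(r'href=["\'](/view/[^"\']+)["\']', html or '')


if __name__ == '__main__':
    threaded_crawler('http://example.webscraping.com', delay=3,
                     scrape_callback=print_title_callback, max_threads=5)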
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl using multiple threads"""
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                # claim the next URL that still needs processing
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to the queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            # set daemon so main thread can exit when it receives ctrl-c
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
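# Several variants above call normalize(seed_url, link).  The helper below is
# a minimal sketch, assuming it strips the #fragment and resolves relative
# links against the seed URL; the actual project helper may do more (e.g.
# lower-casing or deduplication).
from urllib.parse import urldefrag, urljoin


def normalize(seed_url, link):
    """Remove the fragment and convert a relative link to an absolute URL."""
    link, _ = urldefrag(link)
    return urljoin(seed_url, link)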
def main(max_threads=5):
    catlog_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()
    client = MongoClient('localhost', 27017, connect=False)
    # the 'cache' database stores the cached webpages; a collection here is
    # the equivalent of a table in a relational database
    db = client.cache
    cursor = db.books.find()
    urls = []
    while cursor.alive:
        temp = cursor.next()
        temp = temp['link']
        if urlparse.urlparse(catlog_callback.seed_url).netloc == 'www.junzige.la':
            temp = '/novel' + temp[5:-4] + '/'
            temp = normalize(catlog_callback.seed_url, temp)
        elif urlparse.urlparse(catlog_callback.seed_url).netloc == 'www.boluoxs.com':
            temp = ('http://www.boluoxs.com/biquge/0/' +
                    temp[temp.rfind('/') + 1:temp.rfind('.')] + '/')
        print(temp)
        urls.append(temp)
    print(urls[0])

    while True:
        now = datetime.now()
        # only crawl during off-peak hours
        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(urls, scrape_callback=catlog_callback, cache=cache,
                            max_threads=max_threads, timeout=30,
                            host=urlparse.urlparse(catlog_callback.seed_url).netloc,
                            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                                       'Chrome/54.0.2840.98 Safari/537.36')
            # every time a crawl finishes, clear the job queue
            queue.clear()
        else:
            print('pass: ' + str(now))
        time.sleep(3600)
def threaded_crawler(seed_url, link_regex=None, delay=1, cache=None,
                     scrape_callback=None, user_agent='Safari', proxies=None,
                     num_retries=1, max_threads=10, timeout=60):
    """Multi-threaded crawler: several worker threads consume a single
    MongoDB-backed queue.
    """
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    # blacklisted sites
    block_filename = os.path.join(BASEDIR, 'blocked_urls.txt')
    blocked_urls = [i.strip() for i in open(block_filename) if i.strip()] \
        if os.path.isfile(block_filename) else []
    # save_cache=False is only needed for testing
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, timeout=timeout, cache=cache,
                   save_cache=False, blocked_urls=blocked_urls)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except (IndexError, KeyError):
                # stop when the queue is empty
                break
            else:
                html = D(url) if url else None
                if html and scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                        if link_regex:
                            links.extend(link for link in get_links(html)
                                         if re.match(link_regex, link))
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # enqueue the normalized link
                            link = normalize(seed_url, link)
                            crawl_queue.push(link)
                # mark the URL as complete
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
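# The last variant above hands blocked_urls to its Downloader.  The Downloader
# itself is not shown in this section, so the fragment below is purely an
# illustrative sketch of how such a blacklist check could be applied before a
# fetch; it is an assumption, not the project's actual implementation.
from urllib.parse import urlparse


def is_blocked(url, blocked_urls):
    """Return True if the URL or its host appears in the blacklist (sketch)."""
    netloc = urlparse(url).netloc
    return any(entry in (url, netloc) for entry in blocked_urls)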